numkong 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +495 -0
- package/binding.gyp +540 -0
- package/c/dispatch.h +512 -0
- package/c/dispatch_bf16.c +389 -0
- package/c/dispatch_bf16c.c +52 -0
- package/c/dispatch_e2m3.c +263 -0
- package/c/dispatch_e3m2.c +243 -0
- package/c/dispatch_e4m3.c +276 -0
- package/c/dispatch_e5m2.c +272 -0
- package/c/dispatch_f16.c +376 -0
- package/c/dispatch_f16c.c +58 -0
- package/c/dispatch_f32.c +378 -0
- package/c/dispatch_f32c.c +99 -0
- package/c/dispatch_f64.c +296 -0
- package/c/dispatch_f64c.c +98 -0
- package/c/dispatch_i16.c +96 -0
- package/c/dispatch_i32.c +89 -0
- package/c/dispatch_i4.c +150 -0
- package/c/dispatch_i64.c +86 -0
- package/c/dispatch_i8.c +289 -0
- package/c/dispatch_other.c +330 -0
- package/c/dispatch_u1.c +148 -0
- package/c/dispatch_u16.c +124 -0
- package/c/dispatch_u32.c +118 -0
- package/c/dispatch_u4.c +150 -0
- package/c/dispatch_u64.c +102 -0
- package/c/dispatch_u8.c +303 -0
- package/c/numkong.c +950 -0
- package/include/README.md +573 -0
- package/include/module.modulemap +129 -0
- package/include/numkong/attention/sapphireamx.h +1361 -0
- package/include/numkong/attention/sme.h +2066 -0
- package/include/numkong/attention.h +49 -0
- package/include/numkong/capabilities.h +748 -0
- package/include/numkong/cast/README.md +262 -0
- package/include/numkong/cast/haswell.h +975 -0
- package/include/numkong/cast/icelake.h +470 -0
- package/include/numkong/cast/neon.h +1192 -0
- package/include/numkong/cast/rvv.h +1021 -0
- package/include/numkong/cast/sapphire.h +262 -0
- package/include/numkong/cast/serial.h +2262 -0
- package/include/numkong/cast/skylake.h +856 -0
- package/include/numkong/cast/v128relaxed.h +180 -0
- package/include/numkong/cast.h +230 -0
- package/include/numkong/curved/README.md +223 -0
- package/include/numkong/curved/genoa.h +182 -0
- package/include/numkong/curved/haswell.h +276 -0
- package/include/numkong/curved/neon.h +205 -0
- package/include/numkong/curved/neonbfdot.h +212 -0
- package/include/numkong/curved/neonhalf.h +212 -0
- package/include/numkong/curved/rvv.h +305 -0
- package/include/numkong/curved/serial.h +207 -0
- package/include/numkong/curved/skylake.h +457 -0
- package/include/numkong/curved/smef64.h +506 -0
- package/include/numkong/curved.h +517 -0
- package/include/numkong/curved.hpp +144 -0
- package/include/numkong/dot/README.md +425 -0
- package/include/numkong/dot/alder.h +563 -0
- package/include/numkong/dot/genoa.h +315 -0
- package/include/numkong/dot/haswell.h +1688 -0
- package/include/numkong/dot/icelake.h +883 -0
- package/include/numkong/dot/neon.h +818 -0
- package/include/numkong/dot/neonbfdot.h +244 -0
- package/include/numkong/dot/neonfhm.h +360 -0
- package/include/numkong/dot/neonhalf.h +198 -0
- package/include/numkong/dot/neonsdot.h +508 -0
- package/include/numkong/dot/rvv.h +714 -0
- package/include/numkong/dot/rvvbb.h +72 -0
- package/include/numkong/dot/rvvbf16.h +123 -0
- package/include/numkong/dot/rvvhalf.h +129 -0
- package/include/numkong/dot/sapphire.h +141 -0
- package/include/numkong/dot/serial.h +838 -0
- package/include/numkong/dot/sierra.h +405 -0
- package/include/numkong/dot/skylake.h +1084 -0
- package/include/numkong/dot/sve.h +379 -0
- package/include/numkong/dot/svebfdot.h +74 -0
- package/include/numkong/dot/svehalf.h +123 -0
- package/include/numkong/dot/v128relaxed.h +1258 -0
- package/include/numkong/dot.h +1070 -0
- package/include/numkong/dot.hpp +94 -0
- package/include/numkong/dots/README.md +496 -0
- package/include/numkong/dots/alder.h +114 -0
- package/include/numkong/dots/genoa.h +94 -0
- package/include/numkong/dots/haswell.h +295 -0
- package/include/numkong/dots/icelake.h +171 -0
- package/include/numkong/dots/neon.h +120 -0
- package/include/numkong/dots/neonbfdot.h +58 -0
- package/include/numkong/dots/neonfhm.h +94 -0
- package/include/numkong/dots/neonhalf.h +57 -0
- package/include/numkong/dots/neonsdot.h +108 -0
- package/include/numkong/dots/rvv.h +2486 -0
- package/include/numkong/dots/sapphireamx.h +3973 -0
- package/include/numkong/dots/serial.h +2844 -0
- package/include/numkong/dots/sierra.h +97 -0
- package/include/numkong/dots/skylake.h +196 -0
- package/include/numkong/dots/sme.h +5372 -0
- package/include/numkong/dots/smebi32.h +461 -0
- package/include/numkong/dots/smef64.h +1318 -0
- package/include/numkong/dots/smehalf.h +47 -0
- package/include/numkong/dots/v128relaxed.h +294 -0
- package/include/numkong/dots.h +2804 -0
- package/include/numkong/dots.hpp +639 -0
- package/include/numkong/each/README.md +469 -0
- package/include/numkong/each/haswell.h +1658 -0
- package/include/numkong/each/icelake.h +272 -0
- package/include/numkong/each/neon.h +1104 -0
- package/include/numkong/each/neonbfdot.h +212 -0
- package/include/numkong/each/neonhalf.h +410 -0
- package/include/numkong/each/rvv.h +1121 -0
- package/include/numkong/each/sapphire.h +477 -0
- package/include/numkong/each/serial.h +260 -0
- package/include/numkong/each/skylake.h +1562 -0
- package/include/numkong/each.h +2146 -0
- package/include/numkong/each.hpp +434 -0
- package/include/numkong/geospatial/README.md +147 -0
- package/include/numkong/geospatial/haswell.h +593 -0
- package/include/numkong/geospatial/neon.h +571 -0
- package/include/numkong/geospatial/rvv.h +701 -0
- package/include/numkong/geospatial/serial.h +309 -0
- package/include/numkong/geospatial/skylake.h +577 -0
- package/include/numkong/geospatial/v128relaxed.h +613 -0
- package/include/numkong/geospatial.h +453 -0
- package/include/numkong/geospatial.hpp +235 -0
- package/include/numkong/matrix.hpp +336 -0
- package/include/numkong/maxsim/README.md +187 -0
- package/include/numkong/maxsim/alder.h +511 -0
- package/include/numkong/maxsim/genoa.h +115 -0
- package/include/numkong/maxsim/haswell.h +553 -0
- package/include/numkong/maxsim/icelake.h +480 -0
- package/include/numkong/maxsim/neonsdot.h +394 -0
- package/include/numkong/maxsim/sapphireamx.h +877 -0
- package/include/numkong/maxsim/serial.h +490 -0
- package/include/numkong/maxsim/sme.h +929 -0
- package/include/numkong/maxsim/v128relaxed.h +280 -0
- package/include/numkong/maxsim.h +571 -0
- package/include/numkong/maxsim.hpp +133 -0
- package/include/numkong/mesh/README.md +227 -0
- package/include/numkong/mesh/haswell.h +2235 -0
- package/include/numkong/mesh/neon.h +1329 -0
- package/include/numkong/mesh/neonbfdot.h +842 -0
- package/include/numkong/mesh/neonhalf.h +616 -0
- package/include/numkong/mesh/rvv.h +916 -0
- package/include/numkong/mesh/serial.h +742 -0
- package/include/numkong/mesh/skylake.h +1135 -0
- package/include/numkong/mesh/v128relaxed.h +1052 -0
- package/include/numkong/mesh.h +652 -0
- package/include/numkong/mesh.hpp +762 -0
- package/include/numkong/numkong.h +78 -0
- package/include/numkong/numkong.hpp +57 -0
- package/include/numkong/probability/README.md +173 -0
- package/include/numkong/probability/haswell.h +267 -0
- package/include/numkong/probability/neon.h +225 -0
- package/include/numkong/probability/rvv.h +409 -0
- package/include/numkong/probability/serial.h +169 -0
- package/include/numkong/probability/skylake.h +324 -0
- package/include/numkong/probability.h +383 -0
- package/include/numkong/probability.hpp +120 -0
- package/include/numkong/random.h +50 -0
- package/include/numkong/random.hpp +285 -0
- package/include/numkong/reduce/README.md +547 -0
- package/include/numkong/reduce/alder.h +632 -0
- package/include/numkong/reduce/genoa.h +201 -0
- package/include/numkong/reduce/haswell.h +3783 -0
- package/include/numkong/reduce/icelake.h +549 -0
- package/include/numkong/reduce/neon.h +3841 -0
- package/include/numkong/reduce/neonbfdot.h +353 -0
- package/include/numkong/reduce/neonfhm.h +665 -0
- package/include/numkong/reduce/neonhalf.h +157 -0
- package/include/numkong/reduce/neonsdot.h +357 -0
- package/include/numkong/reduce/rvv.h +3407 -0
- package/include/numkong/reduce/serial.h +757 -0
- package/include/numkong/reduce/sierra.h +338 -0
- package/include/numkong/reduce/skylake.h +3792 -0
- package/include/numkong/reduce/v128relaxed.h +2302 -0
- package/include/numkong/reduce.h +1597 -0
- package/include/numkong/reduce.hpp +633 -0
- package/include/numkong/scalar/README.md +89 -0
- package/include/numkong/scalar/haswell.h +113 -0
- package/include/numkong/scalar/neon.h +122 -0
- package/include/numkong/scalar/neonhalf.h +70 -0
- package/include/numkong/scalar/rvv.h +211 -0
- package/include/numkong/scalar/sapphire.h +63 -0
- package/include/numkong/scalar/serial.h +332 -0
- package/include/numkong/scalar/v128relaxed.h +56 -0
- package/include/numkong/scalar.h +683 -0
- package/include/numkong/set/README.md +179 -0
- package/include/numkong/set/haswell.h +334 -0
- package/include/numkong/set/icelake.h +485 -0
- package/include/numkong/set/neon.h +364 -0
- package/include/numkong/set/rvv.h +226 -0
- package/include/numkong/set/rvvbb.h +117 -0
- package/include/numkong/set/serial.h +174 -0
- package/include/numkong/set/sve.h +185 -0
- package/include/numkong/set/v128relaxed.h +240 -0
- package/include/numkong/set.h +457 -0
- package/include/numkong/set.hpp +114 -0
- package/include/numkong/sets/README.md +149 -0
- package/include/numkong/sets/haswell.h +63 -0
- package/include/numkong/sets/icelake.h +66 -0
- package/include/numkong/sets/neon.h +61 -0
- package/include/numkong/sets/serial.h +43 -0
- package/include/numkong/sets/smebi32.h +1099 -0
- package/include/numkong/sets/v128relaxed.h +58 -0
- package/include/numkong/sets.h +339 -0
- package/include/numkong/sparse/README.md +156 -0
- package/include/numkong/sparse/icelake.h +463 -0
- package/include/numkong/sparse/neon.h +288 -0
- package/include/numkong/sparse/serial.h +117 -0
- package/include/numkong/sparse/sve2.h +507 -0
- package/include/numkong/sparse/turin.h +322 -0
- package/include/numkong/sparse.h +363 -0
- package/include/numkong/sparse.hpp +113 -0
- package/include/numkong/spatial/README.md +435 -0
- package/include/numkong/spatial/alder.h +607 -0
- package/include/numkong/spatial/genoa.h +290 -0
- package/include/numkong/spatial/haswell.h +960 -0
- package/include/numkong/spatial/icelake.h +586 -0
- package/include/numkong/spatial/neon.h +773 -0
- package/include/numkong/spatial/neonbfdot.h +165 -0
- package/include/numkong/spatial/neonhalf.h +118 -0
- package/include/numkong/spatial/neonsdot.h +261 -0
- package/include/numkong/spatial/rvv.h +984 -0
- package/include/numkong/spatial/rvvbf16.h +123 -0
- package/include/numkong/spatial/rvvhalf.h +117 -0
- package/include/numkong/spatial/sapphire.h +343 -0
- package/include/numkong/spatial/serial.h +346 -0
- package/include/numkong/spatial/sierra.h +323 -0
- package/include/numkong/spatial/skylake.h +606 -0
- package/include/numkong/spatial/sve.h +224 -0
- package/include/numkong/spatial/svebfdot.h +122 -0
- package/include/numkong/spatial/svehalf.h +109 -0
- package/include/numkong/spatial/v128relaxed.h +717 -0
- package/include/numkong/spatial.h +1425 -0
- package/include/numkong/spatial.hpp +183 -0
- package/include/numkong/spatials/README.md +580 -0
- package/include/numkong/spatials/alder.h +94 -0
- package/include/numkong/spatials/genoa.h +94 -0
- package/include/numkong/spatials/haswell.h +219 -0
- package/include/numkong/spatials/icelake.h +113 -0
- package/include/numkong/spatials/neon.h +109 -0
- package/include/numkong/spatials/neonbfdot.h +60 -0
- package/include/numkong/spatials/neonfhm.h +92 -0
- package/include/numkong/spatials/neonhalf.h +58 -0
- package/include/numkong/spatials/neonsdot.h +109 -0
- package/include/numkong/spatials/rvv.h +1960 -0
- package/include/numkong/spatials/sapphireamx.h +1149 -0
- package/include/numkong/spatials/serial.h +226 -0
- package/include/numkong/spatials/sierra.h +96 -0
- package/include/numkong/spatials/skylake.h +184 -0
- package/include/numkong/spatials/sme.h +1901 -0
- package/include/numkong/spatials/smef64.h +465 -0
- package/include/numkong/spatials/v128relaxed.h +240 -0
- package/include/numkong/spatials.h +3021 -0
- package/include/numkong/spatials.hpp +508 -0
- package/include/numkong/tensor.hpp +1592 -0
- package/include/numkong/trigonometry/README.md +184 -0
- package/include/numkong/trigonometry/haswell.h +652 -0
- package/include/numkong/trigonometry/neon.h +639 -0
- package/include/numkong/trigonometry/rvv.h +699 -0
- package/include/numkong/trigonometry/serial.h +703 -0
- package/include/numkong/trigonometry/skylake.h +721 -0
- package/include/numkong/trigonometry/v128relaxed.h +666 -0
- package/include/numkong/trigonometry.h +467 -0
- package/include/numkong/trigonometry.hpp +166 -0
- package/include/numkong/types.h +1384 -0
- package/include/numkong/types.hpp +5603 -0
- package/include/numkong/vector.hpp +698 -0
- package/javascript/README.md +246 -0
- package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
- package/javascript/dist/cjs/numkong-wasm.js +617 -0
- package/javascript/dist/cjs/numkong.d.ts +343 -0
- package/javascript/dist/cjs/numkong.js +523 -0
- package/javascript/dist/cjs/package.json +3 -0
- package/javascript/dist/cjs/types.d.ts +284 -0
- package/javascript/dist/cjs/types.js +653 -0
- package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
- package/javascript/dist/esm/numkong-wasm.js +595 -0
- package/javascript/dist/esm/numkong.d.ts +343 -0
- package/javascript/dist/esm/numkong.js +452 -0
- package/javascript/dist/esm/package.json +3 -0
- package/javascript/dist/esm/types.d.ts +284 -0
- package/javascript/dist/esm/types.js +630 -0
- package/javascript/dist-package-cjs.json +3 -0
- package/javascript/dist-package-esm.json +3 -0
- package/javascript/node-gyp-build.d.ts +1 -0
- package/javascript/numkong-wasm.ts +756 -0
- package/javascript/numkong.c +689 -0
- package/javascript/numkong.ts +575 -0
- package/javascript/tsconfig-base.json +39 -0
- package/javascript/tsconfig-cjs.json +8 -0
- package/javascript/tsconfig-esm.json +8 -0
- package/javascript/types.ts +674 -0
- package/package.json +87 -0
|
@@ -0,0 +1,425 @@
|
|
|
1
|
+
# Vector-Vector Dot Products in NumKong
|
|
2
|
+
|
|
3
|
+
NumKong implements dot products for every numeric type supported by the library, as the most important building block of higher-level functionality for vectors and higher-rank tensors.
|
|
4
|
+
|
|
5
|
+
Dot product for real numbers and integers is defined as:
|
|
6
|
+
|
|
7
|
+
```math
|
|
8
|
+
\text{dot}(a, b) = \sum_{i=0}^{n-1} a_i \cdot b_i
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
For complex numbers, the dot product expands via the distributive property of complex multiplication:
|
|
12
|
+
|
|
13
|
+
```math
|
|
14
|
+
\text{dot}(a, b) = \sum_{i=0}^{n-1} (a_{i,re} \cdot b_{i,re} - a_{i,im} \cdot b_{i,im}) + j \sum_{i=0}^{n-1} (a_{i,re} \cdot b_{i,im} + a_{i,im} \cdot b_{i,re})
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
The conjugate dot product negates the imaginary part of $b$:
|
|
18
|
+
|
|
19
|
+
```math
|
|
20
|
+
\text{vdot}(a, b) = \sum_{i=0}^{n-1} a_i \cdot \bar{b_i} = \sum_{i=0}^{n-1} (a_{i,re} \cdot b_{i,re} + a_{i,im} \cdot b_{i,im}) + j \sum_{i=0}^{n-1} (a_{i,im} \cdot b_{i,re} - a_{i,re} \cdot b_{i,im})
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
Where $\bar{b_i}$ is the complex conjugate of $b_i$.
|
|
24
|
+
Reformulating as Python pseudocode for interleaved real/imaginary scalar arrays:
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
def dot_real(a: List[number], b: List[number]) -> number:
|
|
28
|
+
return sum(ai * bi for ai, bi in zip(a, b))
|
|
29
|
+
|
|
30
|
+
def dot_complex(a: List[number], b: List[number]) -> Tuple[number, number]:
|
|
31
|
+
a_re, a_im = a[0::2], a[1::2]
|
|
32
|
+
b_re, b_im = b[0::2], b[1::2]
|
|
33
|
+
ab_re = sum(ar * br - ai * bi for ar, ai, br, bi in zip(a_re, a_im, b_re, b_im))
|
|
34
|
+
ab_im = sum(ar * bi + ai * br for ar, ai, br, bi in zip(a_re, a_im, b_re, b_im))
|
|
35
|
+
return ab_re, ab_im
|
|
36
|
+
|
|
37
|
+
def vdot_complex(a: List[number], b: List[number]) -> Tuple[number, number]:
|
|
38
|
+
a_re, a_im = a[0::2], a[1::2]
|
|
39
|
+
b_re, b_im = b[0::2], b[1::2]
|
|
40
|
+
ab_re = sum(ar * br + ai * bi for ar, ai, br, bi in zip(a_re, a_im, b_re, b_im))
|
|
41
|
+
ab_im = sum(ai * br - ar * bi for ar, ai, br, bi in zip(a_re, a_im, b_re, b_im))
|
|
42
|
+
return ab_re, ab_im
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Input & Output Types
|
|
46
|
+
|
|
47
|
+
Real and integer dot products:
|
|
48
|
+
|
|
49
|
+
| Input Type | Output Type | Description |
|
|
50
|
+
| ---------- | ----------- | ---------------------------------------------- |
|
|
51
|
+
| `f64` | `f64` | 64-bit IEEE 754 double precision |
|
|
52
|
+
| `f32` | `f32` | 32-bit IEEE 754 single precision |
|
|
53
|
+
| `f16` | `f32` | 16-bit IEEE 754 half precision, widened output |
|
|
54
|
+
| `bf16` | `f32` | 16-bit brain float, widened output |
|
|
55
|
+
| `e4m3` | `f32` | 8-bit Float8: 4 exponent, 3 mantissa bits |
|
|
56
|
+
| `e5m2` | `f32` | 8-bit Float8: 5 exponent, 2 mantissa bits |
|
|
57
|
+
| `e2m3` | `f32` | 8-bit MX format: 2 exponent, 3 mantissa bits |
|
|
58
|
+
| `e3m2` | `f32` | 8-bit MX format: 3 exponent, 2 mantissa bits |
|
|
59
|
+
| `i8` | `i32` | 8-bit signed integers |
|
|
60
|
+
| `u8` | `u32` | 8-bit unsigned integers |
|
|
61
|
+
| `i4` | `i32` | 4-bit signed integers, packed nibble pairs |
|
|
62
|
+
| `u4` | `u32` | 4-bit unsigned integers, packed nibble pairs |
|
|
63
|
+
| `u1` | `u32` | 1-bit binary packed octets, popcount of AND |
|
|
64
|
+
|
|
65
|
+
Complex dot products (both `dot` and `vdot`):
|
|
66
|
+
|
|
67
|
+
| Input Type | Output Type | Description |
|
|
68
|
+
| ---------- | ----------- | ------------------------------------------ |
|
|
69
|
+
| `f64c` | `f64c` | 64-bit complex pairs |
|
|
70
|
+
| `f32c` | `f32c` | 32-bit complex pairs |
|
|
71
|
+
| `f16c` | `f32c` | 16-bit complex pairs, widened output |
|
|
72
|
+
| `bf16c` | `f32c` | 16-bit brain complex pairs, widened output |
|
|
73
|
+
|
|
74
|
+
## Optimizations
|
|
75
|
+
|
|
76
|
+
### Compensated Arithmetic for Large Floats
|
|
77
|
+
|
|
78
|
+
`nk_dot_f64_serial` uses Neumaier compensated summation — tracking a correction term adjusted by magnitude comparison at each step.
|
|
79
|
+
`nk_dot_f64_haswell`, `nk_dot_f64_skylake`, `nk_dot_f64_sve` implement the Dot2 algorithm by Ogita, Rump, and Oishi: TwoProd via FMA captures the rounding error of each product exactly, and a TwoSum chain propagates it through the accumulator.
|
|
80
|
+
On SVE, the final horizontal reduction uses `svtbl` to extract upper halves at each tree level, applying TwoSum at every stage.
|
|
81
|
+
The serial path uses Neumaier because it processes one element at a time and can cheaply branch on magnitudes.
|
|
82
|
+
Dot2 avoids those branches entirely — TwoProd and TwoSum are pure arithmetic with no comparisons, mapping naturally to wide SIMD where branching per lane is impossible.
|
|
83
|
+
|
|
84
|
+
### Lookup Tables for Mini-Floats
|
|
85
|
+
|
|
86
|
+
`nk_dot_e2m3_haswell`, `nk_dot_e3m2_haswell`, `nk_dot_e2m3_skylake`, `nk_dot_e3m2_skylake` encode 32 MX format values into scaled integers via dual 16-entry LUTs loaded into vector registers.
|
|
87
|
+
The low 4 magnitude bits index `VPSHUFB`, bit 4 selects between the lower and upper table via blending, and the results feed into `VPMADDUBSW` + `VPMADDWD` chains with a final $\div 256$ scaling.
|
|
88
|
+
The Sapphire-specific MX implementation in `sapphire.h` replaces this with a single 64-entry signed LUT via `VPERMUTEX2VAR`, where the sign bit naturally selects between positive and negative tables.
|
|
89
|
+
That path accumulates in native Float16 via `VFMADD_PH` and flushes to Float32 every 128 elements to avoid overflow.
|
|
90
|
+
|
|
91
|
+
### Algebraic Domain Shifting
|
|
92
|
+
|
|
93
|
+
`nk_dot_i8_icelake`, `nk_dot_u8_icelake` work around `VPDPBUSD` requiring UInt8 × Int8 operands.
|
|
94
|
+
For Int8 × Int8, one operand is XORed with `0x80` to shift to unsigned, and the correction $128 \cdot \sum b_i$ is computed via `VPSADBW`, which runs on port 5 and avoids contention with `VPDPBUSD` on ports 0-1.
|
|
95
|
+
`nk_dot_i4_icelake` extends this to packed nibbles using the identity $(a'-8)(b'-8) = a' b' - 8(a'+b') + 64$ — two `VPDPBUSD` calls handle low and high nibbles separately, with SAD-based correction.
|
|
96
|
+
`nk_dot_i8_v128relaxed`, `nk_dot_u8_v128relaxed` face an even tighter constraint: WASM's `i32x4_relaxed_dot_i8x16_i7x16_add` computes Int8 × Int7, so the sign bit of one operand must be masked off entirely.
|
|
97
|
+
For Int8 × Int8, the sign bit of $b$ is cleared to produce a 7-bit value, and a windowed correction $-128 \cdot \sum_{b_i < 0} a_i$ is accumulated in Int16 and flushed every 127 iterations to prevent overflow.
|
|
98
|
+
For UInt8 × UInt8, $b$ is XORed with `0x80` to shift into signed range, same as Ice Lake, with the correction $128 \cdot \sum a_i$ computed via pairwise widening adds.
|
|
99
|
+
|
|
100
|
+
### Widening Fusion Through BFloat16 on x86
|
|
101
|
+
|
|
102
|
+
`nk_dot_e4m3_genoa`, `nk_dot_e5m2_genoa` convert FP8 values to BF16, then accumulate via `VDPBF16PS` — repurposing Genoa's BF16 dot-product hardware for types it was never designed for.
|
|
103
|
+
Each `VDPBF16PS` fuses two BF16 multiply-adds per 32-bit lane at 6-cycle throughput.
|
|
104
|
+
`nk_dot_bf16c_genoa` uses the same instruction for complex BF16, preparing operands with `VPSHUFB` for lane swapping and `VPXORD` with `0x80000000` for sign flips before feeding into `VDPBF16PS`.
|
|
105
|
+
|
|
106
|
+
### Deferred Sign-Flip in Complex Dot Products
|
|
107
|
+
|
|
108
|
+
The Haswell BFloat16Complex/Float16Complex/Float32Complex kernels compute $\sum (a_r b_r - a_i b_i)$ without per-pair subtraction.
|
|
109
|
+
Instead, two accumulators collect interleaved products $[a_r b_r, a_i b_i, \ldots]$ and $[a_r b_i, a_i b_r, \ldots]$, and a post-loop XOR flips the sign of every odd lane to produce the subtraction.
|
|
110
|
+
This gives one FMA per accumulator per iteration, but each lane grows $O(n)$ while the true result is $O(\sqrt{n})$.
|
|
111
|
+
The Float32Complex kernel absorbs this via Float64 accumulators; Genoa's `VDPBF16PS` and ARM's `FMLSL` pair terms naturally.
|
|
112
|
+
For BFloat16Complex/Float16Complex on Haswell the accumulator is Float32, so the $O(\log n)$ precision loss from lane separation is visible in max ULP at large $n$, though mean ULP remains low.
|
|
113
|
+
|
|
114
|
+
### Widening Fusion Through Float16 on Arm
|
|
115
|
+
|
|
116
|
+
`nk_dot_f16_neonfhm`, `nk_dot_f16c_neonfhm` use the ARMv8.4-FHM instructions `FMLAL`/`FMLSL`, which fuse FP16-to-FP32 conversion with multiply-accumulate in a single operation.
|
|
117
|
+
`vfmlalq_low_f16` and `vfmlalq_high_f16` process the lower and upper 4 elements of an 8-wide FP16 vector respectively.
|
|
118
|
+
For complex dot products, `FMLSL` provides the subtraction path $a_{re} b_{im} - a_{im} b_{re}$ without a separate negate step.
|
|
119
|
+
|
|
120
|
+
### Widening Chains on RISC-V
|
|
121
|
+
|
|
122
|
+
`nk_dot_i8_rvv`, `nk_dot_u8_rvv` use `vwmul` for Int8 × Int8 → Int16 widening multiply followed by `vwadd` to widen-accumulate into Int32 — a two-stage chain that naturally prevents overflow.
|
|
123
|
+
`nk_dot_bf16_rvvbf16` uses the Zvfbfwma extension's `vfwmaccbf16` for fused BFloat16 × BFloat16 → Float32 widening multiply-accumulate.
|
|
124
|
+
`nk_dot_e4m3_rvvbf16`, `nk_dot_e5m2_rvvbf16` convert Float8 to BFloat16 via 256-entry LUTs, then feed the same `vfwmaccbf16` path.
|
|
125
|
+
|
|
126
|
+
## Performance
|
|
127
|
+
|
|
128
|
+
The following performance tables are produced by manually re-running the included internal tools, `nk_test` and `nk_bench`, to measure both accuracy and throughput at different input shapes.
|
|
129
|
+
The input size is controlled by the `NK_DENSE_DIMENSIONS` environment variable and set to 256, 1024, and 4096 elements.
|
|
130
|
+
Throughput is measured in gb/s as the number of bytes read per second, amortized over a large batch of vector pairs.
|
|
131
|
+
Accuracy is reported as mean ULP (units in last place) unless noted otherwise — the average number of representable floating-point values between the result and the exact answer.
|
|
132
|
+
Rows marked `🧩` use external BLAS baselines rather than NumKong kernels.
|
|
133
|
+
Each kernel runs for at least 20 seconds per configuration.
|
|
134
|
+
Benchmark threads are pinned to specific cores; on machines with heterogeneous core types (e.g., Apple P/E cores), only the fastest cores are used.
|
|
135
|
+
Workloads that significantly degrade CPU frequencies (Intel AMX, Apple SME) run in separate passes to avoid affecting throughput measurements of other kernels.
|
|
136
|
+
|
|
137
|
+
### Intel Sapphire Rapids
|
|
138
|
+
|
|
139
|
+
#### Native
|
|
140
|
+
|
|
141
|
+
| Kernel | 256 | 1024 | 4096 |
|
|
142
|
+
| :---------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
143
|
+
| __f64c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
144
|
+
| `dot_f64c_with_blas` 🧩 | 29.1 gb/s, 25 ulp | 27.6 gb/s, 97 ulp | 13.6 gb/s, 32 ulp |
|
|
145
|
+
| `vdot_f64c_with_blas` 🧩 | 29.1 gb/s, 18 ulp | 27.9 gb/s, 17 ulp | 15.3 gb/s, 25 ulp |
|
|
146
|
+
| `nk_dot_f64c_serial` | 5.45 gb/s, 3.9 ulp | 6.49 gb/s, 9.0 ulp | 6.84 gb/s, 2.9 ulp |
|
|
147
|
+
| `nk_vdot_f64c_serial` | 5.47 gb/s, 4.6 ulp | 6.41 gb/s, 1.6 ulp | 6.76 gb/s, 2.2 ulp |
|
|
148
|
+
| `nk_dot_f64c_skylake` | 23.8 gb/s, 0 ulp | 23.4 gb/s, 0 ulp | 11.8 gb/s, 0 ulp |
|
|
149
|
+
| `nk_vdot_f64c_skylake` | 23.6 gb/s, 0 ulp | 23.7 gb/s, 0 ulp | 11.6 gb/s, 0 ulp |
|
|
150
|
+
| __f32c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
151
|
+
| `dot_f32c_with_blas` 🧩 | 28.7 gb/s, 8.6 ulp | 29.8 gb/s, 13 ulp | 15.8 gb/s, 19 ulp |
|
|
152
|
+
| `vdot_f32c_with_blas` 🧩 | 29.2 gb/s, 11 ulp | 30.2 gb/s, 14 ulp | 15.7 gb/s, 21 ulp |
|
|
153
|
+
| `nk_dot_f32c_serial` | 9.46 gb/s, 0 ulp | 9.82 gb/s, 0 ulp | 9.71 gb/s, 0 ulp |
|
|
154
|
+
| `nk_vdot_f32c_serial` | 9.64 gb/s, 0 ulp | 9.95 gb/s, 0 ulp | 10.1 gb/s, 0 ulp |
|
|
155
|
+
| `nk_dot_f32c_haswell` | 22.4 gb/s, 0 ulp | 22.2 gb/s, 0 ulp | 12.6 gb/s, 0 ulp |
|
|
156
|
+
| `nk_vdot_f32c_haswell` | 22.4 gb/s, 0 ulp | 21.8 gb/s, 0 ulp | 14.7 gb/s, 0 ulp |
|
|
157
|
+
| `nk_dot_f32c_skylake` | 25.6 gb/s, 0 ulp | 27.2 gb/s, 0 ulp | 17.0 gb/s, 0 ulp |
|
|
158
|
+
| `nk_vdot_f32c_skylake` | 27.8 gb/s, 0 ulp | 27.4 gb/s, 0 ulp | 18.8 gb/s, 0 ulp |
|
|
159
|
+
| __bf16c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
160
|
+
| `nk_dot_bf16c_serial` | 0.628 gb/s, 0.1 ulp | 0.627 gb/s, 2.3 ulp | 0.626 gb/s, 7.9 ulp |
|
|
161
|
+
| `nk_vdot_bf16c_serial` | 0.622 gb/s, 0.2 ulp | 0.624 gb/s, 2.1 ulp | 0.627 gb/s, 11.2 ulp |
|
|
162
|
+
| `nk_dot_bf16c_haswell` | 21.5 gb/s, 0.1 ulp | 18.5 gb/s, 1.3 ulp | 18.4 gb/s, 3.4 ulp |
|
|
163
|
+
| `nk_vdot_bf16c_haswell` | 21.9 gb/s, 0.8 ulp | 19.0 gb/s, 2.0 ulp | 18.5 gb/s, 4.5 ulp |
|
|
164
|
+
| `nk_dot_bf16c_genoa` | 37.9 gb/s, 0 ulp | 30.3 gb/s, 1.1 ulp | 29.5 gb/s, 2.8 ulp |
|
|
165
|
+
| `nk_vdot_bf16c_genoa` | 36.1 gb/s, 0.7 ulp | 30.2 gb/s, 1.2 ulp | 30.2 gb/s, 3.3 ulp |
|
|
166
|
+
| __f16c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
167
|
+
| `nk_dot_f16c_serial` | 2.02 gb/s, 14.4 ulp | 2.00 gb/s, 27.3 ulp | 2.02 gb/s, 34.0 ulp |
|
|
168
|
+
| `nk_vdot_f16c_serial` | 1.67 gb/s, 15.0 ulp | 1.64 gb/s, 26.3 ulp | 1.64 gb/s, 34.2 ulp |
|
|
169
|
+
| `nk_dot_f16c_haswell` | 23.9 gb/s, 12.7 ulp | 19.4 gb/s, 22.3 ulp | 19.3 gb/s, 40.1 ulp |
|
|
170
|
+
| `nk_vdot_f16c_haswell` | 24.0 gb/s, 11.1 ulp | 20.0 gb/s, 17.4 ulp | 17.1 gb/s, 29.2 ulp |
|
|
171
|
+
| __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
172
|
+
| `dot_f64_with_blas` 🧩 | 27.8 gb/s, 6.9 ulp | 30.1 gb/s, 9.3 ulp | 15.7 gb/s, 20 ulp |
|
|
173
|
+
| `nk_dot_f64_serial` | 5.41 gb/s, 2.2 ulp | 6.31 gb/s, 2.0 ulp | 6.77 gb/s, 3.3 ulp |
|
|
174
|
+
| `nk_dot_f64_haswell` | 21.9 gb/s, 0 ulp | 26.1 gb/s, 0 ulp | 12.4 gb/s, 0 ulp |
|
|
175
|
+
| `nk_dot_f64_skylake` | 23.9 gb/s, 0 ulp | 27.0 gb/s, 0 ulp | 16.4 gb/s, 0 ulp |
|
|
176
|
+
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
177
|
+
| `dot_f32_with_blas` 🧩 | 47.8 gb/s, 14 ulp | 30.7 gb/s, 14 ulp | 29.7 gb/s, 15 ulp |
|
|
178
|
+
| `nk_dot_f32_serial` | 11.3 gb/s, 0 ulp | 11.7 gb/s, 0 ulp | 10.7 gb/s, 0 ulp |
|
|
179
|
+
| `nk_dot_f32_haswell` | 28.0 gb/s, 0 ulp | 23.6 gb/s, 0 ulp | 21.3 gb/s, 0 ulp |
|
|
180
|
+
| `nk_dot_f32_skylake` | 36.3 gb/s, 0 ulp | 29.2 gb/s, 0 ulp | 23.7 gb/s, 0 ulp |
|
|
181
|
+
| __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
182
|
+
| `nk_dot_bf16_serial` | 0.655 gb/s, 0 ulp | 0.644 gb/s, 0.6 ulp | 0.651 gb/s, 5.2 ulp |
|
|
183
|
+
| `nk_dot_bf16_haswell` | 30.1 gb/s, 0 ulp | 20.3 gb/s, 0.2 ulp | 19.4 gb/s, 41.3 ulp |
|
|
184
|
+
| `nk_dot_bf16_skylake` | 53.6 gb/s, 0 ulp | 30.4 gb/s, 0.2 ulp | 29.6 gb/s, 21.8 ulp |
|
|
185
|
+
| `nk_dot_bf16_genoa` | 88.1 gb/s, 0 ulp | 31.6 gb/s, 0.2 ulp | 31.1 gb/s, 2.2 ulp |
|
|
186
|
+
| __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
187
|
+
| `nk_dot_f16_serial` | 1.38 gb/s, 11.5 ulp | 1.37 gb/s, 33.7 ulp | 1.32 gb/s, 59.7 ulp |
|
|
188
|
+
| `nk_dot_f16_haswell` | 30.6 gb/s, 7.0 ulp | 23.3 gb/s, 14.0 ulp | 20.0 gb/s, 29.8 ulp |
|
|
189
|
+
| `nk_dot_f16_skylake` | 54.4 gb/s, 6.2 ulp | 31.4 gb/s, 8.6 ulp | 30.0 gb/s, 22.8 ulp |
|
|
190
|
+
| __e5m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
191
|
+
| `nk_dot_e5m2_serial` | 1.99 gb/s, 0 ulp | 2.11 gb/s, 0 ulp | 2.13 gb/s, 0 ulp |
|
|
192
|
+
| `nk_dot_e5m2_haswell` | 4.90 gb/s, 0 ulp | 4.87 gb/s, 0 ulp | 4.56 gb/s, 0 ulp |
|
|
193
|
+
| `nk_dot_e5m2_skylake` | 6.34 gb/s, 0 ulp | 6.45 gb/s, 0 ulp | 6.17 gb/s, 0 ulp |
|
|
194
|
+
| `nk_dot_e5m2_genoa` | 12.6 gb/s, 0 ulp | 12.7 gb/s, 0 ulp | 12.8 gb/s, 0 ulp |
|
|
195
|
+
| __e4m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
196
|
+
| `nk_dot_e4m3_serial` | 0.797 gb/s, 0 ulp | 0.801 gb/s, 0 ulp | 0.816 gb/s, 0 ulp |
|
|
197
|
+
| `nk_dot_e4m3_haswell` | 3.25 gb/s, 0 ulp | 3.25 gb/s, 0 ulp | 2.53 gb/s, 0 ulp |
|
|
198
|
+
| `nk_dot_e4m3_skylake` | 4.99 gb/s, 0 ulp | 5.03 gb/s, 0 ulp | 4.94 gb/s, 0 ulp |
|
|
199
|
+
| `nk_dot_e4m3_genoa` | 12.4 gb/s, 0 ulp | 13.0 gb/s, 0 ulp | 12.7 gb/s, 0 ulp |
|
|
200
|
+
| __e3m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
201
|
+
| `nk_dot_e3m2_serial` | 2.02 gb/s, 0 ulp | 2.08 gb/s, 0 ulp | 2.14 gb/s, 0 ulp |
|
|
202
|
+
| `nk_dot_e3m2_haswell` | 11.9 gb/s, 0 ulp | 12.0 gb/s, 0 ulp | 11.5 gb/s, 0 ulp |
|
|
203
|
+
| `nk_dot_e3m2_icelake` | 22.6 gb/s, 0 ulp | 24.1 gb/s, 0 ulp | 22.5 gb/s, 0 ulp |
|
|
204
|
+
| __e2m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
205
|
+
| `nk_dot_e2m3_serial` | 2.07 gb/s, 0 ulp | 2.05 gb/s, 0 ulp | 2.14 gb/s, 0 ulp |
|
|
206
|
+
| `nk_dot_e2m3_haswell` | 20.0 gb/s, 0 ulp | 19.5 gb/s, 0 ulp | 18.7 gb/s, 0 ulp |
|
|
207
|
+
| `nk_dot_e2m3_icelake` | 56.9 gb/s, 0 ulp | 43.1 gb/s, 0 ulp | 30.1 gb/s, 0 ulp |
|
|
208
|
+
| `nk_dot_e2m3_alder` | 29.8 gb/s, 0 ulp | 30.2 gb/s, 0 ulp | 25.6 gb/s, 0 ulp |
|
|
209
|
+
| __i8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
210
|
+
| `nk_dot_i8_serial` | 17.4 gb/s | 17.2 gb/s | 16.0 gb/s |
|
|
211
|
+
| `nk_dot_i8_haswell` | 33.4 gb/s | 23.5 gb/s | 24.9 gb/s |
|
|
212
|
+
| `nk_dot_i8_skylake` | 53.6 gb/s | 39.9 gb/s | 29.7 gb/s |
|
|
213
|
+
| `nk_dot_i8_icelake` | 63.3 gb/s | 49.5 gb/s | 30.4 gb/s |
|
|
214
|
+
| `nk_dot_i8_alder` | 43.8 gb/s | 43.0 gb/s | 30.4 gb/s |
|
|
215
|
+
| __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
216
|
+
| `nk_dot_u8_serial` | 17.4 gb/s | 17.1 gb/s | 16.3 gb/s |
|
|
217
|
+
| `nk_dot_u8_haswell` | 32.2 gb/s | 37.5 gb/s | 28.3 gb/s |
|
|
218
|
+
| `nk_dot_u8_skylake` | 54.6 gb/s | 41.0 gb/s | 28.4 gb/s |
|
|
219
|
+
| `nk_dot_u8_icelake` | 74.4 gb/s | 48.4 gb/s | 30.3 gb/s |
|
|
220
|
+
| `nk_dot_u8_alder` | 54.2 gb/s | 43.9 gb/s | 32.3 gb/s |
|
|
221
|
+
| __i4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
222
|
+
| `nk_dot_i4_serial` | 9.37 gb/s | 11.8 gb/s | 11.8 gb/s |
|
|
223
|
+
| `nk_dot_i4_haswell` | 8.39 gb/s | 8.47 gb/s | 8.30 gb/s |
|
|
224
|
+
| `nk_dot_i4_icelake` | 24.9 gb/s | 35.9 gb/s | 25.8 gb/s |
|
|
225
|
+
| __u4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
226
|
+
| `nk_dot_u4_serial` | 10.6 gb/s | 12.0 gb/s | 11.9 gb/s |
|
|
227
|
+
| `nk_dot_u4_haswell` | 15.2 gb/s | 16.0 gb/s | 14.4 gb/s |
|
|
228
|
+
| `nk_dot_u4_icelake` | 49.6 gb/s | 58.3 gb/s | 29.6 gb/s |
|
|
229
|
+
| __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
230
|
+
| `nk_dot_u1_serial` | 3.92 gb/s | 5.04 gb/s | 4.97 gb/s |
|
|
231
|
+
| `nk_dot_u1_haswell` | 14.2 gb/s | 46.7 gb/s | 70.9 gb/s |
|
|
232
|
+
| `nk_dot_u1_icelake` | 21.1 gb/s | 70.9 gb/s | 109 gb/s |
|
|
233
|
+
|
|
234
|
+
#### WASM
|
|
235
|
+
|
|
236
|
+
Measured with Wasmtime v42 (Cranelift backend).
|
|
237
|
+
|
|
238
|
+
| Kernel | 256 | 1024 | 4096 |
|
|
239
|
+
| :------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
240
|
+
| __f64c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
241
|
+
| `nk_dot_f64c_serial` | 1.76 gb/s, 5.3 ulp | 2.43 gb/s, 3.3 ulp | 0.28 gb/s, 2.9 ulp |
|
|
242
|
+
| `nk_vdot_f64c_serial` | 1.02 gb/s, 3.5 ulp | 2.44 gb/s, 5.5 ulp | 0.15 gb/s, 2.2 ulp |
|
|
243
|
+
| `nk_dot_f64c_v128relaxed` | 2.80 gb/s, 37.8 ulp | 3.01 gb/s, 34.9 ulp | 0.21 gb/s, 167 ulp |
|
|
244
|
+
| `nk_vdot_f64c_v128relaxed` | 2.06 gb/s, 20.1 ulp | 2.87 gb/s, 51.4 ulp | 0.04 gb/s, 57.2 ulp |
|
|
245
|
+
| __f32c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
246
|
+
| `nk_dot_f32c_serial` | 1.26 gb/s, 0 ulp | 1.74 gb/s, 0 ulp | 0.08 gb/s, 0 ulp |
|
|
247
|
+
| `nk_vdot_f32c_serial` | 1.13 gb/s, 0 ulp | 1.78 gb/s, 0 ulp | 0.21 gb/s, 0 ulp |
|
|
248
|
+
| `nk_dot_f32c_v128relaxed` | 1.62 gb/s, 0 ulp | 1.92 gb/s, 0 ulp | 0.20 gb/s, 0 ulp |
|
|
249
|
+
| `nk_vdot_f32c_v128relaxed` | 1.66 gb/s, 0 ulp | 1.69 gb/s, 0 ulp | 0.13 gb/s, 0 ulp |
|
|
250
|
+
| __bf16c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
251
|
+
| `nk_dot_bf16c_serial` | 1.00 gb/s, 0.1 ulp | 2.29 gb/s, 1.7 ulp | 0.08 gb/s, 7.9 ulp |
|
|
252
|
+
| `nk_vdot_bf16c_serial` | 0.581 gb/s, 0.1 ulp | 0.919 gb/s, 2.9 ulp | 0.30 gb/s, 11.2 ulp |
|
|
253
|
+
| __f16c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
254
|
+
| `nk_dot_f16c_serial` | 1.39 gb/s, 12.6 ulp | 0.759 gb/s, 22.2 ulp | 0.23 gb/s, 34 ulp |
|
|
255
|
+
| `nk_vdot_f16c_serial` | 1.11 gb/s, 14.4 ulp | 0.828 gb/s, 41.8 ulp | 0.02 gb/s, 34.2 ulp |
|
|
256
|
+
| __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
257
|
+
| `nk_dot_f64_serial` | 1.53 gb/s, 3.1 ulp | 1.72 gb/s, 2.5 ulp | 0.20 gb/s, 3.3 ulp |
|
|
258
|
+
| `nk_dot_f64_v128relaxed` | 2.62 gb/s, 3.2 ulp | 2.11 gb/s, 3.6 ulp | 0.28 gb/s, 3.8 ulp |
|
|
259
|
+
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
260
|
+
| `nk_dot_f32_serial` | 1.95 gb/s, 0 ulp | 1.89 gb/s, 0 ulp | 0.28 gb/s, 0 ulp |
|
|
261
|
+
| `nk_dot_f32_v128relaxed` | 0.083 gb/s, 0 ulp | 1.61 gb/s, 0 ulp | 1.37 gb/s, 0 ulp |
|
|
262
|
+
| __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
263
|
+
| `nk_dot_bf16_serial` | 2.90 gb/s, 0 ulp | 2.27 gb/s, 0.5 ulp | 0.22 gb/s, 5.2 ulp |
|
|
264
|
+
| `nk_dot_bf16_v128relaxed` | 0.521 gb/s, 0 ulp | 2.30 gb/s, 0.3 ulp | 0.30 gb/s, 2.4 ulp |
|
|
265
|
+
| __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
266
|
+
| `nk_dot_f16_serial` | 0.648 gb/s, 13.5 ulp | 0.712 gb/s, 32 ulp | 0.08 gb/s, 59.7 ulp |
|
|
267
|
+
| `nk_dot_f16_v128relaxed` | 1.58 gb/s, 7.0 ulp | 1.05 gb/s, 30.8 ulp | 0.09 gb/s, 65.1 ulp |
|
|
268
|
+
| __e5m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
269
|
+
| `nk_dot_e5m2_serial` | 1.27 gb/s, 0 ulp | 0.679 gb/s, 0 ulp | 0.10 gb/s, 0 ulp |
|
|
270
|
+
| `nk_dot_e5m2_v128relaxed` | 0.970 gb/s, 0 ulp | 0.955 gb/s, 0 ulp | 0.17 gb/s, 0 ulp |
|
|
271
|
+
| __e4m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
272
|
+
| `nk_dot_e4m3_serial` | 0.312 gb/s, 0 ulp | 0.342 gb/s, 0 ulp | 0.12 gb/s, 0 ulp |
|
|
273
|
+
| `nk_dot_e4m3_v128relaxed` | 1.05 gb/s, 0 ulp | 0.721 gb/s, 0 ulp | 0.30 gb/s, 0 ulp |
|
|
274
|
+
| __e3m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
275
|
+
| `nk_dot_e3m2_serial` | 0.565 gb/s, 0 ulp | 0.552 gb/s, 0 ulp | 0.06 gb/s, 0 ulp |
|
|
276
|
+
| `nk_dot_e3m2_v128relaxed` | 0.670 gb/s, 0 ulp | 2.91 gb/s, 0 ulp | 0.24 gb/s, 0 ulp |
|
|
277
|
+
| __e2m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
278
|
+
| `nk_dot_e2m3_serial` | 0.584 gb/s, 0 ulp | 0.661 gb/s, 0 ulp | 0.07 gb/s, 0 ulp |
|
|
279
|
+
| `nk_dot_e2m3_v128relaxed` | 2.69 gb/s, 0 ulp | 0.131 gb/s, 0 ulp | 0.09 gb/s, 0 ulp |
|
|
280
|
+
| __i8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
281
|
+
| `nk_dot_i8_serial` | 1.17 gb/s | 1.17 gb/s | 0.29 gb/s |
|
|
282
|
+
| `nk_dot_i8_v128relaxed` | 1.71 gb/s | 0.896 gb/s | 0.24 gb/s |
|
|
283
|
+
| __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
284
|
+
| `nk_dot_u8_serial` | 1.16 gb/s | 0.658 gb/s | 0.30 gb/s |
|
|
285
|
+
| `nk_dot_u8_v128relaxed` | 0.873 gb/s | 0.997 gb/s | 0.15 gb/s |
|
|
286
|
+
| __i4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
287
|
+
| `nk_dot_i4_serial` | 0.217 gb/s | 0.226 gb/s | 0.28 gb/s |
|
|
288
|
+
| `nk_dot_i4_v128relaxed` | 1.53 gb/s | 2.87 gb/s | 0.24 gb/s |
|
|
289
|
+
| __u4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
290
|
+
| `nk_dot_u4_serial` | 0.303 gb/s | 0.250 gb/s | 0.003 gb/s |
|
|
291
|
+
| `nk_dot_u4_v128relaxed` | 0.126 gb/s | 2.70 gb/s | 0.08 gb/s |
|
|
292
|
+
| __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
293
|
+
| `nk_dot_u1_serial` | 1.95 gb/s | 1.53 gb/s | 0.09 gb/s |
|
|
294
|
+
| `nk_dot_u1_v128relaxed` | 0.548 gb/s | 1.88 gb/s | 0.13 gb/s |
|
|
295
|
+
|
|
296
|
+
### Apple M4
|
|
297
|
+
|
|
298
|
+
#### Native
|
|
299
|
+
|
|
300
|
+
| Kernel | 256 | 1024 | 4096 |
|
|
301
|
+
| :------------------------ | -----------------------: | -----------------------: | -----------------------: |
|
|
302
|
+
| __f64c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
303
|
+
| `nk_dot_f64c_serial` | 12.5 gb/s, 5 ulp | 12.4 gb/s, 3 ulp | 11.9 gb/s, 9.7 ulp |
|
|
304
|
+
| `nk_vdot_f64c_serial` | 12.4 gb/s, 4.2 ulp | 12.3 gb/s, 3.3 ulp | 11.7 gb/s, 3.3 ulp |
|
|
305
|
+
| `nk_dot_f64c_neon` | 8.06 gb/s, 0 ulp | 8.05 gb/s, 0 ulp | 7.85 gb/s, 0 ulp |
|
|
306
|
+
| `nk_vdot_f64c_neon` | 7.79 gb/s, 0 ulp | 8.05 gb/s, 0 ulp | 7.88 gb/s, 0 ulp |
|
|
307
|
+
| __f32c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
308
|
+
| `nk_dot_f32c_serial` | 12.9 gb/s, 0 ulp | 12.4 gb/s, 0 ulp | 11.9 gb/s, 0 ulp |
|
|
309
|
+
| `nk_vdot_f32c_serial` | 12.8 gb/s, 0 ulp | 12.4 gb/s, 0 ulp | 12.1 gb/s, 0 ulp |
|
|
310
|
+
| `nk_dot_f32c_neon` | 7.97 gb/s, 0 ulp | 7.26 gb/s, 0 ulp | 7.00 gb/s, 0 ulp |
|
|
311
|
+
| `nk_vdot_f32c_neon` | 8.29 gb/s, 0 ulp | 7.58 gb/s, 0 ulp | 7.36 gb/s, 0 ulp |
|
|
312
|
+
| __bf16c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
313
|
+
| `nk_dot_bf16c_serial` | 7.47 gb/s, 0.2 ulp | 6.33 gb/s, 2.8 ulp | 6.08 gb/s, 15.8 ulp |
|
|
314
|
+
| `nk_vdot_bf16c_serial` | 7.47 gb/s, 0.2 ulp | 6.43 gb/s, 2.6 ulp | 6.08 gb/s, 11.4 ulp |
|
|
315
|
+
| `nk_dot_bf16c_neonbfdot` | 12.9 gb/s, 0.1 ulp | 9.31 gb/s, 2 ulp | 8.56 gb/s, 8.8 ulp |
|
|
316
|
+
| `nk_vdot_bf16c_neonbfdot` | 12.9 gb/s, 0.1 ulp | 9.29 gb/s, 1.8 ulp | 8.56 gb/s, 8.8 ulp |
|
|
317
|
+
| __f16c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
318
|
+
| `nk_dot_f16c_serial` | 7.53 gb/s, 20.8 ulp | 6.34 gb/s, 64.1 ulp | 6.07 gb/s, 73.1 ulp |
|
|
319
|
+
| `nk_vdot_f16c_serial` | 7.53 gb/s, 24.8 ulp | 6.34 gb/s, 31.9 ulp | 6.07 gb/s, 137 ulp |
|
|
320
|
+
| `nk_dot_f16c_neonhalf` | 9.94 gb/s, 3.0 ulp | 7.94 gb/s, 6.5 ulp | 7.60 gb/s, 20.5 ulp |
|
|
321
|
+
| `nk_vdot_f16c_neonhalf` | 9.85 gb/s, 34.9 ulp | 7.79 gb/s, 40.7 ulp | 7.57 gb/s, 73.1 ulp |
|
|
322
|
+
| `nk_dot_f16c_neonfhm` | 9.39 gb/s, 3.0 ulp | 7.46 gb/s, 6.5 ulp | 7.19 gb/s, 20.5 ulp |
|
|
323
|
+
| `nk_vdot_f16c_neonfhm` | 9.75 gb/s, 31.4 ulp | 7.50 gb/s, 38.6 ulp | 7.29 gb/s, 67.6 ulp |
|
|
324
|
+
| __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
325
|
+
| `nk_dot_f64_serial` | 7.79 gb/s, 2.4 ulp | 7.79 gb/s, 175 ulp | 7.74 gb/s, 2.7 ulp |
|
|
326
|
+
| `nk_dot_f64_neon` | 14.8 gb/s, 0 ulp | 15.1 gb/s, 0 ulp | 14.7 gb/s, 0 ulp |
|
|
327
|
+
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
328
|
+
| `nk_dot_f32_serial` | 11.0 gb/s, 0 ulp | 7.77 gb/s, 0 ulp | 7.18 gb/s, 0 ulp |
|
|
329
|
+
| `nk_dot_f32_neon` | 9.23 gb/s, 0 ulp | 7.30 gb/s, 0 ulp | 6.96 gb/s, 0 ulp |
|
|
330
|
+
| __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
331
|
+
| `nk_dot_bf16_serial` | 5.76 gb/s, 0 ulp | 4.10 gb/s, 0.9 ulp | 3.62 gb/s, 6 ulp |
|
|
332
|
+
| `nk_dot_bf16_neonbfdot` | 35.1 gb/s, 0 ulp | 28.6 gb/s, 0.6 ulp | 22.9 gb/s, 4.5 ulp |
|
|
333
|
+
| __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
334
|
+
| `nk_dot_f16_serial` | 5.66 gb/s, 19 ulp | 3.99 gb/s, 31.1 ulp | 3.51 gb/s, 57.8 ulp |
|
|
335
|
+
| `nk_dot_f16_neonhalf` | 12.1 gb/s, 19.4 ulp | 9.18 gb/s, 21.5 ulp | 7.72 gb/s, 36.3 ulp |
|
|
336
|
+
| `nk_dot_f16_neonfhm` | 16.3 gb/s, 14.9 ulp | 10.7 gb/s, 26.7 ulp | 7.95 gb/s, 39.9 ulp |
|
|
337
|
+
| __e5m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
338
|
+
| `nk_dot_e5m2_serial` | 1.87 gb/s, 0 ulp | 1.76 gb/s, 0 ulp | 1.74 gb/s, 0 ulp |
|
|
339
|
+
| `nk_dot_e5m2_neon` | 7.15 gb/s, 0 ulp | 4.95 gb/s, 0 ulp | 4.23 gb/s, 0 ulp |
|
|
340
|
+
| __e4m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
341
|
+
| `nk_dot_e4m3_serial` | 0.874 gb/s, 0 ulp | 0.873 gb/s, 0 ulp | 0.871 gb/s, 0 ulp |
|
|
342
|
+
| `nk_dot_e4m3_neon` | 1.62 gb/s, 0 ulp | 1.62 gb/s, 0 ulp | 1.60 gb/s, 0 ulp |
|
|
343
|
+
| __e3m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
344
|
+
| `nk_dot_e3m2_serial` | 1.24 gb/s, 0 ulp | 1.15 gb/s, 0 ulp | 1.13 gb/s, 0 ulp |
|
|
345
|
+
| `nk_dot_e3m2_neonsdot` | 7.69 gb/s, 0 ulp | 7.64 gb/s, 0 ulp | 7.70 gb/s, 0 ulp |
|
|
346
|
+
| __e2m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
347
|
+
| `nk_dot_e2m3_serial` | 1.23 gb/s, 0 ulp | 1.15 gb/s, 0 ulp | 1.13 gb/s, 0 ulp |
|
|
348
|
+
| `nk_dot_e2m3_neonsdot` | 16.6 gb/s, 0 ulp | 16.8 gb/s, 0 ulp | 16.4 gb/s, 0 ulp |
|
|
349
|
+
| __i8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
350
|
+
| `nk_dot_i8_serial` | 43.2 gb/s | 48.5 gb/s | 48.0 gb/s |
|
|
351
|
+
| `nk_dot_i8_neonsdot` | 29.8 gb/s | 29.4 gb/s | 22.9 gb/s |
|
|
352
|
+
| __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
353
|
+
| `nk_dot_u8_serial` | 44.1 gb/s | 45.7 gb/s | 50.0 gb/s |
|
|
354
|
+
| `nk_dot_u8_neonsdot` | 30.6 gb/s | 28.4 gb/s | 22.3 gb/s |
|
|
355
|
+
| __i4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
356
|
+
| `nk_dot_i4_serial` | 11.9 gb/s | 12.4 gb/s | 12.6 gb/s |
|
|
357
|
+
| `nk_dot_i4_neonsdot` | 19.5 gb/s | 15.5 gb/s | 11.3 gb/s |
|
|
358
|
+
| __u4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
359
|
+
| `nk_dot_u4_serial` | 12.9 gb/s | 13.7 gb/s | 13.9 gb/s |
|
|
360
|
+
| `nk_dot_u4_neonsdot` | 21.7 gb/s | 16.1 gb/s | 11.4 gb/s |
|
|
361
|
+
| __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
362
|
+
| `nk_dot_u1_serial` | 3.32 gb/s | 3.56 gb/s | 3.58 gb/s |
|
|
363
|
+
| `nk_dot_u1_neon` | 11.2 gb/s | 21.8 gb/s | 27.4 gb/s |
|
|
364
|
+
|
|
365
|
+
#### WASM
|
|
366
|
+
|
|
367
|
+
Measured with Wasmtime v42 (Cranelift backend).
|
|
368
|
+
|
|
369
|
+
| Kernel | 256 | 1024 | 4096 |
|
|
370
|
+
| :------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
371
|
+
| __f64c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
372
|
+
| `nk_dot_f64c_serial` | 27.7 gb/s, 3.8 ulp | 24.1 gb/s, 3.9 ulp | 26.9 gb/s, 3.2 ulp |
|
|
373
|
+
| `nk_vdot_f64c_serial` | 5.87 gb/s, 3.8 ulp | 5.47 gb/s, 3.4 ulp | 5.66 gb/s, 15.1 ulp |
|
|
374
|
+
| `nk_dot_f64c_v128relaxed` | 45.0 gb/s, 26 ulp | 34.0 gb/s, 42 ulp | 35.9 gb/s, 88 ulp |
|
|
375
|
+
| `nk_vdot_f64c_v128relaxed` | 22.0 gb/s, 22.8 ulp | 19.0 gb/s, 37.3 ulp | 17.6 gb/s, 43.6 ulp |
|
|
376
|
+
| __f32c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
377
|
+
| `nk_dot_f32c_serial` | 21.3 gb/s, 0 ulp | 19.8 gb/s, 0 ulp | 20.4 gb/s, 0 ulp |
|
|
378
|
+
| `nk_vdot_f32c_serial` | 11.1 gb/s, 0 ulp | 10.7 gb/s, 0 ulp | 10.7 gb/s, 0 ulp |
|
|
379
|
+
| `nk_dot_f32c_v128relaxed` | 21.4 gb/s, 0 ulp | 17.2 gb/s, 0 ulp | 18.0 gb/s, 0 ulp |
|
|
380
|
+
| `nk_vdot_f32c_v128relaxed` | 10.4 gb/s, 0 ulp | 9.47 gb/s, 0 ulp | 8.71 gb/s, 0 ulp |
|
|
381
|
+
| __bf16c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
382
|
+
| `nk_dot_bf16c_serial` | 11.1 gb/s, 0.1 ulp | 11.2 gb/s, 2.5 ulp | 11.2 gb/s, 10 ulp |
|
|
383
|
+
| `nk_vdot_bf16c_serial` | 5.79 gb/s, 0.2 ulp | 5.68 gb/s, 2.1 ulp | 5.80 gb/s, 11.4 ulp |
|
|
384
|
+
| __f16c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
385
|
+
| `nk_dot_f16c_serial` | 3.52 gb/s, 13 ulp | 3.48 gb/s, 20 ulp | 3.49 gb/s, 90 ulp |
|
|
386
|
+
| `nk_vdot_f16c_serial` | 1.84 gb/s, 13.9 ulp | 1.79 gb/s, 35.5 ulp | 1.85 gb/s, 42.4 ulp |
|
|
387
|
+
| __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
388
|
+
| `nk_dot_f64_serial` | 22.2 gb/s, 2.4 ulp | 19.3 gb/s, 2.6 ulp | 21.1 gb/s, 2.2 ulp |
|
|
389
|
+
| `nk_dot_f64_v128relaxed` | 39.6 gb/s, 2.6 ulp | 41.0 gb/s, 3.2 ulp | 32.9 gb/s, 2.6 ulp |
|
|
390
|
+
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
391
|
+
| `nk_dot_f32_serial` | 17.9 gb/s, 16 ulp | 12.6 gb/s, 69 ulp | 12.7 gb/s, 104 ulp |
|
|
392
|
+
| `nk_dot_f32_v128relaxed` | 19.5 gb/s, 0 ulp | 17.5 gb/s, 0 ulp | 17.2 gb/s, 0 ulp |
|
|
393
|
+
| __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
394
|
+
| `nk_dot_bf16_serial` | 8.70 gb/s, 0 ulp | 6.36 gb/s, 0.6 ulp | 6.57 gb/s, 5.9 ulp |
|
|
395
|
+
| `nk_dot_bf16_v128relaxed` | 9.07 gb/s, 0 ulp | 7.98 gb/s, 0.4 ulp | 8.24 gb/s, 3.7 ulp |
|
|
396
|
+
| __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
397
|
+
| `nk_dot_f16_serial` | 3.15 gb/s, 16 ulp | 2.74 gb/s, 26 ulp | 3.14 gb/s, 53 ulp |
|
|
398
|
+
| `nk_dot_f16_v128relaxed` | 4.78 gb/s, 9.0 ulp | 4.70 gb/s, 23 ulp | 4.92 gb/s, 39 ulp |
|
|
399
|
+
| __e5m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
400
|
+
| `nk_dot_e5m2_serial` | 2.90 gb/s, 0 ulp | 2.51 gb/s, 0 ulp | 2.88 gb/s, 0 ulp |
|
|
401
|
+
| `nk_dot_e5m2_v128relaxed` | 3.05 gb/s, 0 ulp | 2.65 gb/s, 0 ulp | 2.99 gb/s, 0 ulp |
|
|
402
|
+
| __e4m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
403
|
+
| `nk_dot_e4m3_serial` | 0.903 gb/s, 0 ulp | 0.776 gb/s, 0 ulp | 0.874 gb/s, 0 ulp |
|
|
404
|
+
| `nk_dot_e4m3_v128relaxed` | 2.42 gb/s, 0 ulp | 2.12 gb/s, 0 ulp | 2.36 gb/s, 0 ulp |
|
|
405
|
+
| __e3m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
406
|
+
| `nk_dot_e3m2_serial` | 2.90 gb/s, 0 ulp | 2.53 gb/s, 0 ulp | 2.88 gb/s, 0 ulp |
|
|
407
|
+
| `nk_dot_e3m2_v128relaxed` | 11.8 gb/s, 0 ulp | 10.5 gb/s, 0 ulp | 11.7 gb/s, 0 ulp |
|
|
408
|
+
| __e2m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
409
|
+
| `nk_dot_e2m3_serial` | 2.90 gb/s, 0 ulp | 2.53 gb/s, 0 ulp | 2.88 gb/s, 0 ulp |
|
|
410
|
+
| `nk_dot_e2m3_v128relaxed` | 20.0 gb/s, 0 ulp | 20.0 gb/s, 0 ulp | 20.0 gb/s, 0 ulp |
|
|
411
|
+
| __i8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
412
|
+
| `nk_dot_i8_serial` | 21.7 gb/s | 16.8 gb/s | 16.3 gb/s |
|
|
413
|
+
| `nk_dot_i8_v128relaxed` | 42.0 gb/s | 47.7 gb/s | 46.3 gb/s |
|
|
414
|
+
| __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
415
|
+
| `nk_dot_u8_serial` | 21.4 gb/s | 16.8 gb/s | 16.3 gb/s |
|
|
416
|
+
| `nk_dot_u8_v128relaxed` | 43.8 gb/s | 51.2 gb/s | 49.2 gb/s |
|
|
417
|
+
| __i4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
418
|
+
| `nk_dot_i4_serial` | 0.984 gb/s | 0.824 gb/s | 0.957 gb/s |
|
|
419
|
+
| `nk_dot_i4_v128relaxed` | 14.8 gb/s | 15.0 gb/s | 17.9 gb/s |
|
|
420
|
+
| __u4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
421
|
+
| `nk_dot_u4_serial` | 0.988 gb/s | 0.957 gb/s | 0.959 gb/s |
|
|
422
|
+
| `nk_dot_u4_v128relaxed` | 30.5 gb/s | 27.5 gb/s | 31.4 gb/s |
|
|
423
|
+
| __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
424
|
+
| `nk_dot_u1_serial` | 4.85 gb/s | 5.38 gb/s | 5.67 gb/s |
|
|
425
|
+
| `nk_dot_u1_v128relaxed` | 21.8 gb/s | 29.8 gb/s | 52.0 gb/s |
|