numkong 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +495 -0
- package/binding.gyp +540 -0
- package/c/dispatch.h +512 -0
- package/c/dispatch_bf16.c +389 -0
- package/c/dispatch_bf16c.c +52 -0
- package/c/dispatch_e2m3.c +263 -0
- package/c/dispatch_e3m2.c +243 -0
- package/c/dispatch_e4m3.c +276 -0
- package/c/dispatch_e5m2.c +272 -0
- package/c/dispatch_f16.c +376 -0
- package/c/dispatch_f16c.c +58 -0
- package/c/dispatch_f32.c +378 -0
- package/c/dispatch_f32c.c +99 -0
- package/c/dispatch_f64.c +296 -0
- package/c/dispatch_f64c.c +98 -0
- package/c/dispatch_i16.c +96 -0
- package/c/dispatch_i32.c +89 -0
- package/c/dispatch_i4.c +150 -0
- package/c/dispatch_i64.c +86 -0
- package/c/dispatch_i8.c +289 -0
- package/c/dispatch_other.c +330 -0
- package/c/dispatch_u1.c +148 -0
- package/c/dispatch_u16.c +124 -0
- package/c/dispatch_u32.c +118 -0
- package/c/dispatch_u4.c +150 -0
- package/c/dispatch_u64.c +102 -0
- package/c/dispatch_u8.c +303 -0
- package/c/numkong.c +950 -0
- package/include/README.md +573 -0
- package/include/module.modulemap +129 -0
- package/include/numkong/attention/sapphireamx.h +1361 -0
- package/include/numkong/attention/sme.h +2066 -0
- package/include/numkong/attention.h +49 -0
- package/include/numkong/capabilities.h +748 -0
- package/include/numkong/cast/README.md +262 -0
- package/include/numkong/cast/haswell.h +975 -0
- package/include/numkong/cast/icelake.h +470 -0
- package/include/numkong/cast/neon.h +1192 -0
- package/include/numkong/cast/rvv.h +1021 -0
- package/include/numkong/cast/sapphire.h +262 -0
- package/include/numkong/cast/serial.h +2262 -0
- package/include/numkong/cast/skylake.h +856 -0
- package/include/numkong/cast/v128relaxed.h +180 -0
- package/include/numkong/cast.h +230 -0
- package/include/numkong/curved/README.md +223 -0
- package/include/numkong/curved/genoa.h +182 -0
- package/include/numkong/curved/haswell.h +276 -0
- package/include/numkong/curved/neon.h +205 -0
- package/include/numkong/curved/neonbfdot.h +212 -0
- package/include/numkong/curved/neonhalf.h +212 -0
- package/include/numkong/curved/rvv.h +305 -0
- package/include/numkong/curved/serial.h +207 -0
- package/include/numkong/curved/skylake.h +457 -0
- package/include/numkong/curved/smef64.h +506 -0
- package/include/numkong/curved.h +517 -0
- package/include/numkong/curved.hpp +144 -0
- package/include/numkong/dot/README.md +425 -0
- package/include/numkong/dot/alder.h +563 -0
- package/include/numkong/dot/genoa.h +315 -0
- package/include/numkong/dot/haswell.h +1688 -0
- package/include/numkong/dot/icelake.h +883 -0
- package/include/numkong/dot/neon.h +818 -0
- package/include/numkong/dot/neonbfdot.h +244 -0
- package/include/numkong/dot/neonfhm.h +360 -0
- package/include/numkong/dot/neonhalf.h +198 -0
- package/include/numkong/dot/neonsdot.h +508 -0
- package/include/numkong/dot/rvv.h +714 -0
- package/include/numkong/dot/rvvbb.h +72 -0
- package/include/numkong/dot/rvvbf16.h +123 -0
- package/include/numkong/dot/rvvhalf.h +129 -0
- package/include/numkong/dot/sapphire.h +141 -0
- package/include/numkong/dot/serial.h +838 -0
- package/include/numkong/dot/sierra.h +405 -0
- package/include/numkong/dot/skylake.h +1084 -0
- package/include/numkong/dot/sve.h +379 -0
- package/include/numkong/dot/svebfdot.h +74 -0
- package/include/numkong/dot/svehalf.h +123 -0
- package/include/numkong/dot/v128relaxed.h +1258 -0
- package/include/numkong/dot.h +1070 -0
- package/include/numkong/dot.hpp +94 -0
- package/include/numkong/dots/README.md +496 -0
- package/include/numkong/dots/alder.h +114 -0
- package/include/numkong/dots/genoa.h +94 -0
- package/include/numkong/dots/haswell.h +295 -0
- package/include/numkong/dots/icelake.h +171 -0
- package/include/numkong/dots/neon.h +120 -0
- package/include/numkong/dots/neonbfdot.h +58 -0
- package/include/numkong/dots/neonfhm.h +94 -0
- package/include/numkong/dots/neonhalf.h +57 -0
- package/include/numkong/dots/neonsdot.h +108 -0
- package/include/numkong/dots/rvv.h +2486 -0
- package/include/numkong/dots/sapphireamx.h +3973 -0
- package/include/numkong/dots/serial.h +2844 -0
- package/include/numkong/dots/sierra.h +97 -0
- package/include/numkong/dots/skylake.h +196 -0
- package/include/numkong/dots/sme.h +5372 -0
- package/include/numkong/dots/smebi32.h +461 -0
- package/include/numkong/dots/smef64.h +1318 -0
- package/include/numkong/dots/smehalf.h +47 -0
- package/include/numkong/dots/v128relaxed.h +294 -0
- package/include/numkong/dots.h +2804 -0
- package/include/numkong/dots.hpp +639 -0
- package/include/numkong/each/README.md +469 -0
- package/include/numkong/each/haswell.h +1658 -0
- package/include/numkong/each/icelake.h +272 -0
- package/include/numkong/each/neon.h +1104 -0
- package/include/numkong/each/neonbfdot.h +212 -0
- package/include/numkong/each/neonhalf.h +410 -0
- package/include/numkong/each/rvv.h +1121 -0
- package/include/numkong/each/sapphire.h +477 -0
- package/include/numkong/each/serial.h +260 -0
- package/include/numkong/each/skylake.h +1562 -0
- package/include/numkong/each.h +2146 -0
- package/include/numkong/each.hpp +434 -0
- package/include/numkong/geospatial/README.md +147 -0
- package/include/numkong/geospatial/haswell.h +593 -0
- package/include/numkong/geospatial/neon.h +571 -0
- package/include/numkong/geospatial/rvv.h +701 -0
- package/include/numkong/geospatial/serial.h +309 -0
- package/include/numkong/geospatial/skylake.h +577 -0
- package/include/numkong/geospatial/v128relaxed.h +613 -0
- package/include/numkong/geospatial.h +453 -0
- package/include/numkong/geospatial.hpp +235 -0
- package/include/numkong/matrix.hpp +336 -0
- package/include/numkong/maxsim/README.md +187 -0
- package/include/numkong/maxsim/alder.h +511 -0
- package/include/numkong/maxsim/genoa.h +115 -0
- package/include/numkong/maxsim/haswell.h +553 -0
- package/include/numkong/maxsim/icelake.h +480 -0
- package/include/numkong/maxsim/neonsdot.h +394 -0
- package/include/numkong/maxsim/sapphireamx.h +877 -0
- package/include/numkong/maxsim/serial.h +490 -0
- package/include/numkong/maxsim/sme.h +929 -0
- package/include/numkong/maxsim/v128relaxed.h +280 -0
- package/include/numkong/maxsim.h +571 -0
- package/include/numkong/maxsim.hpp +133 -0
- package/include/numkong/mesh/README.md +227 -0
- package/include/numkong/mesh/haswell.h +2235 -0
- package/include/numkong/mesh/neon.h +1329 -0
- package/include/numkong/mesh/neonbfdot.h +842 -0
- package/include/numkong/mesh/neonhalf.h +616 -0
- package/include/numkong/mesh/rvv.h +916 -0
- package/include/numkong/mesh/serial.h +742 -0
- package/include/numkong/mesh/skylake.h +1135 -0
- package/include/numkong/mesh/v128relaxed.h +1052 -0
- package/include/numkong/mesh.h +652 -0
- package/include/numkong/mesh.hpp +762 -0
- package/include/numkong/numkong.h +78 -0
- package/include/numkong/numkong.hpp +57 -0
- package/include/numkong/probability/README.md +173 -0
- package/include/numkong/probability/haswell.h +267 -0
- package/include/numkong/probability/neon.h +225 -0
- package/include/numkong/probability/rvv.h +409 -0
- package/include/numkong/probability/serial.h +169 -0
- package/include/numkong/probability/skylake.h +324 -0
- package/include/numkong/probability.h +383 -0
- package/include/numkong/probability.hpp +120 -0
- package/include/numkong/random.h +50 -0
- package/include/numkong/random.hpp +285 -0
- package/include/numkong/reduce/README.md +547 -0
- package/include/numkong/reduce/alder.h +632 -0
- package/include/numkong/reduce/genoa.h +201 -0
- package/include/numkong/reduce/haswell.h +3783 -0
- package/include/numkong/reduce/icelake.h +549 -0
- package/include/numkong/reduce/neon.h +3841 -0
- package/include/numkong/reduce/neonbfdot.h +353 -0
- package/include/numkong/reduce/neonfhm.h +665 -0
- package/include/numkong/reduce/neonhalf.h +157 -0
- package/include/numkong/reduce/neonsdot.h +357 -0
- package/include/numkong/reduce/rvv.h +3407 -0
- package/include/numkong/reduce/serial.h +757 -0
- package/include/numkong/reduce/sierra.h +338 -0
- package/include/numkong/reduce/skylake.h +3792 -0
- package/include/numkong/reduce/v128relaxed.h +2302 -0
- package/include/numkong/reduce.h +1597 -0
- package/include/numkong/reduce.hpp +633 -0
- package/include/numkong/scalar/README.md +89 -0
- package/include/numkong/scalar/haswell.h +113 -0
- package/include/numkong/scalar/neon.h +122 -0
- package/include/numkong/scalar/neonhalf.h +70 -0
- package/include/numkong/scalar/rvv.h +211 -0
- package/include/numkong/scalar/sapphire.h +63 -0
- package/include/numkong/scalar/serial.h +332 -0
- package/include/numkong/scalar/v128relaxed.h +56 -0
- package/include/numkong/scalar.h +683 -0
- package/include/numkong/set/README.md +179 -0
- package/include/numkong/set/haswell.h +334 -0
- package/include/numkong/set/icelake.h +485 -0
- package/include/numkong/set/neon.h +364 -0
- package/include/numkong/set/rvv.h +226 -0
- package/include/numkong/set/rvvbb.h +117 -0
- package/include/numkong/set/serial.h +174 -0
- package/include/numkong/set/sve.h +185 -0
- package/include/numkong/set/v128relaxed.h +240 -0
- package/include/numkong/set.h +457 -0
- package/include/numkong/set.hpp +114 -0
- package/include/numkong/sets/README.md +149 -0
- package/include/numkong/sets/haswell.h +63 -0
- package/include/numkong/sets/icelake.h +66 -0
- package/include/numkong/sets/neon.h +61 -0
- package/include/numkong/sets/serial.h +43 -0
- package/include/numkong/sets/smebi32.h +1099 -0
- package/include/numkong/sets/v128relaxed.h +58 -0
- package/include/numkong/sets.h +339 -0
- package/include/numkong/sparse/README.md +156 -0
- package/include/numkong/sparse/icelake.h +463 -0
- package/include/numkong/sparse/neon.h +288 -0
- package/include/numkong/sparse/serial.h +117 -0
- package/include/numkong/sparse/sve2.h +507 -0
- package/include/numkong/sparse/turin.h +322 -0
- package/include/numkong/sparse.h +363 -0
- package/include/numkong/sparse.hpp +113 -0
- package/include/numkong/spatial/README.md +435 -0
- package/include/numkong/spatial/alder.h +607 -0
- package/include/numkong/spatial/genoa.h +290 -0
- package/include/numkong/spatial/haswell.h +960 -0
- package/include/numkong/spatial/icelake.h +586 -0
- package/include/numkong/spatial/neon.h +773 -0
- package/include/numkong/spatial/neonbfdot.h +165 -0
- package/include/numkong/spatial/neonhalf.h +118 -0
- package/include/numkong/spatial/neonsdot.h +261 -0
- package/include/numkong/spatial/rvv.h +984 -0
- package/include/numkong/spatial/rvvbf16.h +123 -0
- package/include/numkong/spatial/rvvhalf.h +117 -0
- package/include/numkong/spatial/sapphire.h +343 -0
- package/include/numkong/spatial/serial.h +346 -0
- package/include/numkong/spatial/sierra.h +323 -0
- package/include/numkong/spatial/skylake.h +606 -0
- package/include/numkong/spatial/sve.h +224 -0
- package/include/numkong/spatial/svebfdot.h +122 -0
- package/include/numkong/spatial/svehalf.h +109 -0
- package/include/numkong/spatial/v128relaxed.h +717 -0
- package/include/numkong/spatial.h +1425 -0
- package/include/numkong/spatial.hpp +183 -0
- package/include/numkong/spatials/README.md +580 -0
- package/include/numkong/spatials/alder.h +94 -0
- package/include/numkong/spatials/genoa.h +94 -0
- package/include/numkong/spatials/haswell.h +219 -0
- package/include/numkong/spatials/icelake.h +113 -0
- package/include/numkong/spatials/neon.h +109 -0
- package/include/numkong/spatials/neonbfdot.h +60 -0
- package/include/numkong/spatials/neonfhm.h +92 -0
- package/include/numkong/spatials/neonhalf.h +58 -0
- package/include/numkong/spatials/neonsdot.h +109 -0
- package/include/numkong/spatials/rvv.h +1960 -0
- package/include/numkong/spatials/sapphireamx.h +1149 -0
- package/include/numkong/spatials/serial.h +226 -0
- package/include/numkong/spatials/sierra.h +96 -0
- package/include/numkong/spatials/skylake.h +184 -0
- package/include/numkong/spatials/sme.h +1901 -0
- package/include/numkong/spatials/smef64.h +465 -0
- package/include/numkong/spatials/v128relaxed.h +240 -0
- package/include/numkong/spatials.h +3021 -0
- package/include/numkong/spatials.hpp +508 -0
- package/include/numkong/tensor.hpp +1592 -0
- package/include/numkong/trigonometry/README.md +184 -0
- package/include/numkong/trigonometry/haswell.h +652 -0
- package/include/numkong/trigonometry/neon.h +639 -0
- package/include/numkong/trigonometry/rvv.h +699 -0
- package/include/numkong/trigonometry/serial.h +703 -0
- package/include/numkong/trigonometry/skylake.h +721 -0
- package/include/numkong/trigonometry/v128relaxed.h +666 -0
- package/include/numkong/trigonometry.h +467 -0
- package/include/numkong/trigonometry.hpp +166 -0
- package/include/numkong/types.h +1384 -0
- package/include/numkong/types.hpp +5603 -0
- package/include/numkong/vector.hpp +698 -0
- package/javascript/README.md +246 -0
- package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
- package/javascript/dist/cjs/numkong-wasm.js +617 -0
- package/javascript/dist/cjs/numkong.d.ts +343 -0
- package/javascript/dist/cjs/numkong.js +523 -0
- package/javascript/dist/cjs/package.json +3 -0
- package/javascript/dist/cjs/types.d.ts +284 -0
- package/javascript/dist/cjs/types.js +653 -0
- package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
- package/javascript/dist/esm/numkong-wasm.js +595 -0
- package/javascript/dist/esm/numkong.d.ts +343 -0
- package/javascript/dist/esm/numkong.js +452 -0
- package/javascript/dist/esm/package.json +3 -0
- package/javascript/dist/esm/types.d.ts +284 -0
- package/javascript/dist/esm/types.js +630 -0
- package/javascript/dist-package-cjs.json +3 -0
- package/javascript/dist-package-esm.json +3 -0
- package/javascript/node-gyp-build.d.ts +1 -0
- package/javascript/numkong-wasm.ts +756 -0
- package/javascript/numkong.c +689 -0
- package/javascript/numkong.ts +575 -0
- package/javascript/tsconfig-base.json +39 -0
- package/javascript/tsconfig-cjs.json +8 -0
- package/javascript/tsconfig-esm.json +8 -0
- package/javascript/types.ts +674 -0
- package/package.json +87 -0
|
@@ -0,0 +1,580 @@
|
|
|
1
|
+
# Batched Distance Matrices in NumKong
|
|
2
|
+
|
|
3
|
+
NumKong implements batched distance matrix computation via pre-packed dot products plus normalization. Angular distance and Euclidean distance are computed from the packed dot product output without materializing an intermediate C matrix.
|
|
4
|
+
|
|
5
|
+
Angular distance from pre-packed dot products:
|
|
6
|
+
|
|
7
|
+
```math
|
|
8
|
+
D_{ij} = 1 - \frac{C_{ij}}{\sqrt{\|A_i\|^2 \cdot \|B_j\|^2}}
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Euclidean distance from pre-packed dot products:
|
|
12
|
+
|
|
13
|
+
```math
|
|
14
|
+
D_{ij} = \sqrt{\|A_i\|^2 + \|B_j\|^2 - 2 C_{ij}}
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
Reformulating as Python pseudocode:
|
|
18
|
+
|
|
19
|
+
```python
|
|
20
|
+
import numpy as np
|
|
21
|
+
|
|
22
|
+
def angulars_packed(a: np.ndarray, b: np.ndarray) -> np.ndarray:
|
|
23
|
+
    dots = a @ b.T
|
|
24
|
+
    a_norms = np.sum(a ** 2, axis=1, keepdims=True)
|
|
25
|
+
    b_norms = np.sum(b ** 2, axis=1, keepdims=True)
|
|
26
|
+
    return 1 - dots / np.sqrt(a_norms * b_norms.T)
|
|
27
|
+
|
|
28
|
+
def euclideans_packed(a: np.ndarray, b: np.ndarray) -> np.ndarray:
|
|
29
|
+
    dots = a @ b.T
|
|
30
|
+
    a_norms = np.sum(a ** 2, axis=1, keepdims=True)
|
|
31
|
+
    b_norms = np.sum(b ** 2, axis=1, keepdims=True)
|
|
32
|
+
    return np.sqrt(np.maximum(a_norms + b_norms.T - 2 * dots, 0))
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Input & Output Types
|
|
36
|
+
|
|
37
|
+
| Input Type | Output Type | Description |
|
|
38
|
+
| ---------- | ----------- | ---------------------------------------------- |
|
|
39
|
+
| `f64` | `f64` | 64-bit IEEE 754 double precision |
|
|
40
|
+
| `f32` | `f32` | 32-bit IEEE 754 single precision |
|
|
41
|
+
| `f16` | `f32` | 16-bit IEEE 754 half precision, widened output |
|
|
42
|
+
| `bf16` | `f32` | 16-bit brain float, widened output |
|
|
43
|
+
| `e4m3` | `f32` | 8-bit Float8: 4 exponent, 3 mantissa bits |
|
|
44
|
+
| `e5m2` | `f32` | 8-bit Float8: 5 exponent, 2 mantissa bits |
|
|
45
|
+
| `e2m3` | `f32` | 8-bit MX format: 2 exponent, 3 mantissa bits |
|
|
46
|
+
| `e3m2` | `f32` | 8-bit MX format: 3 exponent, 2 mantissa bits |
|
|
47
|
+
| `i8` | `f32` | 8-bit signed integers, float output |
|
|
48
|
+
| `u8` | `f32` | 8-bit unsigned integers, float output |
|
|
49
|
+
| `i4` | `f32` | 4-bit signed integers, float output |
|
|
50
|
+
| `u4` | `f32` | 4-bit unsigned integers, float output |
|
|
51
|
+
|
|
52
|
+
## Optimizations
|
|
53
|
+
|
|
54
|
+
### Distance-from-Dot Algebraic Reduction
|
|
55
|
+
|
|
56
|
+
`nk_angulars_packed_f32_haswell`, `nk_angulars_packed_f32_skylake`, `nk_euclideans_packed_f32_haswell`, `nk_euclideans_packed_f32_skylake` derive distance matrices from pre-packed dot product output without materializing an intermediate result matrix.
|
|
57
|
+
Angular distance rewrites as $1 - \text{dot}(a,b) \cdot \text{rsqrt}(\|a\|^2 \cdot \|b\|^2)$, converting two separate square roots and a division into one rsqrt and one multiply.
|
|
58
|
+
Euclidean distance expands the identity $\|a - b\|^2 = \|a\|^2 + \|b\|^2 - 2 \cdot \text{dot}(a,b)$, requiring only one final sqrt per output element.
|
|
59
|
+
Both formulas decompose into: (1) a batched GEMM for all M×N dot products, (2) per-vector squared norms precomputed once during packing.
|
|
60
|
+
The singular `spatial/` kernels compute these three sums ($\sum a_i b_i$, $\sum a_i^2$, $\sum b_i^2$) in a single pass with three interleaved accumulators; the batched `spatials/` kernels separate them — norms are computed once per vector during packing, and dots come from the GEMM — trading register pressure for amortized cost across the full M×N output.
|
|
61
|
+
|
|
62
|
+
### Serial vs Vectorized Sqrt and Rsqrt Cost
|
|
63
|
+
|
|
64
|
+
`nk_angular_through_f32_from_dot_serial_` uses the Quake 3 fast inverse square root (magic constant `0x5F375A86`, three Newton-Raphson iterations, ~34.9 correct bits for Float32) to compute `dot * rsqrt(query_norm * target_norm)`.
|
|
65
|
+
`nk_angular_through_f32_from_dot_haswell_` replaces this with hardware `_mm_rsqrt_ps` (~12-bit approximation, 5cy latency, 1/cy on port 0) plus one Newton-Raphson refinement step (~22–24 correct bits).
|
|
66
|
+
`nk_euclidean_through_f32_from_dot_serial_` computes `sqrt(x)` as `x * rsqrt(x)` — reusing the same rsqrt path.
|
|
67
|
+
`nk_euclidean_through_f32_from_dot_haswell_` uses exact `_mm_sqrt_ps` (11cy latency, 7cy throughput for XMM) instead of the rsqrt approximation — the subtraction $\|a\|^2 + \|b\|^2 - 2 \cdot \text{dot}$ can produce values near zero where rsqrt error would be amplified by the subsequent multiply.
|
|
68
|
+
For Float64, all backends use exact division and sqrt — no fast rsqrt approximation, since reaching 52 mantissa bits of precision would need 4+ Newton-Raphson iterations, negating the speed advantage.
|
|
69
|
+
The 4-wide finalizer batching amortizes these costs: one rsqrt or sqrt call processes 4 output elements simultaneously, hiding the latency behind the GEMM tile's computation.
|
|
70
|
+
|
|
71
|
+
### Norm Precomputation in Packed Buffers
|
|
72
|
+
|
|
73
|
+
`nk_dots_pack_f32_serial`, `nk_dots_pack_f32_haswell`, `nk_dots_pack_bf16_haswell` compute per-column squared norms $\|b_j\|^2 = \sum_k b_{jk}^2 = \text{dot}(b_j, b_j)$ during the packing step via `nk_reduce_moments_*` primitives.
|
|
74
|
+
The squared norm is a self-dot-product — already a byproduct of touching every element for type conversion and layout transformation.
|
|
75
|
+
Angular and Euclidean finalizers read norms from packed buffer metadata, eliminating a separate O(N·K) norm pass over B.
|
|
76
|
+
|
|
77
|
+
## Performance
|
|
78
|
+
|
|
79
|
+
The following performance tables are produced by manually re-running the bundled internal tools `nk_test` and `nk_bench` to measure both accuracy and throughput at different input shapes.
|
|
80
|
+
The input size is controlled by `NK_MATRIX_HEIGHT`, `NK_MATRIX_WIDTH`, and `NK_MATRIX_DEPTH` environment variables, all set to the same value for batched distance computations over square matrices.
|
|
81
|
+
Columns show throughput and accuracy for the 256³, 1024³, and 4096³ configurations.
|
|
82
|
+
Throughput is measured in GSO/s (Giga Scalar Operations per Second), counting $\text{ops} = 2 \cdot M \cdot N \cdot K$ scalar operations for computing $M \times N$ pairwise distances over $K$-dimensional vectors.
|
|
83
|
+
Accuracy is reported as mean ULP (units in last place) unless noted otherwise — the average number of representable floating-point values between the result and the exact answer.
|
|
84
|
+
Each kernel runs for at least 20 seconds per configuration.
|
|
85
|
+
Benchmark threads are pinned to specific cores; on machines with heterogeneous core types (e.g., Apple P/E cores), only the fastest cores are used.
|
|
86
|
+
Workloads that significantly degrade CPU frequencies (Intel AMX, Apple SME) run in separate passes to avoid affecting throughput measurements of other kernels.
|
|
87
|
+
|
|
88
|
+
### Intel Sapphire Rapids
|
|
89
|
+
|
|
90
|
+
#### Native
|
|
91
|
+
|
|
92
|
+
| Kernel | 256³ | 1024³ | 4096³ |
|
|
93
|
+
| :----------------------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
94
|
+
| __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
95
|
+
| `nk_angulars_packed_f64_serial` | 0.578 gso/s, 0 ulp | 0.691 gso/s, 0 ulp | 0.787 gso/s, 0 ulp |
|
|
96
|
+
| `nk_angulars_symmetric_f64_serial` | 0.477 gso/s, 0 ulp | 0.569 gso/s, 0 ulp | 1.24 gso/s, 0 ulp |
|
|
97
|
+
| `nk_euclideans_packed_f64_serial` | 0.569 gso/s, 0.6 ulp | 0.692 gso/s, 0.6 ulp | 0.775 gso/s, 0.6 ulp |
|
|
98
|
+
| `nk_euclideans_symmetric_f64_serial` | 0.477 gso/s, 0.6 ulp | 0.562 gso/s, 0.6 ulp | 1.26 gso/s, 0.3 ulp |
|
|
99
|
+
| `nk_angulars_packed_f64_haswell` | 5.89 gso/s, 0 ulp | 6.04 gso/s, 0 ulp | 6.08 gso/s, 0 ulp |
|
|
100
|
+
| `nk_angulars_symmetric_f64_haswell` | 5.17 gso/s, 0 ulp | 5.56 gso/s, 0 ulp | 11.3 gso/s, 0 ulp |
|
|
101
|
+
| `nk_euclideans_packed_f64_haswell` | 5.83 gso/s, 0.2 ulp | 6.21 gso/s, 0.2 ulp | 6.24 gso/s, 0.2 ulp |
|
|
102
|
+
| `nk_euclideans_symmetric_f64_haswell` | 5.33 gso/s, 0.2 ulp | 5.62 gso/s, 0.2 ulp | 11.7 gso/s, 0.2 ulp |
|
|
103
|
+
| `nk_angulars_packed_f64_skylake` | 7.56 gso/s, 0 ulp | 8.46 gso/s, 0 ulp | 8.92 gso/s, 0 ulp |
|
|
104
|
+
| `nk_angulars_symmetric_f64_skylake` | 7.37 gso/s, 0 ulp | 8.66 gso/s, 0 ulp | 17.1 gso/s, 0 ulp |
|
|
105
|
+
| `nk_euclideans_packed_f64_skylake` | 8.06 gso/s, 0.2 ulp | 8.37 gso/s, 0.2 ulp | 8.06 gso/s, 0.2 ulp |
|
|
106
|
+
| `nk_euclideans_symmetric_f64_skylake` | 7.14 gso/s, 0.2 ulp | 8.43 gso/s, 0.2 ulp | 17.4 gso/s, 0.2 ulp |
|
|
107
|
+
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
108
|
+
| `nk_angulars_packed_f32_serial` | 15.0 gso/s, 0.1 ulp | 16.3 gso/s, 0.1 ulp | 16.4 gso/s, 0.1 ulp |
|
|
109
|
+
| `nk_angulars_symmetric_f32_serial` | 3.86 gso/s, 0.1 ulp | 4.29 gso/s, 0.1 ulp | 8.62 gso/s, 0.1 ulp |
|
|
110
|
+
| `nk_euclideans_packed_f32_serial` | 15.3 gso/s, 0.6 ulp | 17.0 gso/s, 0.5 ulp | 17.0 gso/s, 0.5 ulp |
|
|
111
|
+
| `nk_euclideans_symmetric_f32_serial` | 3.97 gso/s, 0.6 ulp | 4.16 gso/s, 0.5 ulp | 8.38 gso/s, 0.3 ulp |
|
|
112
|
+
| `nk_angulars_packed_f32_haswell` | 29.3 gso/s, 0 ulp | 31.6 gso/s, 0 ulp | 31.6 gso/s, 0 ulp |
|
|
113
|
+
| `nk_angulars_symmetric_f32_haswell` | 21.4 gso/s, 0 ulp | 24.8 gso/s, 0 ulp | 52 gso/s, 0 ulp |
|
|
114
|
+
| `nk_euclideans_packed_f32_haswell` | 29.7 gso/s, 0.2 ulp | 32 gso/s, 0.2 ulp | 32.9 gso/s, 0.2 ulp |
|
|
115
|
+
| `nk_euclideans_symmetric_f32_haswell` | 21.8 gso/s, 0.2 ulp | 25.7 gso/s, 0.2 ulp | 53 gso/s, 0.2 ulp |
|
|
116
|
+
| `nk_angulars_packed_f32_skylake` | 33.3 gso/s, 0 ulp | 39.4 gso/s, 0 ulp | 37.5 gso/s, 0 ulp |
|
|
117
|
+
| `nk_angulars_symmetric_f32_skylake` | 24.8 gso/s, 0 ulp | 25.5 gso/s, 0 ulp | 61.4 gso/s, 0 ulp |
|
|
118
|
+
| `nk_euclideans_packed_f32_skylake` | 34.4 gso/s, 0.2 ulp | 40.3 gso/s, 0.2 ulp | 40.3 gso/s, 0.2 ulp |
|
|
119
|
+
| `nk_euclideans_symmetric_f32_skylake` | 25.1 gso/s, 0.2 ulp | 29.3 gso/s, 0.2 ulp | 65.9 gso/s, 0.2 ulp |
|
|
120
|
+
| __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
121
|
+
| `nk_angulars_packed_bf16_serial` | 1.18 gso/s, 0 ulp | 1.21 gso/s, 0 ulp | 1.19 gso/s, 0.1 ulp |
|
|
122
|
+
| `nk_angulars_symmetric_bf16_serial` | 1.19 gso/s, 0 ulp | 1.18 gso/s, 0 ulp | 2.35 gso/s, 0 ulp |
|
|
123
|
+
| `nk_euclideans_packed_bf16_serial` | 1.20 gso/s, 0.6 ulp | 1.18 gso/s, 0.6 ulp | 1.16 gso/s, 6.0 ulp |
|
|
124
|
+
| `nk_euclideans_symmetric_bf16_serial` | 1.11 gso/s, 0.6 ulp | 1.14 gso/s, 0.6 ulp | 2.34 gso/s, 0.4 ulp |
|
|
125
|
+
| `nk_angulars_packed_bf16_haswell` | 54.6 gso/s, 0 ulp | 65.7 gso/s, 0 ulp | 66.1 gso/s, 0.1 ulp |
|
|
126
|
+
| `nk_angulars_symmetric_bf16_haswell` | 38.3 gso/s, 0 ulp | 50.1 gso/s, 0 ulp | 106 gso/s, 0 ulp |
|
|
127
|
+
| `nk_euclideans_packed_bf16_haswell` | 58 gso/s, 0.2 ulp | 65.7 gso/s, 0.3 ulp | 70.7 gso/s, 5.8 ulp |
|
|
128
|
+
| `nk_euclideans_symmetric_bf16_haswell` | 38.6 gso/s, 0.2 ulp | 49.8 gso/s, 0.3 ulp | 109 gso/s, 0.3 ulp |
|
|
129
|
+
| `nk_angulars_packed_bf16_skylake` | 67.8 gso/s, 0 ulp | 87.7 gso/s, 0 ulp | 86.4 gso/s, 0.1 ulp |
|
|
130
|
+
| `nk_angulars_symmetric_bf16_skylake` | 48.8 gso/s, 0 ulp | 58.7 gso/s, 0 ulp | 125 gso/s, 0 ulp |
|
|
131
|
+
| `nk_euclideans_packed_bf16_skylake` | 64 gso/s, 0.2 ulp | 87.4 gso/s, 0.3 ulp | 90.8 gso/s, 5.8 ulp |
|
|
132
|
+
| `nk_euclideans_symmetric_bf16_skylake` | 48.8 gso/s, 0.2 ulp | 58.9 gso/s, 0.3 ulp | 121 gso/s, 0.3 ulp |
|
|
133
|
+
| `nk_angulars_packed_bf16_genoa` | 59.7 gso/s, 0 ulp | 81.9 gso/s, 0 ulp | 87.2 gso/s, 0 ulp |
|
|
134
|
+
| `nk_angulars_symmetric_bf16_genoa` | 54.9 gso/s, 0 ulp | 61.2 gso/s, 0 ulp | 137 gso/s, 0 ulp |
|
|
135
|
+
| `nk_euclideans_packed_bf16_genoa` | 63 gso/s, 0.2 ulp | 79.6 gso/s, 0.3 ulp | 87.3 gso/s, 0.3 ulp |
|
|
136
|
+
| `nk_euclideans_symmetric_bf16_genoa` | 53.4 gso/s, 0.2 ulp | 60.2 gso/s, 0.3 ulp | 130 gso/s, 0.3 ulp |
|
|
137
|
+
| `nk_angulars_packed_bf16_sapphireamx` | 287 gso/s, 0 ulp | 364 gso/s, 0 ulp | 582 gso/s, 0 ulp |
|
|
138
|
+
| `nk_angulars_symmetric_bf16_sapphireamx` | 75.7 gso/s, 0 ulp | 114 gso/s, 0 ulp | 116 gso/s, 0 ulp |
|
|
139
|
+
| `nk_euclideans_packed_bf16_sapphireamx` | 328 gso/s, 0.3 ulp | 573 gso/s, 0.3 ulp | 632 gso/s, 0.3 ulp |
|
|
140
|
+
| `nk_euclideans_symmetric_bf16_sapphireamx` | 76.3 gso/s, 0.3 ulp | 115 gso/s, 0.3 ulp | 123 gso/s, 0.3 ulp |
|
|
141
|
+
| __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
142
|
+
| `nk_angulars_packed_f16_serial` | 7.46 gso/s, 0.1 ulp | 7.97 gso/s, 0.1 ulp | 8.12 gso/s, 0.1 ulp |
|
|
143
|
+
| `nk_angulars_symmetric_f16_serial` | 4.04 gso/s, 0.1 ulp | 4.09 gso/s, 0.1 ulp | 8.13 gso/s, 0.1 ulp |
|
|
144
|
+
| `nk_euclideans_packed_f16_serial` | 7.69 gso/s, 0.7 ulp | 7.73 gso/s, 1.1 ulp | 8.34 gso/s, 0.6 ulp |
|
|
145
|
+
| `nk_euclideans_symmetric_f16_serial` | 4.08 gso/s, 0.7 ulp | 4.19 gso/s, 1.1 ulp | 8.23 gso/s, 0.5 ulp |
|
|
146
|
+
| `nk_angulars_packed_f16_haswell` | 62 gso/s, 0.1 ulp | 74.4 gso/s, 0.1 ulp | 70.6 gso/s, 0.1 ulp |
|
|
147
|
+
| `nk_angulars_symmetric_f16_haswell` | 38.3 gso/s, 0.1 ulp | 54.9 gso/s, 0.1 ulp | 121 gso/s, 0.1 ulp |
|
|
148
|
+
| `nk_euclideans_packed_f16_haswell` | 62.9 gso/s, 0.4 ulp | 75.2 gso/s, 0.9 ulp | 75.7 gso/s, 0.5 ulp |
|
|
149
|
+
| `nk_euclideans_symmetric_f16_haswell` | 39.6 gso/s, 0.4 ulp | 54.2 gso/s, 0.9 ulp | 123 gso/s, 0.3 ulp |
|
|
150
|
+
| `nk_angulars_packed_f16_skylake` | 66.6 gso/s, 0.1 ulp | 85.2 gso/s, 0.1 ulp | 88.3 gso/s, 0 ulp |
|
|
151
|
+
| `nk_angulars_symmetric_f16_skylake` | 50.1 gso/s, 0.1 ulp | 57.7 gso/s, 0.1 ulp | 126 gso/s, 0 ulp |
|
|
152
|
+
| `nk_euclideans_packed_f16_skylake` | 69.6 gso/s, 0.4 ulp | 93.3 gso/s, 0.9 ulp | 91 gso/s, 0.5 ulp |
|
|
153
|
+
| `nk_euclideans_symmetric_f16_skylake` | 49.4 gso/s, 0.4 ulp | 59.8 gso/s, 0.9 ulp | 134 gso/s, 0.3 ulp |
|
|
154
|
+
| __e5m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
155
|
+
| `nk_angulars_packed_e5m2_serial` | 0.587 gso/s, 0 ulp | 0.553 gso/s, 0 ulp | 0.563 gso/s, 0 ulp |
|
|
156
|
+
| `nk_angulars_symmetric_e5m2_serial` | 0.446 gso/s, 0 ulp | 0.427 gso/s, 0 ulp | 0.847 gso/s, 0 ulp |
|
|
157
|
+
| `nk_euclideans_packed_e5m2_serial` | 0.576 gso/s, 0.5 ulp | 0.571 gso/s, 0.5 ulp | 0.557 gso/s, 0.2 ulp |
|
|
158
|
+
| `nk_euclideans_symmetric_e5m2_serial` | 0.424 gso/s, 0.5 ulp | 0.437 gso/s, 0.5 ulp | 0.836 gso/s, 0.2 ulp |
|
|
159
|
+
| `nk_angulars_packed_e5m2_haswell` | 27.4 gso/s, 0 ulp | 30.4 gso/s, 0 ulp | 31 gso/s, 0 ulp |
|
|
160
|
+
| `nk_angulars_symmetric_e5m2_haswell` | 15.3 gso/s, 0 ulp | 15.7 gso/s, 0 ulp | 32.3 gso/s, 0 ulp |
|
|
161
|
+
| `nk_euclideans_packed_e5m2_haswell` | 28 gso/s, 0 ulp | 30.8 gso/s, 0 ulp | 30.6 gso/s, 0 ulp |
|
|
162
|
+
| `nk_euclideans_symmetric_e5m2_haswell` | 15.4 gso/s, 0 ulp | 15.9 gso/s, 0 ulp | 32 gso/s, 0 ulp |
|
|
163
|
+
| `nk_angulars_packed_e5m2_skylake` | 32.9 gso/s, 0 ulp | 36.7 gso/s, 0 ulp | 40.1 gso/s, 0 ulp |
|
|
164
|
+
| `nk_angulars_symmetric_e5m2_skylake` | 19 gso/s, 0 ulp | 21 gso/s, 0 ulp | 42.7 gso/s, 0 ulp |
|
|
165
|
+
| `nk_euclideans_packed_e5m2_skylake` | 34.1 gso/s, 0 ulp | 37.9 gso/s, 0 ulp | 39.6 gso/s, 0 ulp |
|
|
166
|
+
| `nk_euclideans_symmetric_e5m2_skylake` | 20 gso/s, 0 ulp | 18.4 gso/s, 0 ulp | 41.6 gso/s, 0 ulp |
|
|
167
|
+
| `nk_angulars_packed_e5m2_genoa` | 39.6 gso/s, 0 ulp | 46.8 gso/s, 0 ulp | 47.5 gso/s, 0 ulp |
|
|
168
|
+
| `nk_angulars_symmetric_e5m2_genoa` | 30 gso/s, 0 ulp | 32.5 gso/s, 0 ulp | 66.3 gso/s, 0 ulp |
|
|
169
|
+
| `nk_euclideans_packed_e5m2_genoa` | 42.3 gso/s, 0 ulp | 49.1 gso/s, 0 ulp | 51.3 gso/s, 0 ulp |
|
|
170
|
+
| `nk_euclideans_symmetric_e5m2_genoa` | 30.1 gso/s, 0 ulp | 32.8 gso/s, 0 ulp | 64.9 gso/s, 0 ulp |
|
|
171
|
+
| `nk_angulars_packed_e5m2_sapphireamx` | 216 gso/s, 0 ulp | 355 gso/s, 0 ulp | 427 gso/s, 0 ulp |
|
|
172
|
+
| `nk_angulars_symmetric_e5m2_sapphireamx` | 48.7 gso/s, 0 ulp | 73.3 gso/s, 0 ulp | 72.3 gso/s, 0 ulp |
|
|
173
|
+
| `nk_euclideans_packed_e5m2_sapphireamx` | 220 gso/s, 0 ulp | 375 gso/s, 0 ulp | 408 gso/s, 0 ulp |
|
|
174
|
+
| `nk_euclideans_symmetric_e5m2_sapphireamx` | 48.3 gso/s, 0 ulp | 73.3 gso/s, 0 ulp | 74 gso/s, 0 ulp |
|
|
175
|
+
| __e4m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
176
|
+
| `nk_angulars_packed_e4m3_serial` | 0.479 gso/s, 0 ulp | 0.473 gso/s, 0 ulp | 0.485 gso/s, 0 ulp |
|
|
177
|
+
| `nk_angulars_symmetric_e4m3_serial` | 0.395 gso/s, 0 ulp | 0.390 gso/s, 0 ulp | 0.795 gso/s, 0 ulp |
|
|
178
|
+
| `nk_euclideans_packed_e4m3_serial` | 0.467 gso/s, 0.5 ulp | 0.484 gso/s, 0.5 ulp | 0.480 gso/s, 0.5 ulp |
|
|
179
|
+
| `nk_euclideans_symmetric_e4m3_serial` | 0.395 gso/s, 0.5 ulp | 0.395 gso/s, 0.5 ulp | 0.781 gso/s, 0.3 ulp |
|
|
180
|
+
| `nk_angulars_packed_e4m3_haswell` | 20.6 gso/s, 0 ulp | 22.5 gso/s, 0 ulp | 21.8 gso/s, 0 ulp |
|
|
181
|
+
| `nk_angulars_symmetric_e4m3_haswell` | 12.2 gso/s, 0 ulp | 12.1 gso/s, 0 ulp | 24.7 gso/s, 0 ulp |
|
|
182
|
+
| `nk_euclideans_packed_e4m3_haswell` | 20.7 gso/s, 0 ulp | 22.4 gso/s, 0 ulp | 23.4 gso/s, 0.2 ulp |
|
|
183
|
+
| `nk_euclideans_symmetric_e4m3_haswell` | 11.2 gso/s, 0 ulp | 11.9 gso/s, 0 ulp | 24.6 gso/s, 0.1 ulp |
|
|
184
|
+
| `nk_angulars_packed_e4m3_skylake` | 28.8 gso/s, 0 ulp | 32.8 gso/s, 0 ulp | 31.3 gso/s, 0 ulp |
|
|
185
|
+
| `nk_angulars_symmetric_e4m3_skylake` | 16.4 gso/s, 0 ulp | 17.4 gso/s, 0 ulp | 35.1 gso/s, 0 ulp |
|
|
186
|
+
| `nk_euclideans_packed_e4m3_skylake` | 27.8 gso/s, 0 ulp | 31.2 gso/s, 0 ulp | 31.7 gso/s, 0.2 ulp |
|
|
187
|
+
| `nk_euclideans_symmetric_e4m3_skylake` | 16.1 gso/s, 0 ulp | 16.8 gso/s, 0 ulp | 34.4 gso/s, 0.1 ulp |
|
|
188
|
+
| `nk_angulars_packed_e4m3_genoa` | 40.8 gso/s, 0 ulp | 48.4 gso/s, 0 ulp | 52.1 gso/s, 0 ulp |
|
|
189
|
+
| `nk_angulars_symmetric_e4m3_genoa` | 30.3 gso/s, 0 ulp | 31.5 gso/s, 0 ulp | 69.2 gso/s, 0 ulp |
|
|
190
|
+
| `nk_euclideans_packed_e4m3_genoa` | 43.3 gso/s, 0 ulp | 50.9 gso/s, 0 ulp | 48.8 gso/s, 0.1 ulp |
|
|
191
|
+
| `nk_euclideans_symmetric_e4m3_genoa` | 29.9 gso/s, 0 ulp | 31.9 gso/s, 0 ulp | 64.6 gso/s, 0.1 ulp |
|
|
192
|
+
| `nk_angulars_packed_e4m3_sapphireamx` | 212 gso/s, 0 ulp | 325 gso/s, 0 ulp | 418 gso/s, 0 ulp |
|
|
193
|
+
| `nk_angulars_symmetric_e4m3_sapphireamx` | 50.5 gso/s, 0 ulp | 73.4 gso/s, 0 ulp | 72 gso/s, 0 ulp |
|
|
194
|
+
| `nk_euclideans_packed_e4m3_sapphireamx` | 216 gso/s, 0.1 ulp | 372 gso/s, 0.1 ulp | 394 gso/s, 0.1 ulp |
|
|
195
|
+
| `nk_euclideans_symmetric_e4m3_sapphireamx` | 49.3 gso/s, 0.1 ulp | 70.1 gso/s, 0.1 ulp | 73.1 gso/s, 0.1 ulp |
|
|
196
|
+
| __e3m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
197
|
+
| `nk_angulars_packed_e3m2_serial` | 0.554 gso/s, 0 ulp | 0.524 gso/s, 0 ulp | 0.534 gso/s, 0 ulp |
|
|
198
|
+
| `nk_angulars_symmetric_e3m2_serial` | 0.439 gso/s, 0 ulp | 0.427 gso/s, 0 ulp | 0.839 gso/s, 0 ulp |
|
|
199
|
+
| `nk_euclideans_packed_e3m2_serial` | 0.556 gso/s, 0.5 ulp | 0.549 gso/s, 0.5 ulp | 0.509 gso/s, 0.2 ulp |
|
|
200
|
+
| `nk_euclideans_symmetric_e3m2_serial` | 0.413 gso/s, 0.5 ulp | 0.427 gso/s, 0.5 ulp | 0.829 gso/s, 0.2 ulp |
|
|
201
|
+
| `nk_angulars_packed_e3m2_haswell` | 30.3 gso/s, 0 ulp | 32.2 gso/s, 0 ulp | 32.8 gso/s, 0 ulp |
|
|
202
|
+
| `nk_angulars_symmetric_e3m2_haswell` | 27.1 gso/s, 0 ulp | 32.8 gso/s, 0 ulp | 65.7 gso/s, 0 ulp |
|
|
203
|
+
| `nk_euclideans_packed_e3m2_haswell` | 30.1 gso/s, 0 ulp | 32.3 gso/s, 0 ulp | 33.5 gso/s, 0 ulp |
|
|
204
|
+
| `nk_euclideans_symmetric_e3m2_haswell` | 28.5 gso/s, 0 ulp | 32.6 gso/s, 0 ulp | 66.1 gso/s, 0 ulp |
|
|
205
|
+
| `nk_angulars_packed_e3m2_skylake` | 37.4 gso/s, 0 ulp | 41.4 gso/s, 0 ulp | 44.1 gso/s, 0 ulp |
|
|
206
|
+
| `nk_angulars_symmetric_e3m2_skylake` | 39 gso/s, 0 ulp | 41.9 gso/s, 0 ulp | 87.3 gso/s, 0 ulp |
|
|
207
|
+
| `nk_euclideans_packed_e3m2_skylake` | 35.7 gso/s, 0 ulp | 41.3 gso/s, 0 ulp | 43 gso/s, 0 ulp |
|
|
208
|
+
| `nk_euclideans_symmetric_e3m2_skylake` | 36.2 gso/s, 0 ulp | 36.4 gso/s, 0 ulp | 87.8 gso/s, 0 ulp |
|
|
209
|
+
| `nk_angulars_packed_e3m2_genoa` | 48 gso/s, 0 ulp | 56 gso/s, 0 ulp | 59.3 gso/s, 0 ulp |
|
|
210
|
+
| `nk_angulars_symmetric_e3m2_genoa` | 40 gso/s, 0 ulp | 40.8 gso/s, 0 ulp | 87.4 gso/s, 0 ulp |
|
|
211
|
+
| `nk_euclideans_packed_e3m2_genoa` | 49.8 gso/s, 0 ulp | 58.4 gso/s, 0 ulp | 61 gso/s, 0 ulp |
|
|
212
|
+
| `nk_euclideans_symmetric_e3m2_genoa` | 38.4 gso/s, 0 ulp | 41.6 gso/s, 0 ulp | 87.7 gso/s, 0 ulp |
|
|
213
|
+
| `nk_angulars_packed_e3m2_sapphireamx` | 238 gso/s, 0 ulp | 420 gso/s, 0 ulp | 431 gso/s, 0 ulp |
|
|
214
|
+
| `nk_angulars_symmetric_e3m2_sapphireamx` | 60.7 gso/s, 0 ulp | 96.5 gso/s, 0 ulp | 90.9 gso/s, 0 ulp |
|
|
215
|
+
| `nk_euclideans_packed_e3m2_sapphireamx` | 224 gso/s, 0 ulp | 426 gso/s, 0 ulp | 443 gso/s, 0 ulp |
|
|
216
|
+
| `nk_euclideans_symmetric_e3m2_sapphireamx` | 60.8 gso/s, 0 ulp | 99.2 gso/s, 0 ulp | 92.6 gso/s, 0 ulp |
|
|
217
|
+
| __e2m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
218
|
+
| `nk_angulars_packed_e2m3_serial` | 0.332 gso/s, 0 ulp | 0.325 gso/s, 0 ulp | 0.320 gso/s, 0 ulp |
|
|
219
|
+
| `nk_angulars_symmetric_e2m3_serial` | 0.298 gso/s, 0 ulp | 0.305 gso/s, 0 ulp | 0.568 gso/s, 0 ulp |
|
|
220
|
+
| `nk_euclideans_packed_e2m3_serial` | 0.324 gso/s, 0.5 ulp | 0.310 gso/s, 0.5 ulp | 0.313 gso/s, 0.2 ulp |
|
|
221
|
+
| `nk_euclideans_symmetric_e2m3_serial` | 0.293 gso/s, 0.5 ulp | 0.295 gso/s, 0.5 ulp | 0.586 gso/s, 0.2 ulp |
|
|
222
|
+
| `nk_angulars_packed_e2m3_haswell` | 54.2 gso/s, 0 ulp | 61 gso/s, 0 ulp | 66.2 gso/s, 0 ulp |
|
|
223
|
+
| `nk_angulars_symmetric_e2m3_haswell` | 48.2 gso/s, 0 ulp | 60 gso/s, 0 ulp | 128 gso/s, 0 ulp |
|
|
224
|
+
| `nk_euclideans_packed_e2m3_haswell` | 55.9 gso/s, 0 ulp | 63.4 gso/s, 0 ulp | 64.8 gso/s, 0 ulp |
|
|
225
|
+
| `nk_euclideans_symmetric_e2m3_haswell` | 48.6 gso/s, 0 ulp | 62.1 gso/s, 0 ulp | 128 gso/s, 0 ulp |
|
|
226
|
+
| `nk_angulars_packed_e2m3_skylake` | 65.1 gso/s, 0 ulp | 79.4 gso/s, 0 ulp | 85.4 gso/s, 0 ulp |
|
|
227
|
+
| `nk_angulars_symmetric_e2m3_skylake` | 61.7 gso/s, 0 ulp | 81.1 gso/s, 0 ulp | 163 gso/s, 0 ulp |
|
|
228
|
+
| `nk_euclideans_packed_e2m3_skylake` | 65.1 gso/s, 0 ulp | 80.4 gso/s, 0 ulp | 80.8 gso/s, 0 ulp |
|
|
229
|
+
| `nk_euclideans_symmetric_e2m3_skylake` | 60.8 gso/s, 0 ulp | 62.3 gso/s, 0 ulp | 167 gso/s, 0 ulp |
|
|
230
|
+
| `nk_angulars_packed_e2m3_genoa` | 47.7 gso/s, 0 ulp | 55.4 gso/s, 0 ulp | 60 gso/s, 0 ulp |
|
|
231
|
+
| `nk_angulars_symmetric_e2m3_genoa` | 36.4 gso/s, 0 ulp | 41.5 gso/s, 0 ulp | 86.7 gso/s, 0 ulp |
|
|
232
|
+
| `nk_euclideans_packed_e2m3_genoa` | 50 gso/s, 0 ulp | 59.1 gso/s, 0 ulp | 58.3 gso/s, 0 ulp |
|
|
233
|
+
| `nk_euclideans_symmetric_e2m3_genoa` | 38 gso/s, 0 ulp | 42.3 gso/s, 0 ulp | 85.1 gso/s, 0 ulp |
|
|
234
|
+
| `nk_angulars_packed_e2m3_sapphireamx` | 350 gso/s, 0 ulp | 956 gso/s, 0 ulp | 1,020 gso/s, 0 ulp |
|
|
235
|
+
| `nk_angulars_symmetric_e2m3_sapphireamx` | 88.4 gso/s, 0 ulp | 203 gso/s, 0 ulp | 188 gso/s, 0 ulp |
|
|
236
|
+
| `nk_euclideans_packed_e2m3_sapphireamx` | 337 gso/s, 0 ulp | 990 gso/s, 0 ulp | 992 gso/s, 0 ulp |
|
|
237
|
+
| `nk_euclideans_symmetric_e2m3_sapphireamx` | 88.7 gso/s, 0 ulp | 193 gso/s, 0 ulp | 201 gso/s, 0 ulp |
|
|
238
|
+
| __i8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
239
|
+
| `nk_angulars_packed_i8_serial` | 8.84 gso/s, 0 ulp | 9.49 gso/s, 0 ulp | 10.1 gso/s, 0 ulp |
|
|
240
|
+
| `nk_angulars_symmetric_i8_serial` | 4.40 gso/s, 0 ulp | 4.45 gso/s, 0 ulp | 9.58 gso/s, ? ulp |
|
|
241
|
+
| `nk_euclideans_packed_i8_serial` | 8.64 gso/s, 0.4 ulp | 9.84 gso/s, 0.4 ulp | 9.94 gso/s, 0.4 ulp |
|
|
242
|
+
| `nk_euclideans_symmetric_i8_serial` | 4.47 gso/s, 0.4 ulp | 4.64 gso/s, 0.4 ulp | 9.15 gso/s, ? ulp |
|
|
243
|
+
| `nk_angulars_packed_i8_haswell` | 79.5 gso/s, 0 ulp | 102 gso/s, 0 ulp | 109 gso/s, 0 ulp |
|
|
244
|
+
| `nk_angulars_symmetric_i8_haswell` | 60.6 gso/s, 0 ulp | 77.4 gso/s, 0 ulp | 168 gso/s, ? ulp |
|
|
245
|
+
| `nk_euclideans_packed_i8_haswell` | 82.5 gso/s, 0 ulp | 102 gso/s, 0 ulp | 109 gso/s, 0 ulp |
|
|
246
|
+
| `nk_euclideans_symmetric_i8_haswell` | 62 gso/s, 0 ulp | 76.5 gso/s, 0 ulp | 166 gso/s, ? ulp |
|
|
247
|
+
| `nk_angulars_packed_i8_icelake` | 155 gso/s, 0 ulp | 206 gso/s, 0 ulp | 402 gso/s, 0 ulp |
|
|
248
|
+
| `nk_angulars_symmetric_i8_icelake` | 103 gso/s, 0 ulp | 263 gso/s, 0 ulp | 690 gso/s, ? ulp |
|
|
249
|
+
| `nk_euclideans_packed_i8_icelake` | 169 gso/s, 0 ulp | 313 gso/s, 0 ulp | 393 gso/s, 0 ulp |
|
|
250
|
+
| `nk_euclideans_symmetric_i8_icelake` | 108 gso/s, 0 ulp | 268 gso/s, 0 ulp | 695 gso/s, ? ulp |
|
|
251
|
+
| `nk_angulars_packed_i8_sapphireamx` | 427 gso/s, 0 ulp | 1,020 gso/s, 0 ulp | 1,170 gso/s, 0 ulp |
|
|
252
|
+
| `nk_angulars_symmetric_i8_sapphireamx` | 106 gso/s, 0 ulp | 261 gso/s, 0 ulp | 210 gso/s, 0 ulp |
|
|
253
|
+
| `nk_euclideans_packed_i8_sapphireamx` | 428 gso/s, 0 ulp | 1,240 gso/s, 0 ulp | 1,170 gso/s, 0 ulp |
|
|
254
|
+
| `nk_euclideans_symmetric_i8_sapphireamx` | 104 gso/s, 0 ulp | 243 gso/s, 0 ulp | 219 gso/s, 0 ulp |
|
|
255
|
+
| __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
256
|
+
| `nk_angulars_packed_u8_serial` | 12.2 gso/s, 0.3 ulp | 12.8 gso/s, 0.3 ulp | 13.0 gso/s, 0.3 ulp |
|
|
257
|
+
| `nk_angulars_symmetric_u8_serial` | 4.48 gso/s, 0.3 ulp | 4.73 gso/s, 0.3 ulp | 9.50 gso/s, ? ulp |
|
|
258
|
+
| `nk_euclideans_packed_u8_serial` | 12.0 gso/s, 0.5 ulp | 13.1 gso/s, 0.5 ulp | 13.4 gso/s, 0.6 ulp |
|
|
259
|
+
| `nk_euclideans_symmetric_u8_serial` | 4.52 gso/s, 0.5 ulp | 4.69 gso/s, 0.5 ulp | 9.65 gso/s, ? ulp |
|
|
260
|
+
| `nk_angulars_packed_u8_haswell` | 54.6 gso/s, 0.3 ulp | 87.8 gso/s, 0.3 ulp | 104 gso/s, 0.3 ulp |
|
|
261
|
+
| `nk_angulars_symmetric_u8_haswell` | 44.6 gso/s, 0.3 ulp | 70.2 gso/s, 0.3 ulp | 161 gso/s, ? ulp |
|
|
262
|
+
| `nk_euclideans_packed_u8_haswell` | 55.5 gso/s, 0.5 ulp | 87.7 gso/s, 0.5 ulp | 105 gso/s, 0.6 ulp |
|
|
263
|
+
| `nk_euclideans_symmetric_u8_haswell` | 45.3 gso/s, 0.5 ulp | 68.4 gso/s, 0.5 ulp | 159 gso/s, ? ulp |
|
|
264
|
+
| `nk_angulars_packed_u8_icelake` | 154 gso/s, 0.3 ulp | 301 gso/s, 0.3 ulp | 404 gso/s, 0.3 ulp |
|
|
265
|
+
| `nk_angulars_symmetric_u8_icelake` | 108 gso/s, 0.3 ulp | 267 gso/s, 0.3 ulp | 699 gso/s, ? ulp |
|
|
266
|
+
| `nk_euclideans_packed_u8_icelake` | 168 gso/s, 0 ulp | 300 gso/s, 0 ulp | 402 gso/s, 0 ulp |
|
|
267
|
+
| `nk_euclideans_symmetric_u8_icelake` | 109 gso/s, 0 ulp | 253 gso/s, 0 ulp | 695 gso/s, ? ulp |
|
|
268
|
+
| `nk_angulars_packed_u8_sapphireamx` | 444 gso/s, 0.2 ulp | 1,210 gso/s, 0.2 ulp | 1,220 gso/s, 0.2 ulp |
|
|
269
|
+
| `nk_angulars_symmetric_u8_sapphireamx` | 103 gso/s, 0.2 ulp | 257 gso/s, 0.2 ulp | 227 gso/s, 0.2 ulp |
|
|
270
|
+
| `nk_euclideans_packed_u8_sapphireamx` | 432 gso/s, 0 ulp | 1,240 gso/s, 0 ulp | 1,200 gso/s, 0 ulp |
|
|
271
|
+
| `nk_euclideans_symmetric_u8_sapphireamx` | 102 gso/s, 0 ulp | 256 gso/s, 0 ulp | 220 gso/s, 0 ulp |
|
|
272
|
+
| __i4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
273
|
+
| `nk_angulars_packed_i4_serial` | 3.79 gso/s, ? ulp | 3.83 gso/s, ? ulp | 4.06 gso/s, ? ulp |
|
|
274
|
+
| `nk_angulars_symmetric_i4_serial` | 3.52 gso/s, ? ulp | 3.58 gso/s, ? ulp | 7.08 gso/s, ? ulp |
|
|
275
|
+
| `nk_euclideans_packed_i4_serial` | 3.69 gso/s, ? ulp | 3.91 gso/s, ? ulp | 3.76 gso/s, ? ulp |
|
|
276
|
+
| `nk_euclideans_symmetric_i4_serial` | 3.45 gso/s, ? ulp | 3.64 gso/s, ? ulp | 6.99 gso/s, ? ulp |
|
|
277
|
+
| `nk_angulars_packed_i4_icelake` | 117 gso/s, ? ulp | 208 gso/s, ? ulp | 249 gso/s, ? ulp |
|
|
278
|
+
| `nk_angulars_symmetric_i4_icelake` | 103 gso/s, ? ulp | 233 gso/s, ? ulp | 561 gso/s, ? ulp |
|
|
279
|
+
| `nk_euclideans_packed_i4_icelake` | 121 gso/s, ? ulp | 173 gso/s, ? ulp | 246 gso/s, ? ulp |
|
|
280
|
+
| `nk_euclideans_symmetric_i4_icelake` | 101 gso/s, ? ulp | 228 gso/s, ? ulp | 572 gso/s, ? ulp |
|
|
281
|
+
| __u4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
282
|
+
| `nk_angulars_packed_u4_serial` | 5.49 gso/s, ? ulp | 5.60 gso/s, ? ulp | 5.78 gso/s, ? ulp |
|
|
283
|
+
| `nk_angulars_symmetric_u4_serial` | 5.18 gso/s, ? ulp | 5.57 gso/s, ? ulp | 11.5 gso/s, ? ulp |
|
|
284
|
+
| `nk_euclideans_packed_u4_serial` | 5.23 gso/s, ? ulp | 5.50 gso/s, ? ulp | 5.64 gso/s, ? ulp |
|
|
285
|
+
| `nk_euclideans_symmetric_u4_serial` | 5.22 gso/s, ? ulp | 5.47 gso/s, ? ulp | 11.1 gso/s, ? ulp |
|
|
286
|
+
| `nk_angulars_packed_u4_icelake` | 153 gso/s, ? ulp | 270 gso/s, ? ulp | 381 gso/s, ? ulp |
|
|
287
|
+
| `nk_angulars_symmetric_u4_icelake` | 122 gso/s, ? ulp | 264 gso/s, ? ulp | 658 gso/s, ? ulp |
|
|
288
|
+
| `nk_euclideans_packed_u4_icelake` | 158 gso/s, ? ulp | 285 gso/s, ? ulp | 385 gso/s, ? ulp |
|
|
289
|
+
| `nk_euclideans_symmetric_u4_icelake` | 120 gso/s, ? ulp | 279 gso/s, ? ulp | 624 gso/s, ? ulp |
|
|
290
|
+
|
|
291
|
+
#### WASM
|
|
292
|
+
|
|
293
|
+
Measured with Wasmtime v42 (Cranelift backend).
|
|
294
|
+
|
|
295
|
+
| Kernel | 256³ | 1024³ | 4096³ |
|
|
296
|
+
| :----------------------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
297
|
+
| __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
298
|
+
| `nk_angulars_packed_f64_serial` | 1.38 gso/s, 0 ulp | 1.37 gso/s, 0 ulp | 1.36 gso/s, 0 ulp |
|
|
299
|
+
| `nk_angulars_symmetric_f64_serial` | 0.267 gso/s, 0 ulp | 0.268 gso/s, 0 ulp | 0.258 gso/s, 0 ulp |
|
|
300
|
+
| `nk_euclideans_packed_f64_serial` | 1.41 gso/s, 0.6 ulp | 1.37 gso/s, 0.6 ulp | 1.36 gso/s, 0.6 ulp |
|
|
301
|
+
| `nk_euclideans_symmetric_f64_serial` | 0.272 gso/s, 0.6 ulp | 0.271 gso/s, 0.5 ulp | 0.161 gso/s, 0.5 ulp |
|
|
302
|
+
| `nk_angulars_packed_f64_v128relaxed` | 10.9 gso/s, 0.1 ulp | 10.9 gso/s, 0.1 ulp | 10.9 gso/s, 0.1 ulp |
|
|
303
|
+
| `nk_angulars_symmetric_f64_v128relaxed` | 0.238 gso/s, 0.1 ulp | 0.240 gso/s, 0.1 ulp | 0.271 gso/s, 0.1 ulp |
|
|
304
|
+
| `nk_euclideans_packed_f64_v128relaxed` | 11.0 gso/s, 0.6 ulp | 11.2 gso/s, 0.6 ulp | 11.2 gso/s, 0.6 ulp |
|
|
305
|
+
| `nk_euclideans_symmetric_f64_v128relaxed` | 0.0463 gso/s, 0.6 ulp | 0.0465 gso/s, 0.5 ulp | 0.00806 gso/s, 0.5 ulp |
|
|
306
|
+
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
307
|
+
| `nk_angulars_packed_f32_serial` | 4.16 gso/s, 0.1 ulp | 4.26 gso/s, 0.1 ulp | 4.39 gso/s, 0.1 ulp |
|
|
308
|
+
| `nk_angulars_symmetric_f32_serial` | 3.08 gso/s, 0.1 ulp | 4.88 gso/s, 0.1 ulp | 5.69 gso/s, 0.1 ulp |
|
|
309
|
+
| `nk_euclideans_packed_f32_serial` | 4.19 gso/s, 0.6 ulp | 4.32 gso/s, 0.6 ulp | 4.33 gso/s, 0.5 ulp |
|
|
310
|
+
| `nk_euclideans_symmetric_f32_serial` | 3.05 gso/s, 0.5 ulp | 4.97 gso/s, 0.5 ulp | 5.64 gso/s, 0.5 ulp |
|
|
311
|
+
| `nk_angulars_packed_f32_v128relaxed` | 9.41 gso/s, 0.1 ulp | 10.6 gso/s, 0.1 ulp | 10.7 gso/s, 0.1 ulp |
|
|
312
|
+
| `nk_angulars_symmetric_f32_v128relaxed` | 3.64 gso/s, 0.1 ulp | 6.14 gso/s, 0.1 ulp | 7.33 gso/s, 0.1 ulp |
|
|
313
|
+
| `nk_euclideans_packed_f32_v128relaxed` | 9.55 gso/s, 0.2 ulp | 10.6 gso/s, 0.2 ulp | 10.6 gso/s, 0.2 ulp |
|
|
314
|
+
| `nk_euclideans_symmetric_f32_v128relaxed` | 3.55 gso/s, 0.2 ulp | 6.15 gso/s, 0.2 ulp | 7.27 gso/s, 0.2 ulp |
|
|
315
|
+
| __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
316
|
+
| `nk_angulars_packed_bf16_serial` | 4.10 gso/s, 0 ulp | 4.33 gso/s, 0.2 ulp | 4.45 gso/s, 0.6 ulp |
|
|
317
|
+
| `nk_angulars_symmetric_bf16_serial` | 3.74 gso/s, 0 ulp | 6.15 gso/s, 0.2 ulp | 7.39 gso/s, 0.6 ulp |
|
|
318
|
+
| `nk_euclideans_packed_bf16_serial` | 4.26 gso/s, 0.7 ulp | 4.35 gso/s, 6.1 ulp | 4.40 gso/s, 32 ulp |
|
|
319
|
+
| `nk_euclideans_symmetric_bf16_serial` | 3.80 gso/s, 0.6 ulp | 6.16 gso/s, 5.3 ulp | 7.40 gso/s, 28 ulp |
|
|
320
|
+
| `nk_angulars_packed_bf16_v128relaxed` | 22.0 gso/s, 0 ulp | 24.8 gso/s, 0.2 ulp | 24.7 gso/s, 0.6 ulp |
|
|
321
|
+
| `nk_angulars_symmetric_bf16_v128relaxed` | 4.78 gso/s, 0 ulp | 9.61 gso/s, 0.2 ulp | 12.5 gso/s, 0.6 ulp |
|
|
322
|
+
| `nk_euclideans_packed_bf16_v128relaxed` | 22.2 gso/s, 0.7 ulp | 24.1 gso/s, 6.1 ulp | 24.8 gso/s, 32 ulp |
|
|
323
|
+
| `nk_euclideans_symmetric_bf16_v128relaxed` | 4.72 gso/s, 0.3 ulp | 9.53 gso/s, 5.1 ulp | 12.4 gso/s, 28 ulp |
|
|
324
|
+
| __e2m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
325
|
+
| `nk_angulars_packed_e2m3_serial` | 2.66 gso/s, 0 ulp | 2.71 gso/s, 0 ulp | 2.63 gso/s, 0 ulp |
|
|
326
|
+
| `nk_angulars_symmetric_e2m3_serial` | 0.0400 gso/s, 0 ulp | 0.0413 gso/s, 0 ulp | 0.238 gso/s, 0 ulp |
|
|
327
|
+
| `nk_euclideans_packed_e2m3_serial` | 2.74 gso/s, 0.5 ulp | 2.70 gso/s, 0.5 ulp | 2.67 gso/s, 0.5 ulp |
|
|
328
|
+
| `nk_euclideans_symmetric_e2m3_serial` | 0.0403 gso/s, 0.5 ulp | 0.0411 gso/s, 0.4 ulp | 0.0401 gso/s, 0.4 ulp |
|
|
329
|
+
| `nk_angulars_packed_e2m3_v128relaxed` | 18.4 gso/s, 0 ulp | 18.6 gso/s, 0 ulp | 18.5 gso/s, 0 ulp |
|
|
330
|
+
| `nk_angulars_symmetric_e2m3_v128relaxed` | 0.0559 gso/s, 0 ulp | 0.0180 gso/s, 0 ulp | 0.131 gso/s, 0 ulp |
|
|
331
|
+
| `nk_euclideans_packed_e2m3_v128relaxed` | 18.5 gso/s, 0 ulp | 18.7 gso/s, 0 ulp | 18.1 gso/s, 0 ulp |
|
|
332
|
+
| `nk_euclideans_symmetric_e2m3_v128relaxed` | 0.206 gso/s, 0 ulp | 0.0170 gso/s, 0 ulp | 0.0554 gso/s, 0 ulp |
|
|
333
|
+
| __i8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
334
|
+
| `nk_angulars_packed_i8_serial` | 4.73 gso/s, 0 ulp | 4.81 gso/s, 0 ulp | 4.59 gso/s, 0 ulp |
|
|
335
|
+
| `nk_angulars_symmetric_i8_serial` | 0.00447 gso/s, 0 ulp | 0.198 gso/s, 0 ulp | 0.190 gso/s, 0 ulp |
|
|
336
|
+
| `nk_euclideans_packed_i8_serial` | 4.77 gso/s, 0.5 ulp | 4.80 gso/s, 0.4 ulp | 4.65 gso/s, 0.4 ulp |
|
|
337
|
+
| `nk_euclideans_symmetric_i8_serial` | 0.201 gso/s, 0.5 ulp | 0.0819 gso/s, 0.4 ulp | 0.0823 gso/s, 0.4 ulp |
|
|
338
|
+
| `nk_angulars_packed_i8_v128relaxed` | 31.6 gso/s, 0 ulp | 31.7 gso/s, 0 ulp | 31.1 gso/s, 0 ulp |
|
|
339
|
+
| `nk_angulars_symmetric_i8_v128relaxed` | 0.0304 gso/s, 0 ulp | 0.0680 gso/s, 0 ulp | 0.298 gso/s, 0 ulp |
|
|
340
|
+
| `nk_euclideans_packed_i8_v128relaxed` | 31.5 gso/s, 0 ulp | 32.3 gso/s, 0 ulp | 30.8 gso/s, 0 ulp |
|
|
341
|
+
| `nk_euclideans_symmetric_i8_v128relaxed` | 0.224 gso/s, 0 ulp | 0.222 gso/s, 0 ulp | 0.143 gso/s, 0 ulp |
|
|
342
|
+
| __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
343
|
+
| `nk_angulars_packed_u8_serial` | 4.26 gso/s, 0.4 ulp | 5.07 gso/s, 0.3 ulp | 5.11 gso/s, 0.3 ulp |
|
|
344
|
+
| `nk_angulars_symmetric_u8_serial` | 2.64 gso/s, 0.4 ulp | 4.02 gso/s, 0.3 ulp | 4.34 gso/s, 0.3 ulp |
|
|
345
|
+
| `nk_euclideans_packed_u8_serial` | 4.35 gso/s, 0.5 ulp | 4.67 gso/s, 0.5 ulp | 5.09 gso/s, 0.5 ulp |
|
|
346
|
+
| `nk_euclideans_symmetric_u8_serial` | 2.64 gso/s, 0.5 ulp | 3.97 gso/s, 0.5 ulp | 4.38 gso/s, 0.5 ulp |
|
|
347
|
+
| `nk_angulars_packed_u8_v128relaxed` | 23.7 gso/s, 0.3 ulp | 25.1 gso/s, 0.3 ulp | 25.8 gso/s, 0.3 ulp |
|
|
348
|
+
| `nk_angulars_symmetric_u8_v128relaxed` | 19.6 gso/s, 0.3 ulp | 23.2 gso/s, 0.3 ulp | 24.1 gso/s, 0.3 ulp |
|
|
349
|
+
| `nk_euclideans_packed_u8_v128relaxed` | 23.8 gso/s, 0 ulp | 25.3 gso/s, 0 ulp | 25.8 gso/s, 0 ulp |
|
|
350
|
+
| `nk_euclideans_symmetric_u8_v128relaxed` | 19.5 gso/s, 0 ulp | 23.0 gso/s, 0 ulp | 24.6 gso/s, 0 ulp |
|
|
351
|
+
| __i4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
352
|
+
| `nk_angulars_packed_i4_serial` | 6.22 gso/s, 0.35 ulp | 6.41 gso/s, 0.34 ulp | 6.55 gso/s, 0.35 ulp |
|
|
353
|
+
| `nk_angulars_symmetric_i4_serial` | 2.64 gso/s, 0.34 ulp | 3.69 gso/s, 0.34 ulp | 4.18 gso/s, 0.34 ulp |
|
|
354
|
+
| `nk_euclideans_packed_i4_serial` | 6.00 gso/s, 0.49 ulp | 6.43 gso/s, 0.54 ulp | 6.56 gso/s, 0.64 ulp |
|
|
355
|
+
| `nk_euclideans_symmetric_i4_serial` | 2.61 gso/s, 0.48 ulp | 3.68 gso/s, 0.53 ulp | 4.14 gso/s, 0.63 ulp |
|
|
356
|
+
| __u4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
357
|
+
| `nk_angulars_packed_u4_serial` | 5.38 gso/s, 0.35 ulp | 5.60 gso/s, 0.34 ulp | 5.81 gso/s, 0.35 ulp |
|
|
358
|
+
| `nk_angulars_symmetric_u4_serial` | 2.90 gso/s, 0.34 ulp | 4.28 gso/s, 0.34 ulp | 4.90 gso/s, 0.34 ulp |
|
|
359
|
+
| `nk_euclideans_packed_u4_serial` | 5.25 gso/s, 0.49 ulp | 5.64 gso/s, 0.54 ulp | 5.82 gso/s, 0.64 ulp |
|
|
360
|
+
| `nk_euclideans_symmetric_u4_serial` | 2.89 gso/s, 0.48 ulp | 4.30 gso/s, 0.53 ulp | 4.86 gso/s, 0.63 ulp |
|
|
361
|
+
|
|
362
|
+
### Apple M4
|
|
363
|
+
|
|
364
|
+
#### Native
|
|
365
|
+
|
|
366
|
+
| Kernel | 256³ | 1024³ | 4096³ |
|
|
367
|
+
| :--------------------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
368
|
+
| __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
369
|
+
| `nk_angulars_packed_f64_serial` | 1.96 gso/s, 0 ulp | 1.82 gso/s, 0 ulp | 2.42 gso/s, 0 ulp |
|
|
370
|
+
| `nk_angulars_symmetric_f64_serial` | 1.41 gso/s, 0.04 ulp | 1.41 gso/s, 0.02 ulp | 1.42 gso/s, 0.009 ulp |
|
|
371
|
+
| `nk_euclideans_packed_f64_serial` | 1.95 gso/s, 0.6 ulp | 1.82 gso/s, 0.6 ulp | 2.42 gso/s, 0.6 ulp |
|
|
372
|
+
| `nk_euclideans_symmetric_f64_serial` | 1.41 gso/s, 0.6 ulp | 1.41 gso/s, 0.6 ulp | 1.42 gso/s, 0.6 ulp |
|
|
373
|
+
| `nk_angulars_packed_f64_neon` | 5.44 gso/s, 7,798 ulp | 5.38 gso/s, 3,868 ulp | 5.64 gso/s, 1,720 ulp |
|
|
374
|
+
| `nk_angulars_symmetric_f64_neon` | 5.47 gso/s, 7,660 ulp | 3.93 gso/s, 3,790 ulp | 3.77 gso/s, 1,720 ulp |
|
|
375
|
+
| `nk_euclideans_packed_f64_neon` | 5.69 gso/s, 0.2 ulp | 5.03 gso/s, 0.2 ulp | 5.35 gso/s, 0.2 ulp |
|
|
376
|
+
| `nk_euclideans_symmetric_f64_neon` | 5.21 gso/s, 0.2 ulp | 5.26 gso/s, 0.2 ulp | 3.99 gso/s, 0.2 ulp |
|
|
377
|
+
| `nk_angulars_packed_f64_smef64` | 24.3 gso/s, 0.0165 ulp | 28.8 gso/s, 0.0165 ulp | 14.1 gso/s, 0.0165 ulp |
|
|
378
|
+
| `nk_angulars_symmetric_f64_smef64` | 8.93 gso/s, 0.0182 ulp | 9.57 gso/s, 0.0182 ulp | 9.16 gso/s, 0.0182 ulp |
|
|
379
|
+
| `nk_euclideans_packed_f64_smef64` | 32.9 gso/s, 0.236 ulp | 35.4 gso/s, 0.237 ulp | 32.4 gso/s, 0.237 ulp |
|
|
380
|
+
| `nk_euclideans_symmetric_f64_smef64` | 9.89 gso/s, 0.281 ulp | 10.3 gso/s, 0.281 ulp | 9.83 gso/s, 0.281 ulp |
|
|
381
|
+
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
382
|
+
| `nk_angulars_packed_f32_serial` | 10.3 gso/s, 0.1 ulp | 10.7 gso/s, 0.1 ulp | 10.5 gso/s, 0.1 ulp |
|
|
383
|
+
| `nk_angulars_symmetric_f32_serial` | 8.26 gso/s, 0.3 ulp | 8.53 gso/s, 0.3 ulp | 8.47 gso/s, 0.1 ulp |
|
|
384
|
+
| `nk_euclideans_packed_f32_serial` | 10.9 gso/s, 0.6 ulp | 11.0 gso/s, 0.5 ulp | 11.0 gso/s, 0.5 ulp |
|
|
385
|
+
| `nk_euclideans_symmetric_f32_serial` | 8.47 gso/s, 3.9 ulp | 8.74 gso/s, 7.9 ulp | 8.77 gso/s, 3.4 ulp |
|
|
386
|
+
| `nk_angulars_packed_f32_neon` | 36.5 gso/s, 0 ulp | 40.4 gso/s, 0 ulp | 41.8 gso/s, 1,740 ulp |
|
|
387
|
+
| `nk_angulars_symmetric_f32_neon` | 9.99 gso/s, 7,690 ulp | 10.1 gso/s, 3,830 ulp | 10.4 gso/s, 1,730 ulp |
|
|
388
|
+
| `nk_euclideans_packed_f32_neon` | 39.8 gso/s, 0.2 ulp | 41.4 gso/s, 0.2 ulp | 42.5 gso/s, 3.5 ulp |
|
|
389
|
+
| `nk_euclideans_symmetric_f32_neon` | 10.2 gso/s, 3.8 ulp | 10.4 gso/s, 7.8 ulp | 10.7 gso/s, 3.5 ulp |
|
|
390
|
+
| `nk_angulars_packed_f32_smef64` | 54.2 gso/s, 0.153 ulp | 69.1 gso/s, 0.153 ulp | 57.7 gso/s, 0.153 ulp |
|
|
391
|
+
| `nk_angulars_symmetric_f32_smef64` | 33.7 gso/s, 0.128 ulp | 38.1 gso/s, 0.128 ulp | 28.2 gso/s, 0.128 ulp |
|
|
392
|
+
| `nk_euclideans_packed_f32_smef64` | 120 gso/s, 2.16 ulp | 108 gso/s, 2.16 ulp | 131 gso/s, 2.16 ulp |
|
|
393
|
+
| `nk_euclideans_symmetric_f32_smef64` | 37.0 gso/s, 1.50 ulp | 29.3 gso/s, 1.50 ulp | 36.5 gso/s, 1.50 ulp |
|
|
394
|
+
| __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
395
|
+
| `nk_angulars_packed_bf16_serial` | 14.3 gso/s, 0 ulp | 14.8 gso/s, 0.1 ulp | 14.7 gso/s, 0 ulp |
|
|
396
|
+
| `nk_angulars_symmetric_bf16_serial` | 13.1 gso/s, 0.04 ulp | 13.5 gso/s, 0.1 ulp | 13.1 gso/s, 0.04 ulp |
|
|
397
|
+
| `nk_euclideans_packed_bf16_serial` | 14.3 gso/s, 0.6 ulp | 14.8 gso/s, 3.1 ulp | 14.7 gso/s, 2.1 ulp |
|
|
398
|
+
| `nk_euclideans_symmetric_bf16_serial` | 13.1 gso/s, 0.6 ulp | 13.5 gso/s, 3.1 ulp | 13.1 gso/s, 2.1 ulp |
|
|
399
|
+
| `nk_angulars_packed_bf16_neonbfdot` | 55.8 gso/s, 0 ulp | 57.6 gso/s, 0.1 ulp | 58.5 gso/s, 0.04 ulp |
|
|
400
|
+
| `nk_angulars_symmetric_bf16_neonbfdot` | 36.8 gso/s, 0 ulp | 38.6 gso/s, 0.1 ulp | 36.7 gso/s, 0.04 ulp |
|
|
401
|
+
| `nk_euclideans_packed_bf16_neonbfdot` | 56.4 gso/s, 0.3 ulp | 57.7 gso/s, 2.9 ulp | 58.6 gso/s, 1.9 ulp |
|
|
402
|
+
| `nk_euclideans_symmetric_bf16_neonbfdot` | 36.9 gso/s, 0.3 ulp | 39.2 gso/s, 2.9 ulp | 40.3 gso/s, 1.9 ulp |
|
|
403
|
+
| `nk_angulars_packed_bf16_sme` | 272 gso/s, 0.0363 ulp | 274 gso/s, 0.0363 ulp | 377 gso/s, 0.0363 ulp |
|
|
404
|
+
| `nk_angulars_symmetric_bf16_sme` | 103 gso/s, 0.0256 ulp | 103 gso/s, 0.0256 ulp | 120 gso/s, 0.0255 ulp |
|
|
405
|
+
| `nk_euclideans_packed_bf16_sme` | 356 gso/s, 0.538 ulp | 335 gso/s, 0.538 ulp | 400 gso/s, 0.538 ulp |
|
|
406
|
+
| `nk_euclideans_symmetric_bf16_sme` | 119 gso/s, 0.275 ulp | 99.8 gso/s, 0.275 ulp | 96.8 gso/s, 0.275 ulp |
|
|
407
|
+
| __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
408
|
+
| `nk_angulars_packed_f16_serial` | 12.9 gso/s, 0.1 ulp | 13.3 gso/s, 0.1 ulp | 12.7 gso/s, 0.1 ulp |
|
|
409
|
+
| `nk_angulars_symmetric_f16_serial` | 17.7 gso/s, 0.1 ulp | 16.9 gso/s, 0.09 ulp | 16.1 gso/s, 0.1 ulp |
|
|
410
|
+
| `nk_euclideans_packed_f16_serial` | 12.9 gso/s, 1.1 ulp | 13.4 gso/s, 0.7 ulp | 12.7 gso/s, 5.6 ulp |
|
|
411
|
+
| `nk_euclideans_symmetric_f16_serial` | 17.7 gso/s, 1.1 ulp | 16.9 gso/s, 0.7 ulp | 16.1 gso/s, 5.6 ulp |
|
|
412
|
+
| `nk_angulars_packed_f16_neonhalf` | 76.7 gso/s, 0.1 ulp | 74.4 gso/s, 0.1 ulp | 83.6 gso/s, 0.1 ulp |
|
|
413
|
+
| `nk_angulars_symmetric_f16_neonhalf` | 20.8 gso/s, 0.1 ulp | 20.2 gso/s, 0.1 ulp | 20.9 gso/s, 0.1 ulp |
|
|
414
|
+
| `nk_euclideans_packed_f16_neonhalf` | 80.9 gso/s, 0.9 ulp | 77.3 gso/s, 0.7 ulp | 84.0 gso/s, 5.9 ulp |
|
|
415
|
+
| `nk_euclideans_symmetric_f16_neonhalf` | 21.1 gso/s, 0.9 ulp | 20.6 gso/s, 0.6 ulp | 20.9 gso/s, 5.8 ulp |
|
|
416
|
+
| `nk_angulars_packed_f16_neonfhm` | 101 gso/s, 0.1 ulp | 109 gso/s, 0.1 ulp | 113 gso/s, 0.1 ulp |
|
|
417
|
+
| `nk_angulars_symmetric_f16_neonfhm` | 33.8 gso/s, 0.1 ulp | 35.9 gso/s, 0.1 ulp | 36.4 gso/s, 0.1 ulp |
|
|
418
|
+
| `nk_euclideans_packed_f16_neonfhm` | 102 gso/s, 0.9 ulp | 106 gso/s, 0.7 ulp | 106 gso/s, 5.9 ulp |
|
|
419
|
+
| `nk_euclideans_symmetric_f16_neonfhm` | 34.1 gso/s, 0.9 ulp | 36.0 gso/s, 0.6 ulp | 36.4 gso/s, 5.8 ulp |
|
|
420
|
+
| `nk_angulars_packed_f16_sme` | 342 gso/s, 3,480 ulp | 344 gso/s, 3,480 ulp | 348 gso/s, 3,480 ulp |
|
|
421
|
+
| `nk_angulars_symmetric_f16_sme` | 120 gso/s, 5,050 ulp | 120 gso/s, 5,050 ulp | 94.6 gso/s, 5,080 ulp |
|
|
422
|
+
| `nk_euclideans_packed_f16_sme` | 414 gso/s, 62,900 ulp | 419 gso/s, 62,900 ulp | 318 gso/s, 62,900 ulp |
|
|
423
|
+
| `nk_euclideans_symmetric_f16_sme` | 104 gso/s, 91,000 ulp | 134 gso/s, 91,500 ulp | 140 gso/s, 91,500 ulp |
|
|
424
|
+
| __e5m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
425
|
+
| `nk_angulars_packed_e5m2_serial` | 8.99 gso/s, 0 ulp | 9.47 gso/s, 0 ulp | 9.55 gso/s, 0 ulp |
|
|
426
|
+
| `nk_angulars_symmetric_e5m2_serial` | 2.53 gso/s, 0 ulp | 2.66 gso/s, 0 ulp | 5.29 gso/s, 0 ulp |
|
|
427
|
+
| `nk_euclideans_packed_e5m2_serial` | 9.08 gso/s, 0.5 ulp | 9.35 gso/s, 0.5 ulp | 9.59 gso/s, 0.5 ulp |
|
|
428
|
+
| `nk_euclideans_symmetric_e5m2_serial` | 2.50 gso/s, 0.5 ulp | 2.67 gso/s, 0.5 ulp | 5.32 gso/s, 0.5 ulp |
|
|
429
|
+
| `nk_angulars_packed_e5m2_neonfhm` | 87.6 gso/s, 0 ulp | 96.1 gso/s, 0 ulp | 98.7 gso/s, 0 ulp |
|
|
430
|
+
| `nk_angulars_symmetric_e5m2_neonfhm` | 60.0 gso/s, 0 ulp | 68.7 gso/s, 0 ulp | 63.3 gso/s, 0 ulp |
|
|
431
|
+
| `nk_euclideans_packed_e5m2_neonfhm` | 89.4 gso/s, 0 ulp | 96.6 gso/s, 0 ulp | 98.9 gso/s, 0 ulp |
|
|
432
|
+
| `nk_euclideans_symmetric_e5m2_neonfhm` | 57.8 gso/s, 0 ulp | 60.3 gso/s, 0 ulp | 64.4 gso/s, 0 ulp |
|
|
433
|
+
| `nk_angulars_packed_e5m2_sme` | 287 gso/s, 0.0130 ulp | 289 gso/s, 0.0130 ulp | 250 gso/s, 0.0130 ulp |
|
|
434
|
+
| `nk_angulars_symmetric_e5m2_sme` | 130 gso/s, 0.0130 ulp | 149 gso/s, 0.0130 ulp | 132 gso/s, 0.0130 ulp |
|
|
435
|
+
| `nk_euclideans_packed_e5m2_sme` | 590 gso/s, 0.00483 ulp | 589 gso/s, 0.00483 ulp | 590 gso/s, 0.00482 ulp |
|
|
436
|
+
| `nk_euclideans_symmetric_e5m2_sme` | 294 gso/s, 0.00350 ulp | 293 gso/s, 0.00350 ulp | 292 gso/s, 0.00350 ulp |
|
|
437
|
+
| __e4m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
438
|
+
| `nk_angulars_packed_e4m3_serial` | 0.837 gso/s, 0 ulp | 0.823 gso/s, 0 ulp | 0.831 gso/s, 0 ulp |
|
|
439
|
+
| `nk_angulars_symmetric_e4m3_serial` | 0.419 gso/s, 0.03 ulp | 0.423 gso/s, 0.02 ulp | 0.852 gso/s, 0.01 ulp |
|
|
440
|
+
| `nk_euclideans_packed_e4m3_serial` | 0.831 gso/s, 0.5 ulp | 0.840 gso/s, 0.5 ulp | 0.836 gso/s, 0.5 ulp |
|
|
441
|
+
| `nk_euclideans_symmetric_e4m3_serial` | 0.421 gso/s, 0.5 ulp | 0.429 gso/s, 0.5 ulp | 0.857 gso/s, 0.3 ulp |
|
|
442
|
+
| `nk_angulars_packed_e4m3_neonfhm` | 27.3 gso/s, 0 ulp | 27.8 gso/s, 0 ulp | 33.2 gso/s, 0 ulp |
|
|
443
|
+
| `nk_angulars_symmetric_e4m3_neonfhm` | 33.0 gso/s, 0 ulp | 35.3 gso/s, 0 ulp | 35.3 gso/s, 0 ulp |
|
|
444
|
+
| `nk_euclideans_packed_e4m3_neonfhm` | 29.5 gso/s, 0 ulp | 26.6 gso/s, 0 ulp | 27.4 gso/s, 0.2 ulp |
|
|
445
|
+
| `nk_euclideans_symmetric_e4m3_neonfhm` | 32.1 gso/s, 0 ulp | 33.9 gso/s, 0 ulp | 33.3 gso/s, 0.2 ulp |
|
|
446
|
+
| `nk_angulars_packed_e4m3_sme` | 90.3 gso/s, 0.0131 ulp | 89.9 gso/s, 0.0131 ulp | 96.6 gso/s, 0.0131 ulp |
|
|
447
|
+
| `nk_angulars_symmetric_e4m3_sme` | 51.5 gso/s, 0.0131 ulp | 38.1 gso/s, 0.0131 ulp | 51.2 gso/s, 0.0131 ulp |
|
|
448
|
+
| `nk_euclideans_packed_e4m3_sme` | 233 gso/s, 0.113 ulp | 185 gso/s, 0.113 ulp | 233 gso/s, 0.113 ulp |
|
|
449
|
+
| `nk_euclideans_symmetric_e4m3_sme` | 64.2 gso/s, 0.113 ulp | 64.4 gso/s, 0.113 ulp | 64.2 gso/s, 0.113 ulp |
|
|
450
|
+
| __e3m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
451
|
+
| `nk_angulars_packed_e3m2_serial` | 8.34 gso/s, 0 ulp | 8.62 gso/s, 0 ulp | 9.17 gso/s, 0 ulp |
|
|
452
|
+
| `nk_angulars_symmetric_e3m2_serial` | 2.48 gso/s, 0 ulp | 2.67 gso/s, 0 ulp | 5.36 gso/s, 0 ulp |
|
|
453
|
+
| `nk_euclideans_packed_e3m2_serial` | 8.74 gso/s, 0.5 ulp | 9.08 gso/s, 0.5 ulp | 8.91 gso/s, 0.5 ulp |
|
|
454
|
+
| `nk_euclideans_symmetric_e3m2_serial` | 2.50 gso/s, 0.5 ulp | 2.69 gso/s, 0.5 ulp | 5.22 gso/s, 0.5 ulp |
|
|
455
|
+
| `nk_angulars_packed_e3m2_sme` | 183 gso/s, 0.0130 ulp | 185 gso/s, 0.0130 ulp | 177 gso/s, 0.0130 ulp |
|
|
456
|
+
| `nk_angulars_symmetric_e3m2_sme` | 87.8 gso/s, 0.0131 ulp | 78.5 gso/s, 0.0131 ulp | 87.5 gso/s, 0.0131 ulp |
|
|
457
|
+
| `nk_euclideans_packed_e3m2_sme` | 423 gso/s, 0 ulp | 422 gso/s, 0 ulp | 422 gso/s, 0 ulp |
|
|
458
|
+
| `nk_euclideans_symmetric_e3m2_sme` | 121 gso/s, 0 ulp | 122 gso/s, 0 ulp | 122 gso/s, 0 ulp |
|
|
459
|
+
| __e2m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
460
|
+
| `nk_angulars_packed_e2m3_serial` | 8.60 gso/s, 0 ulp | 8.83 gso/s, 0 ulp | 9.14 gso/s, 0 ulp |
|
|
461
|
+
| `nk_angulars_symmetric_e2m3_serial` | 2.52 gso/s, 0 ulp | 2.64 gso/s, 0 ulp | 5.43 gso/s, 0 ulp |
|
|
462
|
+
| `nk_euclideans_packed_e2m3_serial` | 8.65 gso/s, 0.5 ulp | 9.03 gso/s, 0.5 ulp | 8.98 gso/s, 0.5 ulp |
|
|
463
|
+
| `nk_euclideans_symmetric_e2m3_serial` | 2.53 gso/s, 0.5 ulp | 2.68 gso/s, 0.5 ulp | 5.30 gso/s, 0.5 ulp |
|
|
464
|
+
| `nk_angulars_packed_e2m3_sme` | 141 gso/s, 0.0132 ulp | 128 gso/s, 0.0132 ulp | 132 gso/s, 0.0132 ulp |
|
|
465
|
+
| `nk_angulars_symmetric_e2m3_sme` | 134 gso/s, 0.0130 ulp | 132 gso/s, 0.0130 ulp | 138 gso/s, 0.0130 ulp |
|
|
466
|
+
| `nk_euclideans_packed_e2m3_sme` | 307 gso/s, 0 ulp | 307 gso/s, 0 ulp | 307 gso/s, 0 ulp |
|
|
467
|
+
| `nk_euclideans_symmetric_e2m3_sme` | 227 gso/s, 0 ulp | 231 gso/s, 0 ulp | 231 gso/s, 0 ulp |
|
|
468
|
+
| __i8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
469
|
+
| `nk_angulars_packed_i8_serial` | 12.8 gso/s, 0 ulp | 13.6 gso/s, 0 ulp | 14.3 gso/s, 0 ulp |
|
|
470
|
+
| `nk_angulars_symmetric_i8_serial` | 4.30 gso/s, 0 ulp | 4.57 gso/s, 0 ulp | 8.96 gso/s, 0 ulp |
|
|
471
|
+
| `nk_euclideans_packed_i8_serial` | 13.1 gso/s, 0.4 ulp | 14.1 gso/s, 0.4 ulp | 13.4 gso/s, 0.4 ulp |
|
|
472
|
+
| `nk_euclideans_symmetric_i8_serial` | 4.28 gso/s, 0.4 ulp | 4.43 gso/s, 0.4 ulp | 8.66 gso/s, 0.4 ulp |
|
|
473
|
+
| `nk_angulars_packed_i8_neonsdot` | 267 gso/s, 0 ulp | 357 gso/s, 0 ulp | 369 gso/s, 0 ulp |
|
|
474
|
+
| `nk_angulars_symmetric_i8_neonsdot` | 69.7 gso/s, 0 ulp | 78.6 gso/s, 0 ulp | 82.6 gso/s, 0 ulp |
|
|
475
|
+
| `nk_euclideans_packed_i8_neonsdot` | 294 gso/s, 0 ulp | 365 gso/s, 0 ulp | 476 gso/s, 0 ulp |
|
|
476
|
+
| `nk_euclideans_symmetric_i8_neonsdot` | 70.5 gso/s, 0 ulp | 83.9 gso/s, 0 ulp | 87.3 gso/s, 0 ulp |
|
|
477
|
+
| `nk_angulars_packed_i8_sme` | 219 gso/s, 0.0133 ulp | 219 gso/s, 0.0133 ulp | 223 gso/s, 0.0133 ulp |
|
|
478
|
+
| `nk_angulars_symmetric_i8_sme` | 177 gso/s, 0.0134 ulp | 177 gso/s, 0.0134 ulp | 177 gso/s, 0.0134 ulp |
|
|
479
|
+
| `nk_euclideans_packed_i8_sme` | 228 gso/s, 0 ulp | 246 gso/s, 0 ulp | 243 gso/s, 0 ulp |
|
|
480
|
+
| `nk_euclideans_symmetric_i8_sme` | 159 gso/s, 0 ulp | 167 gso/s, 0 ulp | 169 gso/s, 0 ulp |
|
|
481
|
+
| __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
482
|
+
| `nk_angulars_packed_u8_serial` | 10.1 gso/s, 0.3 ulp | 10.3 gso/s, 0.3 ulp | 10.6 gso/s, 0.3 ulp |
|
|
483
|
+
| `nk_angulars_symmetric_u8_serial` | 4.88 gso/s, 0.3 ulp | 5.11 gso/s, 0.3 ulp | 10.2 gso/s, 0.3 ulp |
|
|
484
|
+
| `nk_euclideans_packed_u8_serial` | 10.1 gso/s, 0.5 ulp | 10.5 gso/s, 0.5 ulp | 10.0 gso/s, 0.6 ulp |
|
|
485
|
+
| `nk_euclideans_symmetric_u8_serial` | 4.83 gso/s, 0.5 ulp | 5.10 gso/s, 0.5 ulp | 9.83 gso/s, 0.6 ulp |
|
|
486
|
+
| `nk_angulars_packed_u8_neonsdot` | 264 gso/s, 0.3 ulp | 345 gso/s, 0.3 ulp | 379 gso/s, 0.3 ulp |
|
|
487
|
+
| `nk_angulars_symmetric_u8_neonsdot` | 69.7 gso/s, 0.3 ulp | 78.6 gso/s, 0.3 ulp | 82.3 gso/s, 0.3 ulp |
|
|
488
|
+
| `nk_euclideans_packed_u8_neonsdot` | 283 gso/s, 0 ulp | 383 gso/s, 0 ulp | 477 gso/s, 0 ulp |
|
|
489
|
+
| `nk_euclideans_symmetric_u8_neonsdot` | 70.8 gso/s, 0 ulp | 85.3 gso/s, 0 ulp | 87.6 gso/s, 0 ulp |
|
|
490
|
+
| `nk_angulars_packed_u8_sme` | 181 gso/s, 0.324 ulp | 181 gso/s, 0.324 ulp | 198 gso/s, 0.324 ulp |
|
|
491
|
+
| `nk_angulars_symmetric_u8_sme` | 193 gso/s, 0.323 ulp | 192 gso/s, 0.323 ulp | 159 gso/s, 0.323 ulp |
|
|
492
|
+
| `nk_euclideans_packed_u8_sme` | 254 gso/s, 0 ulp | 270 gso/s, 0 ulp | 225 gso/s, 0 ulp |
|
|
493
|
+
| `nk_euclideans_symmetric_u8_sme` | 170 gso/s, 0 ulp | 215 gso/s, 0 ulp | 162 gso/s, 0 ulp |
|
|
494
|
+
| __i4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
495
|
+
| `nk_angulars_packed_i4_serial` | 26.6 gso/s, 0.3 ulp | 27.9 gso/s, 0.3 ulp | 28.1 gso/s, 0.3 ulp |
|
|
496
|
+
| `nk_angulars_symmetric_i4_serial` | 7.66 gso/s, 0.3 ulp | 7.84 gso/s, 0.3 ulp | 7.80 gso/s, 0.3 ulp |
|
|
497
|
+
| `nk_euclideans_packed_i4_serial` | 26.6 gso/s, 0.5 ulp | 27.6 gso/s, 0.5 ulp | 27.6 gso/s, 0.6 ulp |
|
|
498
|
+
| `nk_euclideans_symmetric_i4_serial` | 7.75 gso/s, 0.5 ulp | 7.75 gso/s, 0.5 ulp | 7.82 gso/s, 0.6 ulp |
|
|
499
|
+
| `nk_angulars_packed_i4_neonsdot` | 207 gso/s, 0.3 ulp | 263 gso/s, 0.3 ulp | 279 gso/s, 0.3 ulp |
|
|
500
|
+
| `nk_angulars_symmetric_i4_neonsdot` | 97.6 gso/s, 0.3 ulp | 135 gso/s, 0.3 ulp | 151 gso/s, 0.3 ulp |
|
|
501
|
+
| `nk_euclideans_packed_i4_neonsdot` | 218 gso/s, 0.0 ulp | 266 gso/s, 0.0 ulp | 303 gso/s, 0.0 ulp |
|
|
502
|
+
| `nk_euclideans_symmetric_i4_neonsdot` | 101 gso/s, 0.0 ulp | 138 gso/s, 0.0 ulp | 156 gso/s, 0.0 ulp |
|
|
503
|
+
| `nk_angulars_packed_i4_sme` | 251 gso/s, 0.324 ulp | 251 gso/s, 0.324 ulp | 256 gso/s, 0.324 ulp |
|
|
504
|
+
| `nk_angulars_symmetric_i4_sme` | 274 gso/s, 0.323 ulp | 273 gso/s, 0.323 ulp | 281 gso/s, 0.323 ulp |
|
|
505
|
+
| `nk_euclideans_packed_i4_sme` | 260 gso/s, 0 ulp | 319 gso/s, 0 ulp | 275 gso/s, 0 ulp |
|
|
506
|
+
| `nk_euclideans_symmetric_i4_sme` | 258 gso/s, 0 ulp | 272 gso/s, 0 ulp | 264 gso/s, 0 ulp |
|
|
507
|
+
| __u4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
508
|
+
| `nk_angulars_packed_u4_serial` | 24.1 gso/s, 0.3 ulp | 25.2 gso/s, 0.3 ulp | 25.9 gso/s, 0.3 ulp |
|
|
509
|
+
| `nk_angulars_symmetric_u4_serial` | 8.23 gso/s, 0.3 ulp | 8.59 gso/s, 0.3 ulp | 8.53 gso/s, 0.3 ulp |
|
|
510
|
+
| `nk_euclideans_packed_u4_serial` | 24.6 gso/s, 0.5 ulp | 25.5 gso/s, 0.5 ulp | 25.5 gso/s, 0.6 ulp |
|
|
511
|
+
| `nk_euclideans_symmetric_u4_serial` | 8.24 gso/s, 0.5 ulp | 8.42 gso/s, 0.5 ulp | 8.54 gso/s, 0.6 ulp |
|
|
512
|
+
| `nk_angulars_packed_u4_neonsdot` | 230 gso/s, 0.3 ulp | 302 gso/s, 0.3 ulp | 328 gso/s, 0.3 ulp |
|
|
513
|
+
| `nk_angulars_symmetric_u4_neonsdot` | 98.1 gso/s, 0.3 ulp | 134 gso/s, 0.3 ulp | 156 gso/s, 0.3 ulp |
|
|
514
|
+
| `nk_euclideans_packed_u4_neonsdot` | 243 gso/s, 0.0 ulp | 311 gso/s, 0.0 ulp | 353 gso/s, 0.0 ulp |
|
|
515
|
+
| `nk_euclideans_symmetric_u4_neonsdot` | 101 gso/s, 0.0 ulp | 135 gso/s, 0.0 ulp | 158 gso/s, 0.0 ulp |
|
|
516
|
+
| `nk_angulars_packed_u4_sme` | 243 gso/s, 0.324 ulp | 242 gso/s, 0.324 ulp | 263 gso/s, 0.324 ulp |
|
|
517
|
+
| `nk_angulars_symmetric_u4_sme` | 300 gso/s, 0.323 ulp | 307 gso/s, 0.323 ulp | 278 gso/s, 0.323 ulp |
|
|
518
|
+
| `nk_euclideans_packed_u4_sme` | 307 gso/s, 0 ulp | 287 gso/s, 0 ulp | 287 gso/s, 0 ulp |
|
|
519
|
+
| `nk_euclideans_symmetric_u4_sme` | 283 gso/s, 0 ulp | 292 gso/s, 0 ulp | 271 gso/s, 0 ulp |
|
|
520
|
+
|
|
521
|
+
#### WASM
|
|
522
|
+
|
|
523
|
+
Measured with Wasmtime v42 (Cranelift backend).
|
|
524
|
+
|
|
525
|
+
| Kernel | 256³ | 1024³ | 4096³ |
|
|
526
|
+
| :----------------------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
527
|
+
| __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
528
|
+
| `nk_angulars_packed_f64_serial` | 2.04 gso/s, 0 ulp | 5.33 gso/s, 0 ulp | 5.36 gso/s, 0 ulp |
|
|
529
|
+
| `nk_angulars_symmetric_f64_serial` | 2.10 gso/s, 0 ulp | 5.54 gso/s, 0 ulp | 11.2 gso/s, 0 ulp |
|
|
530
|
+
| `nk_euclideans_packed_f64_serial` | 3.88 gso/s, 0.4 ulp | 5.38 gso/s, 0.4 ulp | 5.61 gso/s, 0.4 ulp |
|
|
531
|
+
| `nk_euclideans_symmetric_f64_serial` | 5.20 gso/s, 0.4 ulp | 5.54 gso/s, 0.4 ulp | 11.2 gso/s, 0.4 ulp |
|
|
532
|
+
| `nk_angulars_packed_f64_v128relaxed` | 30.1 gso/s, 0.1 ulp | 36.2 gso/s, 0.1 ulp | 37.1 gso/s, 0.1 ulp |
|
|
533
|
+
| `nk_angulars_symmetric_f64_v128relaxed` | 9.59 gso/s, 0.1 ulp | 10.7 gso/s, 0.1 ulp | 20.9 gso/s, 0.1 ulp |
|
|
534
|
+
| `nk_euclideans_packed_f64_v128relaxed` | 30.4 gso/s, 0.4 ulp | 36.3 gso/s, 0.4 ulp | 37.1 gso/s, 0.4 ulp |
|
|
535
|
+
| `nk_euclideans_symmetric_f64_v128relaxed` | 9.58 gso/s, 0.4 ulp | 10.7 gso/s, 0.4 ulp | 20.9 gso/s, 0.4 ulp |
|
|
536
|
+
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
537
|
+
| `nk_angulars_packed_f32_serial` | 8.31 gso/s, 0.1 ulp | 23.8 gso/s, 0.1 ulp | 26.3 gso/s, 0.1 ulp |
|
|
538
|
+
| `nk_angulars_symmetric_f32_serial` | 6.06 gso/s, 0.1 ulp | 17.7 gso/s, 0.1 ulp | 35.0 gso/s, 0.1 ulp |
|
|
539
|
+
| `nk_euclideans_packed_f32_serial` | 8.71 gso/s, 0.3 ulp | 25.7 gso/s, 0.3 ulp | 26.3 gso/s, 0.3 ulp |
|
|
540
|
+
| `nk_euclideans_symmetric_f32_serial` | 6.15 gso/s, 0.3 ulp | 17.7 gso/s, 0.3 ulp | 34.9 gso/s, 0.3 ulp |
|
|
541
|
+
| `nk_angulars_packed_f32_v128relaxed` | 56.7 gso/s, 0.1 ulp | 66.9 gso/s, 0.1 ulp | 65.8 gso/s, 0.1 ulp |
|
|
542
|
+
| `nk_angulars_symmetric_f32_v128relaxed` | 18.5 gso/s, 0.1 ulp | 20.9 gso/s, 0.1 ulp | 39.0 gso/s, 0.1 ulp |
|
|
543
|
+
| `nk_euclideans_packed_f32_v128relaxed` | 57.2 gso/s, 0.2 ulp | 67.1 gso/s, 0.2 ulp | 65.8 gso/s, 0.2 ulp |
|
|
544
|
+
| `nk_euclideans_symmetric_f32_v128relaxed` | 18.5 gso/s, 0.2 ulp | 20.9 gso/s, 0.2 ulp | 39.0 gso/s, 0.2 ulp |
|
|
545
|
+
| __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
546
|
+
| `nk_angulars_packed_bf16_serial` | 6.43 gso/s, 0.3 ulp | 18.6 gso/s, 0.3 ulp | 20.5 gso/s, 0.3 ulp |
|
|
547
|
+
| `nk_angulars_symmetric_bf16_serial` | 7.55 gso/s, 0.3 ulp | 25.4 gso/s, 0.3 ulp | 52.0 gso/s, 0.3 ulp |
|
|
548
|
+
| `nk_euclideans_packed_bf16_serial` | 6.43 gso/s, 5.3 ulp | 18.6 gso/s, 5.3 ulp | 20.6 gso/s, 5.3 ulp |
|
|
549
|
+
| `nk_euclideans_symmetric_bf16_serial` | 7.27 gso/s, 5.3 ulp | 18.4 gso/s, 5.3 ulp | 52.0 gso/s, 5.3 ulp |
|
|
550
|
+
| `nk_angulars_packed_bf16_v128relaxed` | 52.9 gso/s, 0.3 ulp | 57.2 gso/s, 0.3 ulp | 57.0 gso/s, 0.3 ulp |
|
|
551
|
+
| `nk_angulars_symmetric_bf16_v128relaxed` | 16.5 gso/s, 0.3 ulp | 17.8 gso/s, 0.3 ulp | 34.8 gso/s, 0.3 ulp |
|
|
552
|
+
| `nk_euclideans_packed_bf16_v128relaxed` | 53.3 gso/s, 5.3 ulp | 57.3 gso/s, 5.3 ulp | 56.7 gso/s, 5.3 ulp |
|
|
553
|
+
| `nk_euclideans_symmetric_bf16_v128relaxed` | 16.6 gso/s, 5.3 ulp | 17.8 gso/s, 5.3 ulp | 34.7 gso/s, 5.3 ulp |
|
|
554
|
+
| __e2m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
555
|
+
| `nk_angulars_packed_e2m3_serial` | 2.04 gso/s, 0 ulp | 5.36 gso/s, 0 ulp | 5.62 gso/s, 0 ulp |
|
|
556
|
+
| `nk_angulars_symmetric_e2m3_serial` | 3.00 gso/s, 0 ulp | 7.21 gso/s, 0 ulp | 15.2 gso/s, 0 ulp |
|
|
557
|
+
| `nk_euclideans_packed_e2m3_serial` | 3.58 gso/s, 0.3 ulp | 5.40 gso/s, 0.3 ulp | 5.64 gso/s, 0.3 ulp |
|
|
558
|
+
| `nk_euclideans_symmetric_e2m3_serial` | 6.64 gso/s, 0.3 ulp | 7.44 gso/s, 0.3 ulp | 15.3 gso/s, 0.3 ulp |
|
|
559
|
+
| `nk_angulars_packed_e2m3_v128relaxed` | 34.5 gso/s, 0 ulp | 37.8 gso/s, 0 ulp | 35.7 gso/s, 0 ulp |
|
|
560
|
+
| `nk_angulars_symmetric_e2m3_v128relaxed` | 31.3 gso/s, 0 ulp | 36.8 gso/s, 0 ulp | 72.5 gso/s, 0 ulp |
|
|
561
|
+
| `nk_euclideans_packed_e2m3_v128relaxed` | 34.3 gso/s, 0 ulp | 37.8 gso/s, 0 ulp | 35.9 gso/s, 0 ulp |
|
|
562
|
+
| `nk_euclideans_symmetric_e2m3_v128relaxed` | 31.7 gso/s, 0 ulp | 36.8 gso/s, 0 ulp | 72.5 gso/s, 0 ulp |
|
|
563
|
+
| __i8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
564
|
+
| `nk_angulars_packed_i8_serial` | 5.12 gso/s, 0 ulp | 13.0 gso/s, 0 ulp | 13.7 gso/s, 0 ulp |
|
|
565
|
+
| `nk_angulars_symmetric_i8_serial` | 6.54 gso/s, 0 ulp | 17.0 gso/s, 0 ulp | 36.5 gso/s, 0 ulp |
|
|
566
|
+
| `nk_euclideans_packed_i8_serial` | 12.2 gso/s, 0.5 ulp | 13.0 gso/s, 0.5 ulp | 13.7 gso/s, 0.5 ulp |
|
|
567
|
+
| `nk_euclideans_symmetric_i8_serial` | 13.7 gso/s, 0.5 ulp | 17.4 gso/s, 0.5 ulp | 36.5 gso/s, 0.5 ulp |
|
|
568
|
+
| `nk_angulars_packed_i8_v128relaxed` | 45.5 gso/s, 0 ulp | 51.1 gso/s, 0 ulp | 50.6 gso/s, 0 ulp |
|
|
569
|
+
| `nk_angulars_symmetric_i8_v128relaxed` | 41.3 gso/s, 0 ulp | 49.9 gso/s, 0 ulp | 102 gso/s, 0 ulp |
|
|
570
|
+
| `nk_euclideans_packed_i8_v128relaxed` | 46.0 gso/s, 0 ulp | 51.3 gso/s, 0 ulp | 50.7 gso/s, 0 ulp |
|
|
571
|
+
| `nk_euclideans_symmetric_i8_v128relaxed` | 41.5 gso/s, 0 ulp | 50.0 gso/s, 0 ulp | 102 gso/s, 0 ulp |
|
|
572
|
+
| __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
573
|
+
| `nk_angulars_packed_u8_serial` | 5.60 gso/s, 0.3 ulp | 13.4 gso/s, 0.3 ulp | 14.1 gso/s, 0.3 ulp |
|
|
574
|
+
| `nk_angulars_symmetric_u8_serial` | 7.91 gso/s, 0.3 ulp | 17.1 gso/s, 0.3 ulp | 36.9 gso/s, 0.3 ulp |
|
|
575
|
+
| `nk_euclideans_packed_u8_serial` | 12.5 gso/s, 0.4 ulp | 13.4 gso/s, 0.4 ulp | 14.1 gso/s, 0.4 ulp |
|
|
576
|
+
| `nk_euclideans_symmetric_u8_serial` | 13.7 gso/s, 0.4 ulp | 17.6 gso/s, 0.4 ulp | 36.9 gso/s, 0.4 ulp |
|
|
577
|
+
| `nk_angulars_packed_u8_v128relaxed` | 74.7 gso/s, 0.3 ulp | 86.1 gso/s, 0.3 ulp | 83.3 gso/s, 0.3 ulp |
|
|
578
|
+
| `nk_angulars_symmetric_u8_v128relaxed` | 54.1 gso/s, 0.3 ulp | 76.1 gso/s, 0.3 ulp | 160 gso/s, 0.3 ulp |
|
|
579
|
+
| `nk_euclideans_packed_u8_v128relaxed` | 75.6 gso/s, 0 ulp | 86.4 gso/s, 0 ulp | 84.2 gso/s, 0 ulp |
|
|
580
|
+
| `nk_euclideans_symmetric_u8_v128relaxed` | 54.5 gso/s, 0 ulp | 76.2 gso/s, 0 ulp | 161 gso/s, 0 ulp |
|