numkong 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +495 -0
- package/binding.gyp +540 -0
- package/c/dispatch.h +512 -0
- package/c/dispatch_bf16.c +389 -0
- package/c/dispatch_bf16c.c +52 -0
- package/c/dispatch_e2m3.c +263 -0
- package/c/dispatch_e3m2.c +243 -0
- package/c/dispatch_e4m3.c +276 -0
- package/c/dispatch_e5m2.c +272 -0
- package/c/dispatch_f16.c +376 -0
- package/c/dispatch_f16c.c +58 -0
- package/c/dispatch_f32.c +378 -0
- package/c/dispatch_f32c.c +99 -0
- package/c/dispatch_f64.c +296 -0
- package/c/dispatch_f64c.c +98 -0
- package/c/dispatch_i16.c +96 -0
- package/c/dispatch_i32.c +89 -0
- package/c/dispatch_i4.c +150 -0
- package/c/dispatch_i64.c +86 -0
- package/c/dispatch_i8.c +289 -0
- package/c/dispatch_other.c +330 -0
- package/c/dispatch_u1.c +148 -0
- package/c/dispatch_u16.c +124 -0
- package/c/dispatch_u32.c +118 -0
- package/c/dispatch_u4.c +150 -0
- package/c/dispatch_u64.c +102 -0
- package/c/dispatch_u8.c +303 -0
- package/c/numkong.c +950 -0
- package/include/README.md +573 -0
- package/include/module.modulemap +129 -0
- package/include/numkong/attention/sapphireamx.h +1361 -0
- package/include/numkong/attention/sme.h +2066 -0
- package/include/numkong/attention.h +49 -0
- package/include/numkong/capabilities.h +748 -0
- package/include/numkong/cast/README.md +262 -0
- package/include/numkong/cast/haswell.h +975 -0
- package/include/numkong/cast/icelake.h +470 -0
- package/include/numkong/cast/neon.h +1192 -0
- package/include/numkong/cast/rvv.h +1021 -0
- package/include/numkong/cast/sapphire.h +262 -0
- package/include/numkong/cast/serial.h +2262 -0
- package/include/numkong/cast/skylake.h +856 -0
- package/include/numkong/cast/v128relaxed.h +180 -0
- package/include/numkong/cast.h +230 -0
- package/include/numkong/curved/README.md +223 -0
- package/include/numkong/curved/genoa.h +182 -0
- package/include/numkong/curved/haswell.h +276 -0
- package/include/numkong/curved/neon.h +205 -0
- package/include/numkong/curved/neonbfdot.h +212 -0
- package/include/numkong/curved/neonhalf.h +212 -0
- package/include/numkong/curved/rvv.h +305 -0
- package/include/numkong/curved/serial.h +207 -0
- package/include/numkong/curved/skylake.h +457 -0
- package/include/numkong/curved/smef64.h +506 -0
- package/include/numkong/curved.h +517 -0
- package/include/numkong/curved.hpp +144 -0
- package/include/numkong/dot/README.md +425 -0
- package/include/numkong/dot/alder.h +563 -0
- package/include/numkong/dot/genoa.h +315 -0
- package/include/numkong/dot/haswell.h +1688 -0
- package/include/numkong/dot/icelake.h +883 -0
- package/include/numkong/dot/neon.h +818 -0
- package/include/numkong/dot/neonbfdot.h +244 -0
- package/include/numkong/dot/neonfhm.h +360 -0
- package/include/numkong/dot/neonhalf.h +198 -0
- package/include/numkong/dot/neonsdot.h +508 -0
- package/include/numkong/dot/rvv.h +714 -0
- package/include/numkong/dot/rvvbb.h +72 -0
- package/include/numkong/dot/rvvbf16.h +123 -0
- package/include/numkong/dot/rvvhalf.h +129 -0
- package/include/numkong/dot/sapphire.h +141 -0
- package/include/numkong/dot/serial.h +838 -0
- package/include/numkong/dot/sierra.h +405 -0
- package/include/numkong/dot/skylake.h +1084 -0
- package/include/numkong/dot/sve.h +379 -0
- package/include/numkong/dot/svebfdot.h +74 -0
- package/include/numkong/dot/svehalf.h +123 -0
- package/include/numkong/dot/v128relaxed.h +1258 -0
- package/include/numkong/dot.h +1070 -0
- package/include/numkong/dot.hpp +94 -0
- package/include/numkong/dots/README.md +496 -0
- package/include/numkong/dots/alder.h +114 -0
- package/include/numkong/dots/genoa.h +94 -0
- package/include/numkong/dots/haswell.h +295 -0
- package/include/numkong/dots/icelake.h +171 -0
- package/include/numkong/dots/neon.h +120 -0
- package/include/numkong/dots/neonbfdot.h +58 -0
- package/include/numkong/dots/neonfhm.h +94 -0
- package/include/numkong/dots/neonhalf.h +57 -0
- package/include/numkong/dots/neonsdot.h +108 -0
- package/include/numkong/dots/rvv.h +2486 -0
- package/include/numkong/dots/sapphireamx.h +3973 -0
- package/include/numkong/dots/serial.h +2844 -0
- package/include/numkong/dots/sierra.h +97 -0
- package/include/numkong/dots/skylake.h +196 -0
- package/include/numkong/dots/sme.h +5372 -0
- package/include/numkong/dots/smebi32.h +461 -0
- package/include/numkong/dots/smef64.h +1318 -0
- package/include/numkong/dots/smehalf.h +47 -0
- package/include/numkong/dots/v128relaxed.h +294 -0
- package/include/numkong/dots.h +2804 -0
- package/include/numkong/dots.hpp +639 -0
- package/include/numkong/each/README.md +469 -0
- package/include/numkong/each/haswell.h +1658 -0
- package/include/numkong/each/icelake.h +272 -0
- package/include/numkong/each/neon.h +1104 -0
- package/include/numkong/each/neonbfdot.h +212 -0
- package/include/numkong/each/neonhalf.h +410 -0
- package/include/numkong/each/rvv.h +1121 -0
- package/include/numkong/each/sapphire.h +477 -0
- package/include/numkong/each/serial.h +260 -0
- package/include/numkong/each/skylake.h +1562 -0
- package/include/numkong/each.h +2146 -0
- package/include/numkong/each.hpp +434 -0
- package/include/numkong/geospatial/README.md +147 -0
- package/include/numkong/geospatial/haswell.h +593 -0
- package/include/numkong/geospatial/neon.h +571 -0
- package/include/numkong/geospatial/rvv.h +701 -0
- package/include/numkong/geospatial/serial.h +309 -0
- package/include/numkong/geospatial/skylake.h +577 -0
- package/include/numkong/geospatial/v128relaxed.h +613 -0
- package/include/numkong/geospatial.h +453 -0
- package/include/numkong/geospatial.hpp +235 -0
- package/include/numkong/matrix.hpp +336 -0
- package/include/numkong/maxsim/README.md +187 -0
- package/include/numkong/maxsim/alder.h +511 -0
- package/include/numkong/maxsim/genoa.h +115 -0
- package/include/numkong/maxsim/haswell.h +553 -0
- package/include/numkong/maxsim/icelake.h +480 -0
- package/include/numkong/maxsim/neonsdot.h +394 -0
- package/include/numkong/maxsim/sapphireamx.h +877 -0
- package/include/numkong/maxsim/serial.h +490 -0
- package/include/numkong/maxsim/sme.h +929 -0
- package/include/numkong/maxsim/v128relaxed.h +280 -0
- package/include/numkong/maxsim.h +571 -0
- package/include/numkong/maxsim.hpp +133 -0
- package/include/numkong/mesh/README.md +227 -0
- package/include/numkong/mesh/haswell.h +2235 -0
- package/include/numkong/mesh/neon.h +1329 -0
- package/include/numkong/mesh/neonbfdot.h +842 -0
- package/include/numkong/mesh/neonhalf.h +616 -0
- package/include/numkong/mesh/rvv.h +916 -0
- package/include/numkong/mesh/serial.h +742 -0
- package/include/numkong/mesh/skylake.h +1135 -0
- package/include/numkong/mesh/v128relaxed.h +1052 -0
- package/include/numkong/mesh.h +652 -0
- package/include/numkong/mesh.hpp +762 -0
- package/include/numkong/numkong.h +78 -0
- package/include/numkong/numkong.hpp +57 -0
- package/include/numkong/probability/README.md +173 -0
- package/include/numkong/probability/haswell.h +267 -0
- package/include/numkong/probability/neon.h +225 -0
- package/include/numkong/probability/rvv.h +409 -0
- package/include/numkong/probability/serial.h +169 -0
- package/include/numkong/probability/skylake.h +324 -0
- package/include/numkong/probability.h +383 -0
- package/include/numkong/probability.hpp +120 -0
- package/include/numkong/random.h +50 -0
- package/include/numkong/random.hpp +285 -0
- package/include/numkong/reduce/README.md +547 -0
- package/include/numkong/reduce/alder.h +632 -0
- package/include/numkong/reduce/genoa.h +201 -0
- package/include/numkong/reduce/haswell.h +3783 -0
- package/include/numkong/reduce/icelake.h +549 -0
- package/include/numkong/reduce/neon.h +3841 -0
- package/include/numkong/reduce/neonbfdot.h +353 -0
- package/include/numkong/reduce/neonfhm.h +665 -0
- package/include/numkong/reduce/neonhalf.h +157 -0
- package/include/numkong/reduce/neonsdot.h +357 -0
- package/include/numkong/reduce/rvv.h +3407 -0
- package/include/numkong/reduce/serial.h +757 -0
- package/include/numkong/reduce/sierra.h +338 -0
- package/include/numkong/reduce/skylake.h +3792 -0
- package/include/numkong/reduce/v128relaxed.h +2302 -0
- package/include/numkong/reduce.h +1597 -0
- package/include/numkong/reduce.hpp +633 -0
- package/include/numkong/scalar/README.md +89 -0
- package/include/numkong/scalar/haswell.h +113 -0
- package/include/numkong/scalar/neon.h +122 -0
- package/include/numkong/scalar/neonhalf.h +70 -0
- package/include/numkong/scalar/rvv.h +211 -0
- package/include/numkong/scalar/sapphire.h +63 -0
- package/include/numkong/scalar/serial.h +332 -0
- package/include/numkong/scalar/v128relaxed.h +56 -0
- package/include/numkong/scalar.h +683 -0
- package/include/numkong/set/README.md +179 -0
- package/include/numkong/set/haswell.h +334 -0
- package/include/numkong/set/icelake.h +485 -0
- package/include/numkong/set/neon.h +364 -0
- package/include/numkong/set/rvv.h +226 -0
- package/include/numkong/set/rvvbb.h +117 -0
- package/include/numkong/set/serial.h +174 -0
- package/include/numkong/set/sve.h +185 -0
- package/include/numkong/set/v128relaxed.h +240 -0
- package/include/numkong/set.h +457 -0
- package/include/numkong/set.hpp +114 -0
- package/include/numkong/sets/README.md +149 -0
- package/include/numkong/sets/haswell.h +63 -0
- package/include/numkong/sets/icelake.h +66 -0
- package/include/numkong/sets/neon.h +61 -0
- package/include/numkong/sets/serial.h +43 -0
- package/include/numkong/sets/smebi32.h +1099 -0
- package/include/numkong/sets/v128relaxed.h +58 -0
- package/include/numkong/sets.h +339 -0
- package/include/numkong/sparse/README.md +156 -0
- package/include/numkong/sparse/icelake.h +463 -0
- package/include/numkong/sparse/neon.h +288 -0
- package/include/numkong/sparse/serial.h +117 -0
- package/include/numkong/sparse/sve2.h +507 -0
- package/include/numkong/sparse/turin.h +322 -0
- package/include/numkong/sparse.h +363 -0
- package/include/numkong/sparse.hpp +113 -0
- package/include/numkong/spatial/README.md +435 -0
- package/include/numkong/spatial/alder.h +607 -0
- package/include/numkong/spatial/genoa.h +290 -0
- package/include/numkong/spatial/haswell.h +960 -0
- package/include/numkong/spatial/icelake.h +586 -0
- package/include/numkong/spatial/neon.h +773 -0
- package/include/numkong/spatial/neonbfdot.h +165 -0
- package/include/numkong/spatial/neonhalf.h +118 -0
- package/include/numkong/spatial/neonsdot.h +261 -0
- package/include/numkong/spatial/rvv.h +984 -0
- package/include/numkong/spatial/rvvbf16.h +123 -0
- package/include/numkong/spatial/rvvhalf.h +117 -0
- package/include/numkong/spatial/sapphire.h +343 -0
- package/include/numkong/spatial/serial.h +346 -0
- package/include/numkong/spatial/sierra.h +323 -0
- package/include/numkong/spatial/skylake.h +606 -0
- package/include/numkong/spatial/sve.h +224 -0
- package/include/numkong/spatial/svebfdot.h +122 -0
- package/include/numkong/spatial/svehalf.h +109 -0
- package/include/numkong/spatial/v128relaxed.h +717 -0
- package/include/numkong/spatial.h +1425 -0
- package/include/numkong/spatial.hpp +183 -0
- package/include/numkong/spatials/README.md +580 -0
- package/include/numkong/spatials/alder.h +94 -0
- package/include/numkong/spatials/genoa.h +94 -0
- package/include/numkong/spatials/haswell.h +219 -0
- package/include/numkong/spatials/icelake.h +113 -0
- package/include/numkong/spatials/neon.h +109 -0
- package/include/numkong/spatials/neonbfdot.h +60 -0
- package/include/numkong/spatials/neonfhm.h +92 -0
- package/include/numkong/spatials/neonhalf.h +58 -0
- package/include/numkong/spatials/neonsdot.h +109 -0
- package/include/numkong/spatials/rvv.h +1960 -0
- package/include/numkong/spatials/sapphireamx.h +1149 -0
- package/include/numkong/spatials/serial.h +226 -0
- package/include/numkong/spatials/sierra.h +96 -0
- package/include/numkong/spatials/skylake.h +184 -0
- package/include/numkong/spatials/sme.h +1901 -0
- package/include/numkong/spatials/smef64.h +465 -0
- package/include/numkong/spatials/v128relaxed.h +240 -0
- package/include/numkong/spatials.h +3021 -0
- package/include/numkong/spatials.hpp +508 -0
- package/include/numkong/tensor.hpp +1592 -0
- package/include/numkong/trigonometry/README.md +184 -0
- package/include/numkong/trigonometry/haswell.h +652 -0
- package/include/numkong/trigonometry/neon.h +639 -0
- package/include/numkong/trigonometry/rvv.h +699 -0
- package/include/numkong/trigonometry/serial.h +703 -0
- package/include/numkong/trigonometry/skylake.h +721 -0
- package/include/numkong/trigonometry/v128relaxed.h +666 -0
- package/include/numkong/trigonometry.h +467 -0
- package/include/numkong/trigonometry.hpp +166 -0
- package/include/numkong/types.h +1384 -0
- package/include/numkong/types.hpp +5603 -0
- package/include/numkong/vector.hpp +698 -0
- package/javascript/README.md +246 -0
- package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
- package/javascript/dist/cjs/numkong-wasm.js +617 -0
- package/javascript/dist/cjs/numkong.d.ts +343 -0
- package/javascript/dist/cjs/numkong.js +523 -0
- package/javascript/dist/cjs/package.json +3 -0
- package/javascript/dist/cjs/types.d.ts +284 -0
- package/javascript/dist/cjs/types.js +653 -0
- package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
- package/javascript/dist/esm/numkong-wasm.js +595 -0
- package/javascript/dist/esm/numkong.d.ts +343 -0
- package/javascript/dist/esm/numkong.js +452 -0
- package/javascript/dist/esm/package.json +3 -0
- package/javascript/dist/esm/types.d.ts +284 -0
- package/javascript/dist/esm/types.js +630 -0
- package/javascript/dist-package-cjs.json +3 -0
- package/javascript/dist-package-esm.json +3 -0
- package/javascript/node-gyp-build.d.ts +1 -0
- package/javascript/numkong-wasm.ts +756 -0
- package/javascript/numkong.c +689 -0
- package/javascript/numkong.ts +575 -0
- package/javascript/tsconfig-base.json +39 -0
- package/javascript/tsconfig-cjs.json +8 -0
- package/javascript/tsconfig-esm.json +8 -0
- package/javascript/types.ts +674 -0
- package/package.json +87 -0
|
@@ -0,0 +1,547 @@
|
|
|
1
|
+
# Horizontal Reductions in NumKong
|
|
2
|
+
|
|
3
|
+
NumKong implements single-pass horizontal reductions over dense vectors: statistical moments (sum + sum-of-squares) and extrema (min + max with argmin + argmax).
|
|
4
|
+
Both reductions traverse the input once, producing scalar outputs with compensated arithmetic for numerical stability.
|
|
5
|
+
This is the only NumKong module with full stride support — `stride_bytes` controls the byte distance between consecutive logical elements, enabling column extraction from row-major matrices and strided array views without copying.
|
|
6
|
+
Used internally by packing routines for norm precomputation and by distance kernels for normalization.
|
|
7
|
+
|
|
8
|
+
Moments:
|
|
9
|
+
|
|
10
|
+
```math
|
|
11
|
+
\text{sum} = \sum a_i, \quad \text{sumsq} = \sum a_i^2
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
Min-max:
|
|
15
|
+
|
|
16
|
+
```math
|
|
17
|
+
\text{min} = \min_i a_i, \quad \text{argmin} = \arg\min_i a_i, \quad \text{max} = \max_i a_i, \quad \text{argmax} = \arg\max_i a_i
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
Reformulating as Python pseudocode:
|
|
21
|
+
|
|
22
|
+
```python
|
|
23
|
+
import numpy as np
|
|
24
|
+
|
|
25
|
+
def moments(a: np.ndarray) -> tuple[float, float]:
|
|
26
|
+
    return np.sum(a), np.sum(a ** 2)
|
|
27
|
+
|
|
28
|
+
def minmax(a: np.ndarray) -> tuple[float, int, float, int]:
|
|
29
|
+
    return np.min(a), np.argmin(a), np.max(a), np.argmax(a)
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Input & Output Types
|
|
33
|
+
|
|
34
|
+
Float reductions:
|
|
35
|
+
|
|
36
|
+
| Input Type | Output Type | Description |
|
|
37
|
+
| ---------- | ----------- | ------------------------------------- |
|
|
38
|
+
| `f64` | `f64` | 64-bit double precision |
|
|
39
|
+
| `f32` | `f32` | 32-bit single precision |
|
|
40
|
+
| `f16` | `f32` | 16-bit half precision, widened output |
|
|
41
|
+
| `bf16` | `f32` | 16-bit brain float, widened output |
|
|
42
|
+
|
|
43
|
+
Mini-float reductions:
|
|
44
|
+
|
|
45
|
+
| Input Type | Output Type | Description |
|
|
46
|
+
| ---------- | ----------- | -------------------------------------------- |
|
|
47
|
+
| `e4m3` | `f32` | 8-bit Float8: 4 exponent, 3 mantissa bits |
|
|
48
|
+
| `e5m2` | `f32` | 8-bit Float8: 5 exponent, 2 mantissa bits |
|
|
49
|
+
| `e2m3` | `f32` | 8-bit MX format: 2 exponent, 3 mantissa bits |
|
|
50
|
+
| `e3m2` | `f32` | 8-bit MX format: 3 exponent, 2 mantissa bits |
|
|
51
|
+
|
|
52
|
+
Integer reductions:
|
|
53
|
+
|
|
54
|
+
| Input Type | Output Type | Description |
|
|
55
|
+
| ---------- | ----------- | ---------------------------------- |
|
|
56
|
+
| `i8` | `i64` | 8-bit signed, widened to 64-bit |
|
|
57
|
+
| `u8` | `u64` | 8-bit unsigned, widened to 64-bit |
|
|
58
|
+
| `i16` | `i64` | 16-bit signed, widened to 64-bit |
|
|
59
|
+
| `u16` | `u64` | 16-bit unsigned, widened to 64-bit |
|
|
60
|
+
| `i32` | `i64` | 32-bit signed, widened to 64-bit |
|
|
61
|
+
| `u32` | `u64` | 32-bit unsigned, widened to 64-bit |
|
|
62
|
+
| `i64` | `i64` | 64-bit signed |
|
|
63
|
+
| `u64` | `u64` | 64-bit unsigned |
|
|
64
|
+
|
|
65
|
+
Sub-byte reductions:
|
|
66
|
+
|
|
67
|
+
| Input Type | Output Type | Description |
|
|
68
|
+
| ---------- | ----------- | ----------------------------------------- |
|
|
69
|
+
| `i4` | `i64` | 4-bit signed nibbles, widened to 64-bit |
|
|
70
|
+
| `u4` | `u64` | 4-bit unsigned nibbles, widened to 64-bit |
|
|
71
|
+
| `u1` | `u64` | 1-bit binary packed octets |
|
|
72
|
+
|
|
73
|
+
## Optimizations
|
|
74
|
+
|
|
75
|
+
### Strided Access Across Backends
|
|
76
|
+
|
|
77
|
+
Reductions accept a `stride_bytes` parameter specifying the byte distance between consecutive logical elements — the only NumKong module where loads far outnumber stores (N loads, 2-4 scalar stores), making arbitrary strides practical.
|
|
78
|
+
Serial iterates with byte-pointer arithmetic: `ptr += stride_bytes` per element.
|
|
79
|
+
NEON uses hardware de-interleaving loads (`vld2q_f32`, `vld3q_f32`, `vld4q_f32`) for small integer strides (2-4 elements apart), extracting column 0 from interleaved data in a single instruction.
|
|
80
|
+
Haswell/Skylake use blend masks for small strides and `_mm256_i32gather_ps` / `_mm512_i32gather_pd` hardware gathers for larger strides — 8cy per gather on Haswell, ~5cy on Skylake for 16-element gathers.
|
|
81
|
+
RVV uses native strided loads (`__riscv_vlse32_v_f32m1`) that accept arbitrary byte strides directly in the load instruction — no gather overhead, no stride-dependent branching.
|
|
82
|
+
|
|
83
|
+
### Kahan-Neumaier Compensated Summation
|
|
84
|
+
|
|
85
|
+
`nk_reduce_moments_f32_serial` and `nk_reduce_moments_f32_haswell` use Neumaier's variant of Kahan summation — maintaining a running compensation term that captures rounding errors.
|
|
86
|
+
Naive sequential summation accumulates $O(\sqrt{n})$ ULP error on average for $n$ elements; Neumaier compensation bounds error to $O(1)$ ULP regardless of vector length.
|
|
87
|
+
The serial path uses Neumaier's adaptive branch: `if (abs(sum) >= abs(val))` selects the larger summand first, minimizing relative error in the compensation term.
|
|
88
|
+
SIMD backends (`nk_reduce_moments_f32_haswell`) carry 8 independent compensation lanes in a YMM register — computing `round_error = tentative - sum; correction = (sum - (tentative - round_error)) + (val - round_error)` without branches, folding all lanes into a single scalar correction at the end.
|
|
89
|
+
|
|
90
|
+
### Fused Moments in a Single Pass
|
|
91
|
+
|
|
92
|
+
`nk_reduce_moments_f32_haswell`, `nk_reduce_moments_f64_skylake` compute sum and sum-of-squares simultaneously — one load feeds both a `VADDPS` (sum accumulator) and a `VFMADD231PS` (square accumulator).
|
|
93
|
+
Two accumulators share the same loaded data, halving memory bandwidth compared to separate sum + norm passes.
|
|
94
|
+
The squared-norm $\|a\|^2 = \sum a_i^2$ is a self-dot-product, reused by packing routines (`nk_dots_pack_f32_haswell`) to precompute per-vector norms during layout transformation.
|
|
95
|
+
For Float16/BFloat16/Float8 inputs, all backends widen to Float32 before accumulation — NEON FHM (`nk_reduce_moments_e4m3_neonfhm`) converts e4m3->f16 via lookup, then uses `vfmlalq_low_f16` to fuse the Float16 → Float32 widening with the FMA into the Float32 accumulator.
|
|
96
|
+
|
|
97
|
+
### Integer Saturation in Sum-of-Squares
|
|
98
|
+
|
|
99
|
+
Integer moments accumulate sums in the widest available type: Int8/UInt8/Int16 inputs produce Int64/UInt64 outputs.
|
|
100
|
+
Sums use widening addition chains — NEON uses pairwise widening (`vpaddlq_s16` -> Int32 -> Int64 stages); Haswell biases Int8 inputs with 0x80 and uses unsigned SAD (`_mm256_sad_epu8`) for the sum, correcting by subtracting $128 \times \text{count}$ at the end.
|
|
101
|
+
Sum-of-squares can overflow UInt64 when squaring large Int64 values — backends use an explicit saturating multiply that checks whether `abs(val) < 2^32` (so the square fits in UInt64) and otherwise saturates to I64_MAX.
|
|
102
|
+
Haswell emulates UInt64 saturating add via XOR-based unsigned comparison: flip sign bits to convert unsigned overflow detection into a signed comparison, then OR with the overflow mask to produce all-ones on saturation.
|
|
103
|
+
|
|
104
|
+
### Recursive Blocking for Counter Overflow
|
|
105
|
+
|
|
106
|
+
All SIMD backends use loop iteration counters narrower than `nk_size_t` to reduce register pressure — UInt8 for Int8 minmax lanes, UInt16 for Float32 moments lanes.
|
|
107
|
+
When `count` exceeds the counter's range × lane count (e.g., Haswell Float32: $256 \times 8 = 2048$ elements for UInt8 counters), the reduction splits recursively: process the left half, process the right half, combine results with saturating arithmetic.
|
|
108
|
+
Block caps vary by backend and element width: Haswell Int8 minmax uses UInt8 loop counters (cap = $256 \times 32 = 8192$); Skylake Float32 moments uses UInt16 counters (cap = $65536 \times 16 = 1048576$).
|
|
109
|
+
The recursive split is invisible to the caller — the public API accepts arbitrary `count` values; internal dispatch chooses between single-pass and recursive based on the cap.
|
|
110
|
+
|
|
111
|
+
### Index Tracking at Different Register Scales
|
|
112
|
+
|
|
113
|
+
Argmin/argmax requires tracking both values and their positions — but indices need wider storage than values (UInt64 for arbitrary-length vectors, vs UInt8/UInt16/Float32 for data).
|
|
114
|
+
Haswell Int8 minmax tracks iteration counters in UInt8 lanes (same width as data) — after the loop, the winning lane's counter is multiplied by the lane count and added to the lane index within the register to reconstruct the global position.
|
|
115
|
+
RVV uses u64m2 registers (LMUL=2) for indices alongside f32m1 for values — the wider index register holds one 64-bit position per Float32 lane, enabling direct merge without post-loop reconstruction.
|
|
116
|
+
NEON uses same-width counters (u8x16 for i8x16 minmax), limiting block size to $256 \times 16 = 4096$ elements before recursive splitting.
|
|
117
|
+
|
|
118
|
+
### NaN-Aware Extrema Tracking
|
|
119
|
+
|
|
120
|
+
`nk_reduce_minmax_f32_haswell`, `nk_reduce_minmax_f64_skylake` use IEEE ordered-quiet comparisons (`_CMP_LT_OQ`, `_CMP_GT_OQ`) — returning false when either operand is NaN, so NaN inputs never replace the running extremum.
|
|
121
|
+
Tail elements beyond the vector-aligned portion are masked by loading into a NaN-filled register via `_mm256_mask_loadu_ps(nan_vec, mask, ptr)` — NaN tails cannot win any comparison, eliminating out-of-bounds artifacts.
|
|
122
|
+
If all inputs are NaN, the sentinels remain (min = F32_MAX, max = F32_MIN) and indices are set to NK_SIZE_MAX, signaling no valid extremum.
|
|
123
|
+
The final horizontal reduction across lanes uses pairwise `VSHUFPS` + `VMINPS` chains — 3 shuffles for a 256-bit register, $O(\log_2 w)$ for width $w$.
|
|
124
|
+
|
|
125
|
+
## Performance
|
|
126
|
+
|
|
127
|
+
The following performance tables are produced by manually re-running the included internal tools `nk_test` and `nk_bench` to measure both accuracy and throughput at different input shapes.
|
|
128
|
+
The input size is controlled by the `NK_DENSE_DIMENSIONS` environment variable and set to 256, 1024, and 4096 elements.
|
|
129
|
+
The throughput is measured in GB/s as the number of input bytes per second.
|
|
130
|
+
Accuracy is reported as mean ULP (units in last place) unless noted otherwise — the average number of representable floating-point values between the result and the exact answer.
|
|
131
|
+
Each kernel runs for at least 20 seconds per configuration.
|
|
132
|
+
Benchmark threads are pinned to specific cores; on machines with heterogeneous core types (e.g., Apple P/E cores), only the fastest cores are used.
|
|
133
|
+
Workloads that significantly degrade CPU frequencies (Intel AMX, Apple SME) run in separate passes to avoid affecting throughput measurements of other kernels.
|
|
134
|
+
|
|
135
|
+
### Intel Sapphire Rapids
|
|
136
|
+
|
|
137
|
+
#### Native
|
|
138
|
+
|
|
139
|
+
| Kernel | 256 | 1024 | 4096 |
|
|
140
|
+
| :------------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
141
|
+
| __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
142
|
+
| `nk_reduce_moments_f64_serial` | 1.47 gb/s, 0 ulp | 1.73 gb/s, 0 ulp | 1.95 gb/s, 0 ulp |
|
|
143
|
+
| `nk_reduce_minmax_f64_serial` | 6.59 gb/s, 0 ulp | 5.95 gb/s, 0 ulp | 5.82 gb/s, 0 ulp |
|
|
144
|
+
| `nk_reduce_moments_f64_haswell` | 10.8 gb/s, 0.1 ulp | 9.18 gb/s, 0 ulp | 6.05 gb/s, 0 ulp |
|
|
145
|
+
| `nk_reduce_minmax_f64_haswell` | 8.11 gb/s, 0 ulp | 9.45 gb/s, 0 ulp | 6.59 gb/s, 0 ulp |
|
|
146
|
+
| `nk_reduce_moments_f64_skylake` | 14.7 gb/s, 0.3 ulp | 13.9 gb/s, 0.1 ulp | 11.4 gb/s, 0 ulp |
|
|
147
|
+
| `nk_reduce_minmax_f64_skylake` | 9.02 gb/s, 0 ulp | 18.3 gb/s, 0 ulp | 9.93 gb/s, 0 ulp |
|
|
148
|
+
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
149
|
+
| `nk_reduce_moments_f32_serial` | 0.458 gb/s, 0 ulp | 0.437 gb/s, 0 ulp | 0.449 gb/s, 0 ulp |
|
|
150
|
+
| `nk_reduce_minmax_f32_serial` | 3.35 gb/s, 0 ulp | 3.04 gb/s, 0 ulp | 3.27 gb/s, 0 ulp |
|
|
151
|
+
| `nk_reduce_moments_f32_haswell` | 18.4 gb/s, 0.8 ulp | 17.8 gb/s, 4.2 ulp | 11.7 gb/s, 7.7 ulp |
|
|
152
|
+
| `nk_reduce_minmax_f32_haswell` | 8.18 gb/s, 0 ulp | 8.92 gb/s, 0 ulp | 8.24 gb/s, 0 ulp |
|
|
153
|
+
| `nk_reduce_moments_f32_skylake` | 20.7 gb/s, 0.4 ulp | 20.3 gb/s, 3.1 ulp | 17.1 gb/s, 8.8 ulp |
|
|
154
|
+
| `nk_reduce_minmax_f32_skylake` | 7.35 gb/s, 0 ulp | 15.9 gb/s, 0 ulp | 21.8 gb/s, 0 ulp |
|
|
155
|
+
| __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
156
|
+
| `nk_reduce_moments_bf16_serial` | 0.208 gb/s, 0 ulp | 0.245 gb/s, 0 ulp | 0.239 gb/s, 0 ulp |
|
|
157
|
+
| `nk_reduce_minmax_bf16_serial` | 0.935 gb/s, 0 ulp | 0.984 gb/s, 0 ulp | 1.00 gb/s, 0 ulp |
|
|
158
|
+
| `nk_reduce_moments_bf16_haswell` | 11.4 gb/s, 0 ulp | 12.2 gb/s, 0 ulp | 10.8 gb/s, 1.6 ulp |
|
|
159
|
+
| `nk_reduce_minmax_bf16_haswell` | 4.98 gb/s, 0 ulp | 7.54 gb/s, 0 ulp | 9.30 gb/s, 0 ulp |
|
|
160
|
+
| `nk_reduce_moments_bf16_skylake` | 18.2 gb/s, 0 ulp | 27.0 gb/s, 0 ulp | 17.9 gb/s, 0.7 ulp |
|
|
161
|
+
| `nk_reduce_minmax_bf16_skylake` | 6.53 gb/s, 0 ulp | 18.2 gb/s, 0 ulp | 13.7 gb/s, 0 ulp |
|
|
162
|
+
| `nk_reduce_moments_bf16_genoa` | 18.1 gb/s, 0 ulp | 20.5 gb/s, 0 ulp | 19.3 gb/s, 0.8 ulp |
|
|
163
|
+
| __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
164
|
+
| `nk_reduce_moments_f16_serial` | 0.391 gb/s, 0 ulp | 0.354 gb/s, 0 ulp | 0.407 gb/s, 0 ulp |
|
|
165
|
+
| `nk_reduce_minmax_f16_serial` | 0.901 gb/s, 0 ulp | 0.877 gb/s, 0 ulp | 0.974 gb/s, 0 ulp |
|
|
166
|
+
| `nk_reduce_moments_f16_haswell` | 13.5 gb/s, 0 ulp | 12.6 gb/s, 0 ulp | 11.0 gb/s, 0.3 ulp |
|
|
167
|
+
| `nk_reduce_minmax_f16_haswell` | 6.61 gb/s, 0 ulp | 9.19 gb/s, 0 ulp | 8.10 gb/s, 0 ulp |
|
|
168
|
+
| `nk_reduce_moments_f16_skylake` | 17.7 gb/s, 0 ulp | 29.1 gb/s, 0.1 ulp | 18.6 gb/s, 0 ulp |
|
|
169
|
+
| `nk_reduce_minmax_f16_skylake` | 10.2 gb/s, 0 ulp | 20.8 gb/s, 0 ulp | 22.0 gb/s, 0 ulp |
|
|
170
|
+
| __e5m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
171
|
+
| `nk_reduce_moments_e5m2_serial` | 0.157 gb/s, 0 ulp | 0.296 gb/s, 0 ulp | 0.229 gb/s, 0 ulp |
|
|
172
|
+
| `nk_reduce_minmax_e5m2_serial` | 0.418 gb/s, 0 ulp | 0.417 gb/s, 0 ulp | 0.451 gb/s, 0 ulp |
|
|
173
|
+
| `nk_reduce_moments_e5m2_haswell` | 2.40 gb/s, 0 ulp | 2.69 gb/s, 0 ulp | 2.61 gb/s, 0 ulp |
|
|
174
|
+
| `nk_reduce_minmax_e5m2_haswell` | 4.48 gb/s, 0 ulp | 6.80 gb/s, 0 ulp | 7.21 gb/s, 0 ulp |
|
|
175
|
+
| `nk_reduce_moments_e5m2_skylake` | 4.66 gb/s, 0 ulp | 2.83 gb/s, 0 ulp | 4.04 gb/s, 0 ulp |
|
|
176
|
+
| `nk_reduce_minmax_e5m2_skylake` | 3.90 gb/s, 0 ulp | 11.8 gb/s, 0 ulp | 19.1 gb/s, 0 ulp |
|
|
177
|
+
| `nk_reduce_moments_e5m2_genoa` | 4.76 gb/s, 0 ulp | 6.08 gb/s, 0 ulp | 5.88 gb/s, 0 ulp |
|
|
178
|
+
| __e4m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
179
|
+
| `nk_reduce_moments_e4m3_serial` | 0.121 gb/s, 0 ulp | 0.129 gb/s, 0 ulp | 0.158 gb/s, 0 ulp |
|
|
180
|
+
| `nk_reduce_minmax_e4m3_serial` | 0.460 gb/s, 0 ulp | 0.473 gb/s, 0 ulp | 0.464 gb/s, 0 ulp |
|
|
181
|
+
| `nk_reduce_moments_e4m3_haswell` | 1.82 gb/s, 0 ulp | 1.90 gb/s, 0 ulp | 1.77 gb/s, 0 ulp |
|
|
182
|
+
| `nk_reduce_minmax_e4m3_haswell` | 4.42 gb/s, 0 ulp | 7.00 gb/s, 0 ulp | 8.10 gb/s, 0 ulp |
|
|
183
|
+
| `nk_reduce_moments_e4m3_skylake` | 2.77 gb/s, 0 ulp | 3.53 gb/s, 0 ulp | 2.74 gb/s, 0 ulp |
|
|
184
|
+
| `nk_reduce_minmax_e4m3_skylake` | 3.79 gb/s, 0 ulp | 9.57 gb/s, 0 ulp | 17.0 gb/s, 0 ulp |
|
|
185
|
+
| `nk_reduce_moments_e4m3_genoa` | 4.67 gb/s, 0 ulp | 5.87 gb/s, 0 ulp | 5.67 gb/s, 0 ulp |
|
|
186
|
+
| __e3m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
187
|
+
| `nk_reduce_moments_e3m2_serial` | 0.158 gb/s, 0 ulp | 0.279 gb/s, 0 ulp | 0.348 gb/s, 0 ulp |
|
|
188
|
+
| `nk_reduce_minmax_e3m2_serial` | 0.464 gb/s, 0 ulp | 0.416 gb/s, 0 ulp | 0.470 gb/s, 0 ulp |
|
|
189
|
+
| `nk_reduce_moments_e3m2_haswell` | 2.37 gb/s, 0 ulp | 2.55 gb/s, 0 ulp | 2.53 gb/s, 0 ulp |
|
|
190
|
+
| `nk_reduce_minmax_e3m2_haswell` | 5.36 gb/s, 0 ulp | 7.89 gb/s, 0 ulp | 9.56 gb/s, 0 ulp |
|
|
191
|
+
| `nk_reduce_moments_e3m2_skylake` | 2.77 gb/s, 0 ulp | 3.32 gb/s, 0 ulp | 3.58 gb/s, 0 ulp |
|
|
192
|
+
| `nk_reduce_minmax_e3m2_skylake` | 9.85 gb/s, 0 ulp | 20.1 gb/s, 0 ulp | 14.6 gb/s, 0 ulp |
|
|
193
|
+
| `nk_reduce_moments_e3m2_icelake` | 8.82 gb/s, 0 ulp | 9.02 gb/s, 0 ulp | 13.4 gb/s, 0 ulp |
|
|
194
|
+
| `nk_reduce_moments_e3m2_alder` | 4.80 gb/s, 0 ulp | 7.11 gb/s, 0 ulp | 7.89 gb/s, 0 ulp |
|
|
195
|
+
| __e2m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
196
|
+
| `nk_reduce_moments_e2m3_serial` | 0.157 gb/s, 0 ulp | 0.294 gb/s, 0 ulp | 0.301 gb/s, 0 ulp |
|
|
197
|
+
| `nk_reduce_minmax_e2m3_serial` | 0.465 gb/s, 0 ulp | 0.421 gb/s, 0 ulp | 0.453 gb/s, 0 ulp |
|
|
198
|
+
| `nk_reduce_moments_e2m3_haswell` | 2.43 gb/s, 0 ulp | 2.45 gb/s, 0 ulp | 2.58 gb/s, 0 ulp |
|
|
199
|
+
| `nk_reduce_minmax_e2m3_haswell` | 5.31 gb/s, 0 ulp | 7.90 gb/s, 0 ulp | 9.36 gb/s, 0 ulp |
|
|
200
|
+
| `nk_reduce_moments_e2m3_skylake` | 3.49 gb/s, 0 ulp | 3.02 gb/s, 0 ulp | 3.66 gb/s, 0 ulp |
|
|
201
|
+
| `nk_reduce_minmax_e2m3_skylake` | 6.14 gb/s, 0 ulp | 17.5 gb/s, 0 ulp | 20.3 gb/s, 0 ulp |
|
|
202
|
+
| `nk_reduce_moments_e2m3_icelake` | 12.7 gb/s, 0 ulp | 22.7 gb/s, 0 ulp | 21.7 gb/s, 0 ulp |
|
|
203
|
+
| __i8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
204
|
+
| `nk_reduce_moments_i8_serial` | 2.21 gb/s | 2.40 gb/s | 2.29 gb/s |
|
|
205
|
+
| `nk_reduce_minmax_i8_serial` | 0.806 gb/s | 0.973 gb/s | 1.09 gb/s |
|
|
206
|
+
| `nk_reduce_moments_i8_haswell` | 9.35 gb/s | 11.9 gb/s | 12.7 gb/s |
|
|
207
|
+
| `nk_reduce_minmax_i8_haswell` | 7.11 gb/s | 11.7 gb/s | 13.2 gb/s |
|
|
208
|
+
| `nk_reduce_moments_i8_skylake` | 10.4 gb/s | 16.6 gb/s | 20.1 gb/s |
|
|
209
|
+
| `nk_reduce_minmax_i8_skylake` | 2.96 gb/s | 14.4 gb/s | 15.5 gb/s |
|
|
210
|
+
| `nk_reduce_moments_i8_icelake` | 14.0 gb/s | 28.3 gb/s | 28.4 gb/s |
|
|
211
|
+
| __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
212
|
+
| `nk_reduce_moments_u8_serial` | 2.40 gb/s | 2.49 gb/s | 2.15 gb/s |
|
|
213
|
+
| `nk_reduce_minmax_u8_serial` | 0.776 gb/s | 0.931 gb/s | 1.05 gb/s |
|
|
214
|
+
| `nk_reduce_moments_u8_haswell` | 10.3 gb/s | 12.9 gb/s | 13.6 gb/s |
|
|
215
|
+
| `nk_reduce_minmax_u8_haswell` | 7.08 gb/s | 11.2 gb/s | 12.0 gb/s |
|
|
216
|
+
| `nk_reduce_moments_u8_skylake` | 13.2 gb/s | 20.1 gb/s | 19.6 gb/s |
|
|
217
|
+
| `nk_reduce_minmax_u8_skylake` | 4.45 gb/s | 14.0 gb/s | 20.4 gb/s |
|
|
218
|
+
| `nk_reduce_moments_u8_icelake` | 14.6 gb/s | 21.7 gb/s | 30.4 gb/s |
|
|
219
|
+
| `nk_reduce_moments_u8_alder` | 11.5 gb/s | 13.3 gb/s | 13.7 gb/s |
|
|
220
|
+
| __i4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
221
|
+
| `nk_reduce_moments_i4_serial` | 0.345 gb/s | 0.757 gb/s | 0.752 gb/s |
|
|
222
|
+
| `nk_reduce_minmax_i4_serial` | 0.313 gb/s | 0.285 gb/s | 0.357 gb/s |
|
|
223
|
+
| `nk_reduce_moments_i4_haswell` | 6.36 gb/s | 9.17 gb/s | 10.3 gb/s |
|
|
224
|
+
| `nk_reduce_moments_i4_skylake` | 7.67 gb/s | 8.85 gb/s | 15.4 gb/s |
|
|
225
|
+
| __u4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
226
|
+
| `nk_reduce_moments_u4_serial` | 0.438 gb/s | 0.799 gb/s | 1.00 gb/s |
|
|
227
|
+
| `nk_reduce_minmax_u4_serial` | 0.352 gb/s | 0.292 gb/s | 0.397 gb/s |
|
|
228
|
+
| `nk_reduce_moments_u4_haswell` | 7.40 gb/s | 10.7 gb/s | 10.8 gb/s |
|
|
229
|
+
| `nk_reduce_moments_u4_skylake` | 9.45 gb/s | 15.0 gb/s | 18.3 gb/s |
|
|
230
|
+
| __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
231
|
+
| `nk_reduce_moments_u1_serial` | 1.36 gb/s | 1.96 gb/s | 2.04 gb/s |
|
|
232
|
+
| `nk_reduce_minmax_u1_serial` | 5.44 gb/s | 14.7 gb/s | 84.1 gb/s |
|
|
233
|
+
| `nk_reduce_moments_u1_haswell` | 4.29 gb/s | 9.69 gb/s | 12.0 gb/s |
|
|
234
|
+
| `nk_reduce_moments_u1_skylake` | 2.90 gb/s | 12.3 gb/s | 20.6 gb/s |
|
|
235
|
+
| __i16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
236
|
+
| `nk_reduce_moments_i16_serial` | 2.54 gb/s | 2.68 gb/s | 2.80 gb/s |
|
|
237
|
+
| `nk_reduce_minmax_i16_serial` | 1.60 gb/s | 1.75 gb/s | 2.07 gb/s |
|
|
238
|
+
| `nk_reduce_moments_i16_haswell` | 13.7 gb/s | 14.7 gb/s | 12.5 gb/s |
|
|
239
|
+
| `nk_reduce_minmax_i16_haswell` | 8.56 gb/s | 10.9 gb/s | 10.0 gb/s |
|
|
240
|
+
| `nk_reduce_moments_i16_skylake` | 16.8 gb/s | 21.0 gb/s | 20.5 gb/s |
|
|
241
|
+
| `nk_reduce_minmax_i16_skylake` | 6.74 gb/s | 15.9 gb/s | 19.1 gb/s |
|
|
242
|
+
| `nk_reduce_moments_i16_icelake` | 19.0 gb/s | 24.9 gb/s | 28.2 gb/s |
|
|
243
|
+
| `nk_reduce_moments_i16_alder` | 10.0 gb/s | 12.1 gb/s | 10.5 gb/s |
|
|
244
|
+
| __u16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
245
|
+
| `nk_reduce_moments_u16_serial` | 2.62 gb/s | 2.55 gb/s | 2.54 gb/s |
|
|
246
|
+
| `nk_reduce_minmax_u16_serial` | 1.28 gb/s | 1.41 gb/s | 1.62 gb/s |
|
|
247
|
+
| `nk_reduce_moments_u16_haswell` | 6.82 gb/s | 6.95 gb/s | 6.60 gb/s |
|
|
248
|
+
| `nk_reduce_minmax_u16_haswell` | 8.25 gb/s | 10.5 gb/s | 11.6 gb/s |
|
|
249
|
+
| `nk_reduce_moments_u16_skylake` | 10.2 gb/s | 13.9 gb/s | 12.6 gb/s |
|
|
250
|
+
| `nk_reduce_minmax_u16_skylake` | 16.0 gb/s | 22.6 gb/s | 16.9 gb/s |
|
|
251
|
+
| `nk_reduce_moments_u16_alder` | 7.17 gb/s | 8.10 gb/s | 7.57 gb/s |
|
|
252
|
+
| __i32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
253
|
+
| `nk_reduce_moments_i32_serial` | 2.39 gb/s | 2.25 gb/s | 2.32 gb/s |
|
|
254
|
+
| `nk_reduce_minmax_i32_serial` | 2.99 gb/s | 3.67 gb/s | 4.48 gb/s |
|
|
255
|
+
| `nk_reduce_moments_i32_haswell` | 5.43 gb/s | 5.37 gb/s | 4.41 gb/s |
|
|
256
|
+
| `nk_reduce_minmax_i32_haswell` | 11.1 gb/s | 10.2 gb/s | 10.4 gb/s |
|
|
257
|
+
| `nk_reduce_moments_i32_skylake` | 6.87 gb/s | 11.1 gb/s | 10.6 gb/s |
|
|
258
|
+
| `nk_reduce_minmax_i32_skylake` | 23.8 gb/s | 24.7 gb/s | 17.6 gb/s |
|
|
259
|
+
| __u32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
260
|
+
| `nk_reduce_moments_u32_serial` | 3.46 gb/s | 3.53 gb/s | 3.41 gb/s |
|
|
261
|
+
| `nk_reduce_minmax_u32_serial` | 2.81 gb/s | 3.34 gb/s | 4.05 gb/s |
|
|
262
|
+
| `nk_reduce_moments_u32_haswell` | 6.10 gb/s | 5.79 gb/s | 5.27 gb/s |
|
|
263
|
+
| `nk_reduce_minmax_u32_haswell` | 10.6 gb/s | 11.2 gb/s | 9.95 gb/s |
|
|
264
|
+
| `nk_reduce_moments_u32_skylake` | 15.9 gb/s | 9.96 gb/s | 15.3 gb/s |
|
|
265
|
+
| `nk_reduce_minmax_u32_skylake` | 23.6 gb/s | 25.3 gb/s | 21.7 gb/s |
|
|
266
|
+
| __i64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
267
|
+
| `nk_reduce_moments_i64_serial` | 2.43 gb/s | 2.43 gb/s | 2.43 gb/s |
|
|
268
|
+
| `nk_reduce_minmax_i64_serial` | 4.90 gb/s | 5.54 gb/s | 6.10 gb/s |
|
|
269
|
+
| `nk_reduce_moments_i64_haswell` | 7.16 gb/s | 6.54 gb/s | 5.38 gb/s |
|
|
270
|
+
| `nk_reduce_minmax_i64_haswell` | 9.50 gb/s | 9.87 gb/s | 7.63 gb/s |
|
|
271
|
+
| `nk_reduce_moments_i64_skylake` | 13.0 gb/s | 8.29 gb/s | 10.5 gb/s |
|
|
272
|
+
| `nk_reduce_minmax_i64_skylake` | 11.6 gb/s | 23.1 gb/s | 22.0 gb/s |
|
|
273
|
+
| __u64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
274
|
+
| `nk_reduce_moments_u64_serial` | 1.94 gb/s | 1.92 gb/s | 1.83 gb/s |
|
|
275
|
+
| `nk_reduce_minmax_u64_serial` | 5.99 gb/s | 7.20 gb/s | 7.33 gb/s |
|
|
276
|
+
| `nk_reduce_moments_u64_haswell` | 8.60 gb/s | 8.45 gb/s | 5.96 gb/s |
|
|
277
|
+
| `nk_reduce_minmax_u64_haswell` | 8.93 gb/s | 9.81 gb/s | 7.55 gb/s |
|
|
278
|
+
| `nk_reduce_moments_u64_skylake` | 15.6 gb/s | 19.3 gb/s | 8.87 gb/s |
|
|
279
|
+
| `nk_reduce_minmax_u64_skylake` | 9.90 gb/s | 23.1 gb/s | 21.6 gb/s |
|
|
280
|
+
|
|
281
|
+
#### WASM
|
|
282
|
+
|
|
283
|
+
Measured with Wasmtime v42 (Cranelift backend).
|
|
284
|
+
|
|
285
|
+
| Kernel | 256 | 1024 | 4096 |
|
|
286
|
+
| :----------------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
287
|
+
| __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
288
|
+
| `nk_reduce_moments_f64_serial` | 1.2 gb/s, 0 ulp | 1.22 gb/s, 0 ulp | 1.19 gb/s, 0 ulp |
|
|
289
|
+
| `nk_reduce_moments_f64_v128relaxed` | 6.97 gb/s, 0 ulp | 6.91 gb/s, 0 ulp | 7.06 gb/s, 0 ulp |
|
|
290
|
+
| `nk_reduce_minmax_f64_serial` | 5.23 gb/s, 0 ulp | 5.49 gb/s, 0 ulp | 5.71 gb/s, 0 ulp |
|
|
291
|
+
| `nk_reduce_minmax_f64_v128relaxed` | 4.9 gb/s, 0 ulp | 5.16 gb/s, 0 ulp | 5.11 gb/s, 0 ulp |
|
|
292
|
+
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
293
|
+
| `nk_reduce_moments_f32_serial` | 0.606 gb/s, 0 ulp | 0.622 gb/s, 0 ulp | 0.618 gb/s, 0 ulp |
|
|
294
|
+
| `nk_reduce_moments_f32_v128relaxed` | 8.84 gb/s, 0.1 ulp | 6.78 gb/s, 0.4 ulp | 6.49 gb/s, 0 ulp |
|
|
295
|
+
| `nk_reduce_minmax_f32_serial` | 2.63 gb/s, 0 ulp | 2.73 gb/s, 0 ulp | 2.86 gb/s, 0 ulp |
|
|
296
|
+
| `nk_reduce_minmax_f32_v128relaxed` | 4.59 gb/s, 0 ulp | 5.02 gb/s, 0 ulp | 5.05 gb/s, 0 ulp |
|
|
297
|
+
| __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
298
|
+
| `nk_reduce_moments_bf16_serial` | 0.274 gb/s, 0 ulp | 0.28 gb/s, 0 ulp | 0.288 gb/s, 0 ulp |
|
|
299
|
+
| `nk_reduce_moments_bf16_v128relaxed` | 7.7 gb/s, 0 ulp | 6.06 gb/s, 0.1 ulp | 5.26 gb/s, 1.6 ulp |
|
|
300
|
+
| `nk_reduce_minmax_bf16_serial` | 0.873 gb/s, 0 ulp | 0.977 gb/s, 0 ulp | 1.04 gb/s, 0 ulp |
|
|
301
|
+
| `nk_reduce_minmax_bf16_v128relaxed` | 4.88 gb/s, 0 ulp | 5.14 gb/s, 0 ulp | 5.55 gb/s, 0 ulp |
|
|
302
|
+
| __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
303
|
+
| `nk_reduce_moments_f16_serial` | 0.234 gb/s, 0 ulp | 0.235 gb/s, 0 ulp | 0.247 gb/s, 0 ulp |
|
|
304
|
+
| `nk_reduce_moments_f16_v128relaxed` | 1.91 gb/s, 0 ulp | 1.95 gb/s, 0 ulp | 1.92 gb/s, 0.2 ulp |
|
|
305
|
+
| `nk_reduce_minmax_f16_serial` | 0.859 gb/s, 0 ulp | 0.967 gb/s, 0 ulp | 1.06 gb/s, 0 ulp |
|
|
306
|
+
| `nk_reduce_minmax_f16_v128relaxed` | 1.3 gb/s, 0 ulp | 1.38 gb/s, 0 ulp | 1.39 gb/s, 0 ulp |
|
|
307
|
+
| __e5m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
308
|
+
| `nk_reduce_moments_e5m2_serial` | 0.116 gb/s, 0 ulp | 0.118 gb/s, 0 ulp | 0.122 gb/s, 0 ulp |
|
|
309
|
+
| `nk_reduce_moments_e5m2_v128relaxed` | 1.55 gb/s, 0 ulp | 1.59 gb/s, 0 ulp | 1.56 gb/s, 0 ulp |
|
|
310
|
+
| `nk_reduce_minmax_e5m2_serial` | 0.471 gb/s, 0 ulp | 0.499 gb/s, 0 ulp | 0.527 gb/s, 0 ulp |
|
|
311
|
+
| `nk_reduce_minmax_e5m2_v128relaxed` | 1.27 gb/s, 0 ulp | 2.76 gb/s, 0 ulp | 3.07 gb/s, 0 ulp |
|
|
312
|
+
| __e4m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
313
|
+
| `nk_reduce_moments_e4m3_serial` | 0.109 gb/s, 0 ulp | 0.109 gb/s, 0 ulp | 0.111 gb/s, 0 ulp |
|
|
314
|
+
| `nk_reduce_moments_e4m3_v128relaxed` | 1.15 gb/s, 0 ulp | 1.16 gb/s, 0 ulp | 1.17 gb/s, 0 ulp |
|
|
315
|
+
| `nk_reduce_minmax_e4m3_serial` | 0.451 gb/s, 0 ulp | 0.495 gb/s, 0 ulp | 0.535 gb/s, 0 ulp |
|
|
316
|
+
| `nk_reduce_minmax_e4m3_v128relaxed` | 1.43 gb/s, 0 ulp | 2.61 gb/s, 0 ulp | 3.06 gb/s, 0 ulp |
|
|
317
|
+
| __e3m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
318
|
+
| `nk_reduce_moments_e3m2_serial` | 0.121 gb/s, 0 ulp | 0.117 gb/s, 0 ulp | 0.12 gb/s, 0 ulp |
|
|
319
|
+
| `nk_reduce_moments_e3m2_v128relaxed` | 2.04 gb/s, 0 ulp | 2.1 gb/s, 0 ulp | 2.09 gb/s, 0 ulp |
|
|
320
|
+
| `nk_reduce_minmax_e3m2_serial` | 0.431 gb/s, 0 ulp | 0.433 gb/s, 0 ulp | 0.437 gb/s, 0 ulp |
|
|
321
|
+
| `nk_reduce_minmax_e3m2_v128relaxed` | 1.45 gb/s, 0 ulp | 3.75 gb/s, 0 ulp | 4.32 gb/s, 0 ulp |
|
|
322
|
+
| __e2m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
323
|
+
| `nk_reduce_moments_e2m3_serial` | 0.117 gb/s, 0 ulp | 0.118 gb/s, 0 ulp | 0.12 gb/s, 0 ulp |
|
|
324
|
+
| `nk_reduce_moments_e2m3_v128relaxed` | 3.1 gb/s, 0 ulp | 3.26 gb/s, 0 ulp | 3.28 gb/s, 0 ulp |
|
|
325
|
+
| `nk_reduce_minmax_e2m3_serial` | 0.434 gb/s, 0 ulp | 0.43 gb/s, 0 ulp | 0.439 gb/s, 0 ulp |
|
|
326
|
+
| `nk_reduce_minmax_e2m3_v128relaxed` | 2.84 gb/s, 0 ulp | 3.77 gb/s, 0 ulp | 4.32 gb/s, 0 ulp |
|
|
327
|
+
| __i8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
328
|
+
| `nk_reduce_moments_i8_serial` | 2.04 gb/s | 2.07 gb/s | 2.09 gb/s |
|
|
329
|
+
| `nk_reduce_moments_i8_v128relaxed` | 6.69 gb/s | 7.37 gb/s | 7.43 gb/s |
|
|
330
|
+
| `nk_reduce_minmax_i8_serial` | 0.928 gb/s | 0.92 gb/s | 0.935 gb/s |
|
|
331
|
+
| `nk_reduce_minmax_i8_v128relaxed` | 5.08 gb/s | 5.55 gb/s | 7 gb/s |
|
|
332
|
+
| __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
333
|
+
| `nk_reduce_moments_u8_serial` | 2.04 gb/s | 2.09 gb/s | 2.1 gb/s |
|
|
334
|
+
| `nk_reduce_moments_u8_v128relaxed` | 6.38 gb/s | 6.96 gb/s | 7.04 gb/s |
|
|
335
|
+
| `nk_reduce_minmax_u8_serial` | 0.851 gb/s | 0.851 gb/s | 0.858 gb/s |
|
|
336
|
+
| `nk_reduce_minmax_u8_v128relaxed` | 3.97 gb/s | 4.52 gb/s | 5.45 gb/s |
|
|
337
|
+
| __i16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
338
|
+
| `nk_reduce_moments_i16_serial` | 4.09 gb/s | 4.12 gb/s | 4.18 gb/s |
|
|
339
|
+
| `nk_reduce_moments_i16_v128relaxed` | 7.03 gb/s | 7.37 gb/s | 7.34 gb/s |
|
|
340
|
+
| `nk_reduce_minmax_i16_serial` | 1.86 gb/s | 1.85 gb/s | 1.87 gb/s |
|
|
341
|
+
| `nk_reduce_minmax_i16_v128relaxed` | 6.67 gb/s | 7.53 gb/s | 8.2 gb/s |
|
|
342
|
+
| __u16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
343
|
+
| `nk_reduce_moments_u16_serial` | 4.09 gb/s | 4.17 gb/s | 4.23 gb/s |
|
|
344
|
+
| `nk_reduce_moments_u16_v128relaxed` | 6.74 gb/s | 7.05 gb/s | 6.91 gb/s |
|
|
345
|
+
| `nk_reduce_minmax_u16_serial` | 1.67 gb/s | 1.67 gb/s | 1.68 gb/s |
|
|
346
|
+
| `nk_reduce_minmax_u16_v128relaxed` | 4.78 gb/s | 5.43 gb/s | 5.83 gb/s |
|
|
347
|
+
| __i32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
348
|
+
| `nk_reduce_moments_i32_serial` | 3.65 gb/s | 3.74 gb/s | 3.77 gb/s |
|
|
349
|
+
| `nk_reduce_moments_i32_v128relaxed` | 1.58 gb/s | 1.6 gb/s | 1.58 gb/s |
|
|
350
|
+
| `nk_reduce_minmax_i32_serial` | 4.27 gb/s | 4.3 gb/s | 4.32 gb/s |
|
|
351
|
+
| `nk_reduce_minmax_i32_v128relaxed` | 6.91 gb/s | 7.77 gb/s | 8.06 gb/s |
|
|
352
|
+
| __u32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
353
|
+
| `nk_reduce_moments_u32_serial` | 3.59 gb/s | 3.6 gb/s | 3.65 gb/s |
|
|
354
|
+
| `nk_reduce_moments_u32_v128relaxed` | 1.22 gb/s | 1.21 gb/s | 1.21 gb/s |
|
|
355
|
+
| `nk_reduce_minmax_u32_serial` | 3.81 gb/s | 3.81 gb/s | 3.88 gb/s |
|
|
356
|
+
| `nk_reduce_minmax_u32_v128relaxed` | 5.1 gb/s | 5.62 gb/s | 5.89 gb/s |
|
|
357
|
+
| __i64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
358
|
+
| `nk_reduce_moments_i64_serial` | 3.09 gb/s | 3.14 gb/s | 3.28 gb/s |
|
|
359
|
+
| `nk_reduce_moments_i64_v128relaxed` | 2.65 gb/s | 2.66 gb/s | 2.65 gb/s |
|
|
360
|
+
| `nk_reduce_minmax_i64_serial` | 8.49 gb/s | 8.38 gb/s | 8.65 gb/s |
|
|
361
|
+
| `nk_reduce_minmax_i64_v128relaxed` | 6.14 gb/s | 6.37 gb/s | 6.5 gb/s |
|
|
362
|
+
| __u64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
363
|
+
| `nk_reduce_moments_u64_serial` | 2.78 gb/s | 2.79 gb/s | 2.87 gb/s |
|
|
364
|
+
| `nk_reduce_moments_u64_v128relaxed` | 2.46 gb/s | 2.48 gb/s | 2.47 gb/s |
|
|
365
|
+
| `nk_reduce_minmax_u64_serial` | 7.83 gb/s | 7.59 gb/s | 7.74 gb/s |
|
|
366
|
+
| `nk_reduce_minmax_u64_v128relaxed` | 2.54 gb/s | 2.59 gb/s | 2.64 gb/s |
|
|
367
|
+
|
|
368
|
+
### Apple M4
|
|
369
|
+
|
|
370
|
+
#### Native
|
|
371
|
+
|
|
372
|
+
| Kernel | 256 | 1024 | 4096 |
|
|
373
|
+
| :--------------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
374
|
+
| __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
375
|
+
| `nk_reduce_moments_f64_serial` | 4.97 gb/s, 0 ulp | 4.98 gb/s, 0 ulp | 5.01 gb/s, 0 ulp |
|
|
376
|
+
| `nk_reduce_minmax_f64_serial` | 10.3 gb/s, 0 ulp | 10.3 gb/s, 0 ulp | 10.0 gb/s, 0 ulp |
|
|
377
|
+
| `nk_reduce_moments_f64_neon` | 11.4 gb/s, 0 ulp | 11.3 gb/s, 0 ulp | 11.1 gb/s, 0 ulp |
|
|
378
|
+
| `nk_reduce_minmax_f64_neon` | 14.1 gb/s, 0 ulp | 13.1 gb/s, 0 ulp | 12.8 gb/s, 0 ulp |
|
|
379
|
+
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
380
|
+
| `nk_reduce_moments_f32_serial` | 2.33 gb/s, 0 ulp | 2.32 gb/s, 0 ulp | 2.35 gb/s, 0 ulp |
|
|
381
|
+
| `nk_reduce_minmax_f32_serial` | 5.39 gb/s, 0 ulp | 5.19 gb/s, 0 ulp | 5.20 gb/s, 0 ulp |
|
|
382
|
+
| `nk_reduce_moments_f32_neon` | 11.4 gb/s, 0.4 ulp | 8.38 gb/s, 1.8 ulp | 7.27 gb/s, 1.1 ulp |
|
|
383
|
+
| `nk_reduce_minmax_f32_neon` | 14.2 gb/s, 0 ulp | 12.6 gb/s, 0 ulp | 13.0 gb/s, 0 ulp |
|
|
384
|
+
| __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
385
|
+
| `nk_reduce_moments_bf16_serial` | 1.09 gb/s, 0 ulp | 1.10 gb/s, 0 ulp | 1.11 gb/s, 0 ulp |
|
|
386
|
+
| `nk_reduce_minmax_bf16_serial` | 1.56 gb/s, 0 ulp | 1.69 gb/s, 0 ulp | 1.75 gb/s, 0 ulp |
|
|
387
|
+
| `nk_reduce_moments_bf16_neonbfdot` | 18.6 gb/s, 0 ulp | 20.3 gb/s, 0.4 ulp | 21.0 gb/s, 0.3 ulp |
|
|
388
|
+
| `nk_reduce_minmax_bf16_neonbfdot` | 13.7 gb/s, 0 ulp | 14.5 gb/s, 0 ulp | 15.4 gb/s, 0 ulp |
|
|
389
|
+
| __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
390
|
+
| `nk_reduce_moments_f16_serial` | 1.10 gb/s, 0 ulp | 1.09 gb/s, 0 ulp | 1.09 gb/s, 0 ulp |
|
|
391
|
+
| `nk_reduce_minmax_f16_serial` | 1.17 gb/s, 0 ulp | 1.26 gb/s, 0 ulp | 1.33 gb/s, 0 ulp |
|
|
392
|
+
| `nk_reduce_moments_f16_neonhalf` | 15.5 gb/s, 0.1 ulp | 11.1 gb/s, 0.1 ulp | 8.74 gb/s, 0.8 ulp |
|
|
393
|
+
| __e5m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
394
|
+
| `nk_reduce_moments_e5m2_serial` | 0.531 gb/s, 0 ulp | 0.534 gb/s, 0 ulp | 0.536 gb/s, 0 ulp |
|
|
395
|
+
| `nk_reduce_minmax_e5m2_serial` | 0.803 gb/s, 0 ulp | 0.887 gb/s, 0 ulp | 0.880 gb/s, 0 ulp |
|
|
396
|
+
| `nk_reduce_moments_e5m2_neonfhm` | 7.95 gb/s, 0 ulp | 5.31 gb/s, 0 ulp | 4.07 gb/s, 0 ulp |
|
|
397
|
+
| `nk_reduce_minmax_e5m2_neonfhm` | 14.0 gb/s, 0 ulp | 16.8 gb/s, 0 ulp | 17.1 gb/s, 0 ulp |
|
|
398
|
+
| __e4m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
399
|
+
| `nk_reduce_moments_e4m3_serial` | 0.382 gb/s, 0 ulp | 0.386 gb/s, 0 ulp | 0.387 gb/s, 0 ulp |
|
|
400
|
+
| `nk_reduce_minmax_e4m3_serial` | 0.823 gb/s, 0 ulp | 0.884 gb/s, 0 ulp | 0.918 gb/s, 0 ulp |
|
|
401
|
+
| `nk_reduce_moments_e4m3_neonfhm` | 2.90 gb/s, 0 ulp | 2.98 gb/s, 0 ulp | 2.89 gb/s, 0 ulp |
|
|
402
|
+
| `nk_reduce_minmax_e4m3_neonfhm` | 14.0 gb/s, 0 ulp | 16.7 gb/s, 0 ulp | 17.8 gb/s, 0 ulp |
|
|
403
|
+
| __e3m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
404
|
+
| `nk_reduce_moments_e3m2_serial` | 0.537 gb/s, 0 ulp | 0.532 gb/s, 0 ulp | 0.539 gb/s, 0 ulp |
|
|
405
|
+
| `nk_reduce_minmax_e3m2_serial` | 0.506 gb/s, 0 ulp | 0.500 gb/s, 0 ulp | 0.503 gb/s, 0 ulp |
|
|
406
|
+
| __e2m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
407
|
+
| `nk_reduce_moments_e2m3_serial` | 0.538 gb/s, 0 ulp | 0.539 gb/s, 0 ulp | 0.536 gb/s, 0 ulp |
|
|
408
|
+
| `nk_reduce_minmax_e2m3_serial` | 0.503 gb/s, 0 ulp | 0.504 gb/s, 0 ulp | 0.502 gb/s, 0 ulp |
|
|
409
|
+
| `nk_reduce_moments_e2m3_neonsdot` | 19.4 gb/s, 0 ulp | 19.5 gb/s, 0 ulp | 18.7 gb/s, 0 ulp |
|
|
410
|
+
| __i8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
411
|
+
| `nk_reduce_moments_i8_serial` | 2.44 gb/s | 2.59 gb/s | 2.59 gb/s |
|
|
412
|
+
| `nk_reduce_minmax_i8_serial` | 1.30 gb/s | 1.34 gb/s | 1.34 gb/s |
|
|
413
|
+
| `nk_reduce_moments_i8_neon` | 16.5 gb/s | 12.4 gb/s | 9.71 gb/s |
|
|
414
|
+
| `nk_reduce_minmax_i8_neon` | 17.9 gb/s | 20.3 gb/s | 19.0 gb/s |
|
|
415
|
+
| `nk_reduce_moments_i8_neonsdot` | 29.8 gb/s | 28.9 gb/s | 22.0 gb/s |
|
|
416
|
+
| __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
417
|
+
| `nk_reduce_moments_u8_serial` | 2.49 gb/s | 2.59 gb/s | 2.57 gb/s |
|
|
418
|
+
| `nk_reduce_minmax_u8_serial` | 1.32 gb/s | 1.31 gb/s | 1.31 gb/s |
|
|
419
|
+
| `nk_reduce_moments_u8_neon` | 16.7 gb/s | 12.1 gb/s | 9.20 gb/s |
|
|
420
|
+
| `nk_reduce_minmax_u8_neon` | 18.2 gb/s | 20.4 gb/s | 21.4 gb/s |
|
|
421
|
+
| `nk_reduce_moments_u8_neonsdot` | 29.6 gb/s | 29.2 gb/s | 22.6 gb/s |
|
|
422
|
+
| __i4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
423
|
+
| `nk_reduce_moments_i4_serial` | 1.55 gb/s | 1.82 gb/s | 1.85 gb/s |
|
|
424
|
+
| `nk_reduce_minmax_i4_serial` | 0.512 gb/s | 0.524 gb/s | 0.532 gb/s |
|
|
425
|
+
| __u4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
426
|
+
| `nk_reduce_moments_u4_serial` | 1.63 gb/s | 1.94 gb/s | 2.00 gb/s |
|
|
427
|
+
| `nk_reduce_minmax_u4_serial` | 0.536 gb/s | 0.544 gb/s | 0.548 gb/s |
|
|
428
|
+
| __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
429
|
+
| `nk_reduce_moments_u1_serial` | 1.24 gb/s | 1.32 gb/s | 1.35 gb/s |
|
|
430
|
+
| `nk_reduce_minmax_u1_serial` | 7.22 gb/s | 28.9 gb/s | 111 gb/s |
|
|
431
|
+
| __i16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
432
|
+
| `nk_reduce_moments_i16_serial` | 4.97 gb/s | 5.16 gb/s | 5.17 gb/s |
|
|
433
|
+
| `nk_reduce_minmax_i16_serial` | 2.08 gb/s | 2.05 gb/s | 2.08 gb/s |
|
|
434
|
+
| `nk_reduce_moments_i16_neon` | 16.3 gb/s | 11.9 gb/s | 9.28 gb/s |
|
|
435
|
+
| `nk_reduce_minmax_i16_neon` | 20.4 gb/s | 20.1 gb/s | 21.1 gb/s |
|
|
436
|
+
| __u16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
437
|
+
| `nk_reduce_moments_u16_serial` | 4.97 gb/s | 5.11 gb/s | 5.20 gb/s |
|
|
438
|
+
| `nk_reduce_minmax_u16_serial` | 2.59 gb/s | 2.66 gb/s | 2.62 gb/s |
|
|
439
|
+
| `nk_reduce_moments_u16_neon` | 16.6 gb/s | 12.0 gb/s | 9.35 gb/s |
|
|
440
|
+
| `nk_reduce_minmax_u16_neon` | 20.2 gb/s | 21.1 gb/s | 21.4 gb/s |
|
|
441
|
+
| __i32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
442
|
+
| `nk_reduce_moments_i32_serial` | 5.66 gb/s | 5.62 gb/s | 5.77 gb/s |
|
|
443
|
+
| `nk_reduce_minmax_i32_serial` | 6.17 gb/s | 6.21 gb/s | 6.24 gb/s |
|
|
444
|
+
| `nk_reduce_moments_i32_neon` | 6.48 gb/s | 5.58 gb/s | 5.29 gb/s |
|
|
445
|
+
| `nk_reduce_minmax_i32_neon` | 20.9 gb/s | 19.3 gb/s | 21.2 gb/s |
|
|
446
|
+
| __u32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
447
|
+
| `nk_reduce_moments_u32_serial` | 5.72 gb/s | 5.70 gb/s | 5.88 gb/s |
|
|
448
|
+
| `nk_reduce_minmax_u32_serial` | 5.60 gb/s | 5.59 gb/s | 5.93 gb/s |
|
|
449
|
+
| `nk_reduce_moments_u32_neon` | 13.7 gb/s | 10.3 gb/s | 9.07 gb/s |
|
|
450
|
+
| `nk_reduce_minmax_u32_neon` | 20.6 gb/s | 21.4 gb/s | 21.3 gb/s |
|
|
451
|
+
| __i64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
452
|
+
| `nk_reduce_moments_i64_serial` | 7.53 gb/s | 7.55 gb/s | 7.59 gb/s |
|
|
453
|
+
| `nk_reduce_minmax_i64_serial` | 10.0 gb/s | 10.6 gb/s | 10.5 gb/s |
|
|
454
|
+
| `nk_reduce_moments_i64_neon` | 11.4 gb/s | 10.5 gb/s | 10.2 gb/s |
|
|
455
|
+
| `nk_reduce_minmax_i64_neon` | 14.3 gb/s | 13.4 gb/s | 13.0 gb/s |
|
|
456
|
+
| __u64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
457
|
+
| `nk_reduce_moments_u64_serial` | 7.11 gb/s | 7.44 gb/s | 7.48 gb/s |
|
|
458
|
+
| `nk_reduce_minmax_u64_serial` | 10.5 gb/s | 10.6 gb/s | 10.4 gb/s |
|
|
459
|
+
| `nk_reduce_moments_u64_neon` | 21.2 gb/s | 18.7 gb/s | 17.2 gb/s |
|
|
460
|
+
| `nk_reduce_minmax_u64_neon` | 14.3 gb/s | 13.2 gb/s | 12.9 gb/s |
|
|
461
|
+
|
|
462
|
+
#### WASM
|
|
463
|
+
|
|
464
|
+
Measured with Wasmtime v42 (Cranelift backend). Cells marked `?` have no recorded measurement for that input size.
|
|
465
|
+
|
|
466
|
+
| Kernel | 256 | 1024 | 4096 |
|
|
467
|
+
| :----------------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
468
|
+
| __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
469
|
+
| `nk_reduce_moments_f64_serial` | 4.19 gb/s, 0 ulp | 8.01 gb/s, 0 ulp | 8.28 gb/s, 0 ulp |
|
|
470
|
+
| `nk_reduce_moments_f64_v128relaxed` | 6.55 gb/s, 0 ulp | 16.7 gb/s, 0 ulp | 16.4 gb/s, 0 ulp |
|
|
471
|
+
| `nk_reduce_minmax_f64_serial` | ? | 15.6 gb/s, 0 ulp | 16.2 gb/s, 0 ulp |
|
|
472
|
+
| `nk_reduce_minmax_f64_v128relaxed` | 5.99 gb/s, 0 ulp | 13.8 gb/s, 0 ulp | 15.6 gb/s, 0 ulp |
|
|
473
|
+
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
474
|
+
| `nk_reduce_moments_f32_serial` | 1.75 gb/s, 0 ulp | 3.39 gb/s, 0 ulp | 3.50 gb/s, 0 ulp |
|
|
475
|
+
| `nk_reduce_moments_f32_v128relaxed` | 4.63 gb/s, 0.1 ulp | 8.81 gb/s, 0.5 ulp | 7.11 gb/s, 0 ulp |
|
|
476
|
+
| `nk_reduce_minmax_f32_serial` | ? | 5.27 gb/s, 0 ulp | 5.41 gb/s, 0 ulp |
|
|
477
|
+
| `nk_reduce_minmax_f32_v128relaxed` | 4.89 gb/s, 0 ulp | 12.7 gb/s, 0 ulp | 15.3 gb/s, 0 ulp |
|
|
478
|
+
| __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
479
|
+
| `nk_reduce_moments_bf16_serial` | 2.21 gb/s, 0 ulp | 2.08 gb/s, 0 ulp | 2.07 gb/s, 0 ulp |
|
|
480
|
+
| `nk_reduce_moments_bf16_v128relaxed` | 5.49 gb/s, 0 ulp | 9.10 gb/s, 0.2 ulp | 8.28 gb/s, 1.6 ulp |
|
|
481
|
+
| `nk_reduce_minmax_bf16_serial` | ? | 1.87 gb/s, 0 ulp | 1.99 gb/s, 0 ulp |
|
|
482
|
+
| `nk_reduce_minmax_bf16_v128relaxed` | 4.05 gb/s, 0 ulp | 8.80 gb/s, 0 ulp | 10.4 gb/s, 0 ulp |
|
|
483
|
+
| __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
484
|
+
| `nk_reduce_moments_f16_serial` | 2.05 gb/s, 0 ulp | 1.98 gb/s, 0 ulp | 2.00 gb/s, 0 ulp |
|
|
485
|
+
| `nk_reduce_moments_f16_v128relaxed` | 1.78 gb/s, 0 ulp | 3.94 gb/s, 0 ulp | 4.43 gb/s, 0.3 ulp |
|
|
486
|
+
| `nk_reduce_minmax_f16_serial` | ? | 1.86 gb/s, 0 ulp | 1.98 gb/s, 0 ulp |
|
|
487
|
+
| `nk_reduce_minmax_f16_v128relaxed` | 1.66 gb/s, 0 ulp | 3.70 gb/s, 0 ulp | 3.90 gb/s, 0 ulp |
|
|
488
|
+
| __e5m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
489
|
+
| `nk_reduce_moments_e5m2_serial` | 1.10 gb/s, 0 ulp | 1.05 gb/s, 0 ulp | 1.06 gb/s, 0 ulp |
|
|
490
|
+
| `nk_reduce_moments_e5m2_v128relaxed` | 1.16 gb/s, 0 ulp | 2.56 gb/s, 0 ulp | 2.83 gb/s, 0 ulp |
|
|
491
|
+
| `nk_reduce_minmax_e5m2_serial` | ? | 1.09 gb/s, 0 ulp | 1.14 gb/s, 0 ulp |
|
|
492
|
+
| `nk_reduce_minmax_e5m2_v128relaxed` | 3.09 gb/s, 0 ulp | 10.3 gb/s, 0 ulp | 14.6 gb/s, 0 ulp |
|
|
493
|
+
| __e4m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
494
|
+
| `nk_reduce_moments_e4m3_serial` | 0.731 gb/s, 0 ulp | 0.669 gb/s, 0 ulp | 0.668 gb/s, 0 ulp |
|
|
495
|
+
| `nk_reduce_moments_e4m3_v128relaxed` | 0.945 gb/s, 0 ulp | 2.10 gb/s, 0 ulp | 2.37 gb/s, 0 ulp |
|
|
496
|
+
| `nk_reduce_minmax_e4m3_serial` | ? | 1.09 gb/s, 0 ulp | 1.14 gb/s, 0 ulp |
|
|
497
|
+
| `nk_reduce_minmax_e4m3_v128relaxed` | 3.47 gb/s, 0 ulp | 9.70 gb/s, 0 ulp | 14.6 gb/s, 0 ulp |
|
|
498
|
+
| __e3m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
499
|
+
| `nk_reduce_moments_e3m2_serial` | 1.10 gb/s, 0 ulp | 1.00 gb/s, 0 ulp | 1.06 gb/s, 0 ulp |
|
|
500
|
+
| `nk_reduce_moments_e3m2_v128relaxed` | 2.03 gb/s, 0 ulp | 3.92 gb/s, 0 ulp | 4.01 gb/s, 0 ulp |
|
|
501
|
+
| `nk_reduce_minmax_e3m2_serial` | ? | 0.618 gb/s, 0 ulp | 0.633 gb/s, 0 ulp |
|
|
502
|
+
| `nk_reduce_minmax_e3m2_v128relaxed` | 3.34 gb/s, 0 ulp | 12.2 gb/s, 0 ulp | 15.8 gb/s, 0 ulp |
|
|
503
|
+
| __e2m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
504
|
+
| `nk_reduce_moments_e2m3_serial` | 1.10 gb/s, 0 ulp | 1.05 gb/s, 0 ulp | 1.06 gb/s, 0 ulp |
|
|
505
|
+
| `nk_reduce_moments_e2m3_v128relaxed` | 3.41 gb/s, 0 ulp | 7.66 gb/s, 0 ulp | 8.01 gb/s, 0 ulp |
|
|
506
|
+
| `nk_reduce_minmax_e2m3_serial` | ? | 0.618 gb/s, 0 ulp | 0.629 gb/s, 0 ulp |
|
|
507
|
+
| `nk_reduce_minmax_e2m3_v128relaxed` | 3.90 gb/s, 0 ulp | 11.0 gb/s, 0 ulp | 15.9 gb/s, 0 ulp |
|
|
508
|
+
| __i8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
509
|
+
| `nk_reduce_moments_i8_serial` | 1.86 gb/s | 3.46 gb/s | 3.53 gb/s |
|
|
510
|
+
| `nk_reduce_moments_i8_v128relaxed` | 6.64 gb/s | 16.8 gb/s | 13.2 gb/s |
|
|
511
|
+
| `nk_reduce_minmax_i8_serial` | ? | 1.38 gb/s | 1.43 gb/s |
|
|
512
|
+
| `nk_reduce_minmax_i8_v128relaxed` | 4.67 gb/s | 16.0 gb/s | 21.7 gb/s |
|
|
513
|
+
| __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
514
|
+
| `nk_reduce_moments_u8_serial` | 2.19 gb/s | 3.45 gb/s | 3.51 gb/s |
|
|
515
|
+
| `nk_reduce_moments_u8_v128relaxed` | 6.58 gb/s | 16.7 gb/s | 13.2 gb/s |
|
|
516
|
+
| `nk_reduce_minmax_u8_serial` | ? | 1.44 gb/s | 1.52 gb/s |
|
|
517
|
+
| `nk_reduce_minmax_u8_v128relaxed` | 4.52 gb/s | 15.3 gb/s | 21.6 gb/s |
|
|
518
|
+
| __i16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
519
|
+
| `nk_reduce_moments_i16_serial` | 4.83 gb/s | 6.94 gb/s | 7.03 gb/s |
|
|
520
|
+
| `nk_reduce_moments_i16_v128relaxed` | 4.64 gb/s | 8.63 gb/s | 7.74 gb/s |
|
|
521
|
+
| `nk_reduce_minmax_i16_serial` | ? | 2.75 gb/s | 2.88 gb/s |
|
|
522
|
+
| `nk_reduce_minmax_i16_v128relaxed` | 7.26 gb/s | 12.3 gb/s | 14.9 gb/s |
|
|
523
|
+
| __u16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
524
|
+
| `nk_reduce_moments_u16_serial` | 5.49 gb/s | 6.94 gb/s | 6.92 gb/s |
|
|
525
|
+
| `nk_reduce_moments_u16_v128relaxed` | 4.71 gb/s | 8.68 gb/s | 7.86 gb/s |
|
|
526
|
+
| `nk_reduce_minmax_u16_serial` | ? | 2.88 gb/s | 3.01 gb/s |
|
|
527
|
+
| `nk_reduce_minmax_u16_v128relaxed` | 7.79 gb/s | 12.2 gb/s | 14.9 gb/s |
|
|
528
|
+
| __i32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
529
|
+
| `nk_reduce_moments_i32_serial` | 4.10 gb/s | 5.38 gb/s | 5.31 gb/s |
|
|
530
|
+
| `nk_reduce_moments_i32_v128relaxed` | 2.23 gb/s | 5.74 gb/s | 5.75 gb/s |
|
|
531
|
+
| `nk_reduce_minmax_i32_serial` | ? | 6.38 gb/s | 6.73 gb/s |
|
|
532
|
+
| `nk_reduce_minmax_i32_v128relaxed` | 5.36 gb/s | 12.9 gb/s | 15.4 gb/s |
|
|
533
|
+
| __u32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
534
|
+
| `nk_reduce_moments_u32_serial` | 3.97 gb/s | 5.35 gb/s | 5.29 gb/s |
|
|
535
|
+
| `nk_reduce_moments_u32_v128relaxed` | 1.16 gb/s | 2.32 gb/s | 2.30 gb/s |
|
|
536
|
+
| `nk_reduce_minmax_u32_serial` | ? | 6.38 gb/s | 6.66 gb/s |
|
|
537
|
+
| `nk_reduce_minmax_u32_v128relaxed` | 5.45 gb/s | 12.9 gb/s | 15.7 gb/s |
|
|
538
|
+
| __i64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
539
|
+
| `nk_reduce_moments_i64_serial` | 5.57 gb/s | 7.93 gb/s | 7.99 gb/s |
|
|
540
|
+
| `nk_reduce_moments_i64_v128relaxed` | 2.61 gb/s | 6.93 gb/s | 6.92 gb/s |
|
|
541
|
+
| `nk_reduce_minmax_i64_serial` | ? | 12.7 gb/s | 13.2 gb/s |
|
|
542
|
+
| `nk_reduce_minmax_i64_v128relaxed` | 6.85 gb/s | 13.8 gb/s | 15.8 gb/s |
|
|
543
|
+
| __u64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
544
|
+
| `nk_reduce_moments_u64_serial` | 7.06 gb/s | 6.46 gb/s | 6.45 gb/s |
|
|
545
|
+
| `nk_reduce_moments_u64_v128relaxed` | 1.36 gb/s | 2.55 gb/s | 2.67 gb/s |
|
|
546
|
+
| `nk_reduce_minmax_u64_serial` | ? | 12.7 gb/s | 13.1 gb/s |
|
|
547
|
+
| `nk_reduce_minmax_u64_v128relaxed` | 1.72 gb/s | 2.79 gb/s | 3.07 gb/s |
|