numkong 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +495 -0
- package/binding.gyp +540 -0
- package/c/dispatch.h +512 -0
- package/c/dispatch_bf16.c +389 -0
- package/c/dispatch_bf16c.c +52 -0
- package/c/dispatch_e2m3.c +263 -0
- package/c/dispatch_e3m2.c +243 -0
- package/c/dispatch_e4m3.c +276 -0
- package/c/dispatch_e5m2.c +272 -0
- package/c/dispatch_f16.c +376 -0
- package/c/dispatch_f16c.c +58 -0
- package/c/dispatch_f32.c +378 -0
- package/c/dispatch_f32c.c +99 -0
- package/c/dispatch_f64.c +296 -0
- package/c/dispatch_f64c.c +98 -0
- package/c/dispatch_i16.c +96 -0
- package/c/dispatch_i32.c +89 -0
- package/c/dispatch_i4.c +150 -0
- package/c/dispatch_i64.c +86 -0
- package/c/dispatch_i8.c +289 -0
- package/c/dispatch_other.c +330 -0
- package/c/dispatch_u1.c +148 -0
- package/c/dispatch_u16.c +124 -0
- package/c/dispatch_u32.c +118 -0
- package/c/dispatch_u4.c +150 -0
- package/c/dispatch_u64.c +102 -0
- package/c/dispatch_u8.c +303 -0
- package/c/numkong.c +950 -0
- package/include/README.md +573 -0
- package/include/module.modulemap +129 -0
- package/include/numkong/attention/sapphireamx.h +1361 -0
- package/include/numkong/attention/sme.h +2066 -0
- package/include/numkong/attention.h +49 -0
- package/include/numkong/capabilities.h +748 -0
- package/include/numkong/cast/README.md +262 -0
- package/include/numkong/cast/haswell.h +975 -0
- package/include/numkong/cast/icelake.h +470 -0
- package/include/numkong/cast/neon.h +1192 -0
- package/include/numkong/cast/rvv.h +1021 -0
- package/include/numkong/cast/sapphire.h +262 -0
- package/include/numkong/cast/serial.h +2262 -0
- package/include/numkong/cast/skylake.h +856 -0
- package/include/numkong/cast/v128relaxed.h +180 -0
- package/include/numkong/cast.h +230 -0
- package/include/numkong/curved/README.md +223 -0
- package/include/numkong/curved/genoa.h +182 -0
- package/include/numkong/curved/haswell.h +276 -0
- package/include/numkong/curved/neon.h +205 -0
- package/include/numkong/curved/neonbfdot.h +212 -0
- package/include/numkong/curved/neonhalf.h +212 -0
- package/include/numkong/curved/rvv.h +305 -0
- package/include/numkong/curved/serial.h +207 -0
- package/include/numkong/curved/skylake.h +457 -0
- package/include/numkong/curved/smef64.h +506 -0
- package/include/numkong/curved.h +517 -0
- package/include/numkong/curved.hpp +144 -0
- package/include/numkong/dot/README.md +425 -0
- package/include/numkong/dot/alder.h +563 -0
- package/include/numkong/dot/genoa.h +315 -0
- package/include/numkong/dot/haswell.h +1688 -0
- package/include/numkong/dot/icelake.h +883 -0
- package/include/numkong/dot/neon.h +818 -0
- package/include/numkong/dot/neonbfdot.h +244 -0
- package/include/numkong/dot/neonfhm.h +360 -0
- package/include/numkong/dot/neonhalf.h +198 -0
- package/include/numkong/dot/neonsdot.h +508 -0
- package/include/numkong/dot/rvv.h +714 -0
- package/include/numkong/dot/rvvbb.h +72 -0
- package/include/numkong/dot/rvvbf16.h +123 -0
- package/include/numkong/dot/rvvhalf.h +129 -0
- package/include/numkong/dot/sapphire.h +141 -0
- package/include/numkong/dot/serial.h +838 -0
- package/include/numkong/dot/sierra.h +405 -0
- package/include/numkong/dot/skylake.h +1084 -0
- package/include/numkong/dot/sve.h +379 -0
- package/include/numkong/dot/svebfdot.h +74 -0
- package/include/numkong/dot/svehalf.h +123 -0
- package/include/numkong/dot/v128relaxed.h +1258 -0
- package/include/numkong/dot.h +1070 -0
- package/include/numkong/dot.hpp +94 -0
- package/include/numkong/dots/README.md +496 -0
- package/include/numkong/dots/alder.h +114 -0
- package/include/numkong/dots/genoa.h +94 -0
- package/include/numkong/dots/haswell.h +295 -0
- package/include/numkong/dots/icelake.h +171 -0
- package/include/numkong/dots/neon.h +120 -0
- package/include/numkong/dots/neonbfdot.h +58 -0
- package/include/numkong/dots/neonfhm.h +94 -0
- package/include/numkong/dots/neonhalf.h +57 -0
- package/include/numkong/dots/neonsdot.h +108 -0
- package/include/numkong/dots/rvv.h +2486 -0
- package/include/numkong/dots/sapphireamx.h +3973 -0
- package/include/numkong/dots/serial.h +2844 -0
- package/include/numkong/dots/sierra.h +97 -0
- package/include/numkong/dots/skylake.h +196 -0
- package/include/numkong/dots/sme.h +5372 -0
- package/include/numkong/dots/smebi32.h +461 -0
- package/include/numkong/dots/smef64.h +1318 -0
- package/include/numkong/dots/smehalf.h +47 -0
- package/include/numkong/dots/v128relaxed.h +294 -0
- package/include/numkong/dots.h +2804 -0
- package/include/numkong/dots.hpp +639 -0
- package/include/numkong/each/README.md +469 -0
- package/include/numkong/each/haswell.h +1658 -0
- package/include/numkong/each/icelake.h +272 -0
- package/include/numkong/each/neon.h +1104 -0
- package/include/numkong/each/neonbfdot.h +212 -0
- package/include/numkong/each/neonhalf.h +410 -0
- package/include/numkong/each/rvv.h +1121 -0
- package/include/numkong/each/sapphire.h +477 -0
- package/include/numkong/each/serial.h +260 -0
- package/include/numkong/each/skylake.h +1562 -0
- package/include/numkong/each.h +2146 -0
- package/include/numkong/each.hpp +434 -0
- package/include/numkong/geospatial/README.md +147 -0
- package/include/numkong/geospatial/haswell.h +593 -0
- package/include/numkong/geospatial/neon.h +571 -0
- package/include/numkong/geospatial/rvv.h +701 -0
- package/include/numkong/geospatial/serial.h +309 -0
- package/include/numkong/geospatial/skylake.h +577 -0
- package/include/numkong/geospatial/v128relaxed.h +613 -0
- package/include/numkong/geospatial.h +453 -0
- package/include/numkong/geospatial.hpp +235 -0
- package/include/numkong/matrix.hpp +336 -0
- package/include/numkong/maxsim/README.md +187 -0
- package/include/numkong/maxsim/alder.h +511 -0
- package/include/numkong/maxsim/genoa.h +115 -0
- package/include/numkong/maxsim/haswell.h +553 -0
- package/include/numkong/maxsim/icelake.h +480 -0
- package/include/numkong/maxsim/neonsdot.h +394 -0
- package/include/numkong/maxsim/sapphireamx.h +877 -0
- package/include/numkong/maxsim/serial.h +490 -0
- package/include/numkong/maxsim/sme.h +929 -0
- package/include/numkong/maxsim/v128relaxed.h +280 -0
- package/include/numkong/maxsim.h +571 -0
- package/include/numkong/maxsim.hpp +133 -0
- package/include/numkong/mesh/README.md +227 -0
- package/include/numkong/mesh/haswell.h +2235 -0
- package/include/numkong/mesh/neon.h +1329 -0
- package/include/numkong/mesh/neonbfdot.h +842 -0
- package/include/numkong/mesh/neonhalf.h +616 -0
- package/include/numkong/mesh/rvv.h +916 -0
- package/include/numkong/mesh/serial.h +742 -0
- package/include/numkong/mesh/skylake.h +1135 -0
- package/include/numkong/mesh/v128relaxed.h +1052 -0
- package/include/numkong/mesh.h +652 -0
- package/include/numkong/mesh.hpp +762 -0
- package/include/numkong/numkong.h +78 -0
- package/include/numkong/numkong.hpp +57 -0
- package/include/numkong/probability/README.md +173 -0
- package/include/numkong/probability/haswell.h +267 -0
- package/include/numkong/probability/neon.h +225 -0
- package/include/numkong/probability/rvv.h +409 -0
- package/include/numkong/probability/serial.h +169 -0
- package/include/numkong/probability/skylake.h +324 -0
- package/include/numkong/probability.h +383 -0
- package/include/numkong/probability.hpp +120 -0
- package/include/numkong/random.h +50 -0
- package/include/numkong/random.hpp +285 -0
- package/include/numkong/reduce/README.md +547 -0
- package/include/numkong/reduce/alder.h +632 -0
- package/include/numkong/reduce/genoa.h +201 -0
- package/include/numkong/reduce/haswell.h +3783 -0
- package/include/numkong/reduce/icelake.h +549 -0
- package/include/numkong/reduce/neon.h +3841 -0
- package/include/numkong/reduce/neonbfdot.h +353 -0
- package/include/numkong/reduce/neonfhm.h +665 -0
- package/include/numkong/reduce/neonhalf.h +157 -0
- package/include/numkong/reduce/neonsdot.h +357 -0
- package/include/numkong/reduce/rvv.h +3407 -0
- package/include/numkong/reduce/serial.h +757 -0
- package/include/numkong/reduce/sierra.h +338 -0
- package/include/numkong/reduce/skylake.h +3792 -0
- package/include/numkong/reduce/v128relaxed.h +2302 -0
- package/include/numkong/reduce.h +1597 -0
- package/include/numkong/reduce.hpp +633 -0
- package/include/numkong/scalar/README.md +89 -0
- package/include/numkong/scalar/haswell.h +113 -0
- package/include/numkong/scalar/neon.h +122 -0
- package/include/numkong/scalar/neonhalf.h +70 -0
- package/include/numkong/scalar/rvv.h +211 -0
- package/include/numkong/scalar/sapphire.h +63 -0
- package/include/numkong/scalar/serial.h +332 -0
- package/include/numkong/scalar/v128relaxed.h +56 -0
- package/include/numkong/scalar.h +683 -0
- package/include/numkong/set/README.md +179 -0
- package/include/numkong/set/haswell.h +334 -0
- package/include/numkong/set/icelake.h +485 -0
- package/include/numkong/set/neon.h +364 -0
- package/include/numkong/set/rvv.h +226 -0
- package/include/numkong/set/rvvbb.h +117 -0
- package/include/numkong/set/serial.h +174 -0
- package/include/numkong/set/sve.h +185 -0
- package/include/numkong/set/v128relaxed.h +240 -0
- package/include/numkong/set.h +457 -0
- package/include/numkong/set.hpp +114 -0
- package/include/numkong/sets/README.md +149 -0
- package/include/numkong/sets/haswell.h +63 -0
- package/include/numkong/sets/icelake.h +66 -0
- package/include/numkong/sets/neon.h +61 -0
- package/include/numkong/sets/serial.h +43 -0
- package/include/numkong/sets/smebi32.h +1099 -0
- package/include/numkong/sets/v128relaxed.h +58 -0
- package/include/numkong/sets.h +339 -0
- package/include/numkong/sparse/README.md +156 -0
- package/include/numkong/sparse/icelake.h +463 -0
- package/include/numkong/sparse/neon.h +288 -0
- package/include/numkong/sparse/serial.h +117 -0
- package/include/numkong/sparse/sve2.h +507 -0
- package/include/numkong/sparse/turin.h +322 -0
- package/include/numkong/sparse.h +363 -0
- package/include/numkong/sparse.hpp +113 -0
- package/include/numkong/spatial/README.md +435 -0
- package/include/numkong/spatial/alder.h +607 -0
- package/include/numkong/spatial/genoa.h +290 -0
- package/include/numkong/spatial/haswell.h +960 -0
- package/include/numkong/spatial/icelake.h +586 -0
- package/include/numkong/spatial/neon.h +773 -0
- package/include/numkong/spatial/neonbfdot.h +165 -0
- package/include/numkong/spatial/neonhalf.h +118 -0
- package/include/numkong/spatial/neonsdot.h +261 -0
- package/include/numkong/spatial/rvv.h +984 -0
- package/include/numkong/spatial/rvvbf16.h +123 -0
- package/include/numkong/spatial/rvvhalf.h +117 -0
- package/include/numkong/spatial/sapphire.h +343 -0
- package/include/numkong/spatial/serial.h +346 -0
- package/include/numkong/spatial/sierra.h +323 -0
- package/include/numkong/spatial/skylake.h +606 -0
- package/include/numkong/spatial/sve.h +224 -0
- package/include/numkong/spatial/svebfdot.h +122 -0
- package/include/numkong/spatial/svehalf.h +109 -0
- package/include/numkong/spatial/v128relaxed.h +717 -0
- package/include/numkong/spatial.h +1425 -0
- package/include/numkong/spatial.hpp +183 -0
- package/include/numkong/spatials/README.md +580 -0
- package/include/numkong/spatials/alder.h +94 -0
- package/include/numkong/spatials/genoa.h +94 -0
- package/include/numkong/spatials/haswell.h +219 -0
- package/include/numkong/spatials/icelake.h +113 -0
- package/include/numkong/spatials/neon.h +109 -0
- package/include/numkong/spatials/neonbfdot.h +60 -0
- package/include/numkong/spatials/neonfhm.h +92 -0
- package/include/numkong/spatials/neonhalf.h +58 -0
- package/include/numkong/spatials/neonsdot.h +109 -0
- package/include/numkong/spatials/rvv.h +1960 -0
- package/include/numkong/spatials/sapphireamx.h +1149 -0
- package/include/numkong/spatials/serial.h +226 -0
- package/include/numkong/spatials/sierra.h +96 -0
- package/include/numkong/spatials/skylake.h +184 -0
- package/include/numkong/spatials/sme.h +1901 -0
- package/include/numkong/spatials/smef64.h +465 -0
- package/include/numkong/spatials/v128relaxed.h +240 -0
- package/include/numkong/spatials.h +3021 -0
- package/include/numkong/spatials.hpp +508 -0
- package/include/numkong/tensor.hpp +1592 -0
- package/include/numkong/trigonometry/README.md +184 -0
- package/include/numkong/trigonometry/haswell.h +652 -0
- package/include/numkong/trigonometry/neon.h +639 -0
- package/include/numkong/trigonometry/rvv.h +699 -0
- package/include/numkong/trigonometry/serial.h +703 -0
- package/include/numkong/trigonometry/skylake.h +721 -0
- package/include/numkong/trigonometry/v128relaxed.h +666 -0
- package/include/numkong/trigonometry.h +467 -0
- package/include/numkong/trigonometry.hpp +166 -0
- package/include/numkong/types.h +1384 -0
- package/include/numkong/types.hpp +5603 -0
- package/include/numkong/vector.hpp +698 -0
- package/javascript/README.md +246 -0
- package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
- package/javascript/dist/cjs/numkong-wasm.js +617 -0
- package/javascript/dist/cjs/numkong.d.ts +343 -0
- package/javascript/dist/cjs/numkong.js +523 -0
- package/javascript/dist/cjs/package.json +3 -0
- package/javascript/dist/cjs/types.d.ts +284 -0
- package/javascript/dist/cjs/types.js +653 -0
- package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
- package/javascript/dist/esm/numkong-wasm.js +595 -0
- package/javascript/dist/esm/numkong.d.ts +343 -0
- package/javascript/dist/esm/numkong.js +452 -0
- package/javascript/dist/esm/package.json +3 -0
- package/javascript/dist/esm/types.d.ts +284 -0
- package/javascript/dist/esm/types.js +630 -0
- package/javascript/dist-package-cjs.json +3 -0
- package/javascript/dist-package-esm.json +3 -0
- package/javascript/node-gyp-build.d.ts +1 -0
- package/javascript/numkong-wasm.ts +756 -0
- package/javascript/numkong.c +689 -0
- package/javascript/numkong.ts +575 -0
- package/javascript/tsconfig-base.json +39 -0
- package/javascript/tsconfig-cjs.json +8 -0
- package/javascript/tsconfig-esm.json +8 -0
- package/javascript/types.ts +674 -0
- package/package.json +87 -0
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
# Trigonometric Functions in NumKong
|
|
2
|
+
|
|
3
|
+
NumKong implements element-wise trigonometric functions — sine, cosine, and arc tangent — with ~3 ulp error bounds for f32 and faithful rounding for f64.
|
|
4
|
+
Each function operates on dense vectors, reading input values (angles in radians for sine and cosine, arbitrary real numbers for arc tangent) and writing an output vector of the same length.
|
|
5
|
+
The implementations derive from SLEEF (SIMD Library for Evaluating Elementary Functions), adapted for NumKong's ISA dispatch and type system.
|
|
6
|
+
|
|
7
|
+
Sine:
|
|
8
|
+
|
|
9
|
+
```math
|
|
10
|
+
\text{sin}: \mathbb{R} \to [-1, 1]
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
Cosine:
|
|
14
|
+
|
|
15
|
+
```math
|
|
16
|
+
\text{cos}: \mathbb{R} \to [-1, 1]
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
Arc tangent:
|
|
20
|
+
|
|
21
|
+
```math
|
|
22
|
+
\text{atan}: \mathbb{R} \to \left(-\frac{\pi}{2}, \frac{\pi}{2}\right)
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Reformulating as Python pseudocode:
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
import numpy as np
|
|
29
|
+
|
|
30
|
+
def sin(a: np.ndarray) -> np.ndarray:
|
|
31
|
+
    return np.sin(a)
|
|
32
|
+
|
|
33
|
+
def cos(a: np.ndarray) -> np.ndarray:
|
|
34
|
+
    return np.cos(a)
|
|
35
|
+
|
|
36
|
+
def atan(a: np.ndarray) -> np.ndarray:
|
|
37
|
+
    return np.arctan(a)
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Input & Output Types
|
|
41
|
+
|
|
42
|
+
| Input Type | Output Type | Description |
|
|
43
|
+
| ---------- | ----------- | ------------------------------------------------ |
|
|
44
|
+
| `f64` | `f64` | 64-bit IEEE 754 double precision |
|
|
45
|
+
| `f32` | `f32` | 32-bit IEEE 754 single precision |
|
|
46
|
+
| `f16` | `f16` | 16-bit half precision, widened to f32 internally |
|
|
47
|
+
|
|
48
|
+
## Optimizations
|
|
49
|
+
|
|
50
|
+
### Cody-Waite Range Reduction
|
|
51
|
+
|
|
52
|
+
All trigonometric kernels reduce the input angle to $[-\pi/4, \pi/4]$ before polynomial evaluation using Cody-Waite argument reduction.
|
|
53
|
+
The constant $\pi$ is split into high and low parts ($\pi_{\text{hi}} + \pi_{\text{lo}}$) to maintain precision during the subtraction $x - n\pi$: `reduced = (x - n * pi_hi) - n * pi_lo`.
|
|
54
|
+
Single-part subtraction would lose ~3 bits of precision for large multiples of $\pi$; the two-part split preserves the full mantissa.
|
|
55
|
+
The quadrant index $n = \text{round}(x / \pi)$ selects which trigonometric identity to apply (sin-cos swap, sign flip) via a 2-bit branch.
|
|
56
|
+
|
|
57
|
+
### Minimax Polynomial Approximation
|
|
58
|
+
|
|
59
|
+
`nk_each_sin_f32_serial` and `nk_each_cos_f32_serial` evaluate degree-9 minimax polynomials via Horner's method after range reduction.
|
|
60
|
+
The polynomial coefficients are precomputed to minimize maximum error over $[-\pi/4, \pi/4]$ — Chebyshev-optimal, not Taylor truncation.
|
|
61
|
+
Horner evaluation: `p = c9*x^2 + c7; p = p*x^2 + c5; p = p*x^2 + c3; p = p*x^2 + c1; p = p*x` — 4 FMA operations plus 1 multiply for the final odd-power term.
|
|
62
|
+
`nk_each_sin_f64_serial` uses degree-19 polynomials for 52-bit mantissa coverage.
|
|
63
|
+
|
|
64
|
+
### Vectorized Polynomial Evaluation
|
|
65
|
+
|
|
66
|
+
Kernels such as `nk_each_sin_f32_haswell` (AVX2) and `nk_each_cos_f32_skylake` (AVX-512) evaluate the same polynomial on 8 or 16 elements simultaneously, respectively.
|
|
67
|
+
Range reduction, quadrant selection, and polynomial evaluation all operate on packed vectors — even the final sign correction stays vectorized, applied via `VBLENDVPS` (a packed blend) with the quadrant mask.
|
|
68
|
+
`nk_each_sin_f32_neon` processes 4 elements per iteration using `vfmaq_f32` for the Horner chain.
|
|
69
|
+
WASM v128relaxed (`nk_each_sin_f32_v128relaxed`) uses `f32x4.relaxed_madd` for the FMA steps, achieving ~2x throughput over strict `f32x4.mul` + `f32x4.add` sequences.
|
|
70
|
+
|
|
71
|
+
## Performance
|
|
72
|
+
|
|
73
|
+
The following performance tables are produced by manually re-running the bundled internal tools `nk_test` and `nk_bench` to measure both accuracy and throughput at different input sizes.
|
|
74
|
+
The input size is controlled by the `NK_DENSE_DIMENSIONS` environment variable and set to 256, 1024, and 4096 elements.
|
|
75
|
+
Throughput is reported in GB/s (printed as `gb/s` in the tables), counted as the number of input bytes processed per second.
|
|
76
|
+
Accuracy is reported as mean ULP (units in last place) unless noted otherwise — the average number of representable floating-point values between the result and the exact answer.
|
|
77
|
+
Each kernel runs for at least 20 seconds per configuration.
|
|
78
|
+
Benchmark threads are pinned to specific cores; on machines with heterogeneous core types (e.g., Apple P/E cores), only the fastest cores are used.
|
|
79
|
+
Workloads that significantly degrade CPU frequencies (Intel AMX, Apple SME) run in separate passes to avoid affecting throughput measurements of other kernels.
|
|
80
|
+
|
|
81
|
+
### Intel Sapphire Rapids
|
|
82
|
+
|
|
83
|
+
#### Native
|
|
84
|
+
|
|
85
|
+
| Kernel | 256 | 1024 | 4096 |
|
|
86
|
+
| :------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
87
|
+
| __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
88
|
+
| `nk_each_sin_f64_serial` | 0.994 gb/s, 0 ulp | 0.783 gb/s, 0 ulp | 0.827 gb/s, 0 ulp |
|
|
89
|
+
| `nk_each_cos_f64_serial` | 0.906 gb/s, 0 ulp | 0.784 gb/s, 0 ulp | 0.824 gb/s, 0 ulp |
|
|
90
|
+
| `nk_each_atan_f64_serial` | 0.307 gb/s, 0 ulp | 0.291 gb/s, 0 ulp | 0.291 gb/s, 0 ulp |
|
|
91
|
+
| `nk_each_sin_f64_haswell` | 4.59 gb/s, 0 ulp | 4.19 gb/s, 0 ulp | 4.04 gb/s, 0 ulp |
|
|
92
|
+
| `nk_each_cos_f64_haswell` | 4.25 gb/s, 0 ulp | 4.14 gb/s, 0 ulp | 3.92 gb/s, 0 ulp |
|
|
93
|
+
| `nk_each_atan_f64_haswell` | 3.83 gb/s, 0 ulp | 3.21 gb/s, 0 ulp | 3.49 gb/s, 0 ulp |
|
|
94
|
+
| `nk_each_sin_f64_skylake` | 7.65 gb/s, 0 ulp | 6.55 gb/s, 0 ulp | 4.70 gb/s, 0 ulp |
|
|
95
|
+
| `nk_each_cos_f64_skylake` | 7.88 gb/s, 0 ulp | 5.76 gb/s, 0 ulp | 5.01 gb/s, 0 ulp |
|
|
96
|
+
| `nk_each_atan_f64_skylake` | 5.08 gb/s, 0 ulp | 4.72 gb/s, 0 ulp | 4.58 gb/s, 0 ulp |
|
|
97
|
+
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
98
|
+
| `nk_each_sin_f32_serial` | 6.29 gb/s, 5 ulp | 6.07 gb/s, 5 ulp | 5.41 gb/s, 5 ulp |
|
|
99
|
+
| `nk_each_cos_f32_serial` | 7.03 gb/s, 15 ulp | 6.24 gb/s, 15 ulp | 5.16 gb/s, 15 ulp |
|
|
100
|
+
| `nk_each_atan_f32_serial` | 0.642 gb/s, 0.4 ulp | 0.541 gb/s, 0.4 ulp | 0.567 gb/s, 0.4 ulp |
|
|
101
|
+
| `nk_each_sin_f32_haswell` | 10.0 gb/s, 5 ulp | 7.36 gb/s, 5 ulp | 5.63 gb/s, 5 ulp |
|
|
102
|
+
| `nk_each_cos_f32_haswell` | 7.82 gb/s, 15 ulp | 7.11 gb/s, 15 ulp | 5.09 gb/s, 15 ulp |
|
|
103
|
+
| `nk_each_atan_f32_haswell` | 7.63 gb/s, 0.4 ulp | 5.94 gb/s, 0.4 ulp | 5.38 gb/s, 0.4 ulp |
|
|
104
|
+
| `nk_each_sin_f32_skylake` | 11.9 gb/s, 5 ulp | 9.14 gb/s, 5 ulp | 5.43 gb/s, 5 ulp |
|
|
105
|
+
| `nk_each_cos_f32_skylake` | 10.4 gb/s, 15 ulp | 8.26 gb/s, 15 ulp | 5.40 gb/s, 15 ulp |
|
|
106
|
+
| `nk_each_atan_f32_skylake` | 9.07 gb/s, 0.4 ulp | 7.80 gb/s, 0.4 ulp | 5.75 gb/s, 0.4 ulp |
|
|
107
|
+
| __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
108
|
+
| `nk_each_sin_f16_serial` | 0.112 gb/s, 0.9 ulp | 0.102 gb/s, 1.1 ulp | 0.110 gb/s, 0.9 ulp |
|
|
109
|
+
| `nk_each_cos_f16_serial` | 0.105 gb/s, 12 ulp | 0.0962 gb/s, 12 ulp | 0.0976 gb/s, 12 ulp |
|
|
110
|
+
| `nk_each_atan_f16_serial` | 0.0208 gb/s, 6.4 ulp | 0.0201 gb/s, 6.7 ulp | 0.0204 gb/s, 6.6 ulp |
|
|
111
|
+
| `nk_each_sin_f16_skylake` | 6.05 gb/s, 8.41K ulp | 5.81 gb/s, 8.43K ulp | 5.24 gb/s, 8.41K ulp |
|
|
112
|
+
| `nk_each_cos_f16_skylake` | 6.05 gb/s, 8.34K ulp | 5.20 gb/s, 8.34K ulp | 5.09 gb/s, 8.35K ulp |
|
|
113
|
+
| `nk_each_atan_f16_skylake` | 4.86 gb/s, 16.5K ulp | 5.25 gb/s, 16.6K ulp | 4.76 gb/s, 16.5K ulp |
|
|
114
|
+
|
|
115
|
+
#### WASM
|
|
116
|
+
|
|
117
|
+
Measured with Wasmtime v42 (Cranelift backend).
|
|
118
|
+
|
|
119
|
+
| Kernel | 256 | 1024 | 4096 |
|
|
120
|
+
| :----------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
121
|
+
| __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
122
|
+
| `nk_each_sin_f64_serial` | 0.34 gb/s, 0.2 ulp | 0.38 gb/s, 0.2 ulp | 0.08 gb/s, 0.2 ulp |
|
|
123
|
+
| `nk_each_cos_f64_serial` | 0.36 gb/s, 0.3 ulp | 0.39 gb/s, 0.3 ulp | 0.08 gb/s, 0.3 ulp |
|
|
124
|
+
| `nk_each_atan_f64_serial` | 0.11 gb/s, 0.3 ulp | 0.12 gb/s, 0.3 ulp | 0.11 gb/s, 0.3 ulp |
|
|
125
|
+
| `nk_each_sin_f64_v128relaxed` | 0.59 gb/s, 0.2 ulp | 0.26 gb/s, 0.2 ulp | 0.05 gb/s, 0.2 ulp |
|
|
126
|
+
| `nk_each_cos_f64_v128relaxed` | 0.29 gb/s, 0.3 ulp | 0.50 gb/s, 0.3 ulp | 0.03 gb/s, 0.3 ulp |
|
|
127
|
+
| `nk_each_atan_f64_v128relaxed` | 0.11 gb/s, 0.3 ulp | 0.48 gb/s, 0.3 ulp | 0.21 gb/s, 0.3 ulp |
|
|
128
|
+
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
129
|
+
| `nk_each_sin_f32_serial` | 0.17 gb/s, 4.9 ulp | 0.51 gb/s, 4.9 ulp | 0.07 gb/s, 4.9 ulp |
|
|
130
|
+
| `nk_each_cos_f32_serial` | 0.05 gb/s, 14.4 ulp | 0.41 gb/s, 14.4 ulp | 0.10 gb/s, 14.4 ulp |
|
|
131
|
+
| `nk_each_atan_f32_serial` | 0.08 gb/s, 0.4 ulp | 0.08 gb/s, 0.4 ulp | 0.09 gb/s, 0.4 ulp |
|
|
132
|
+
| `nk_each_sin_f32_v128relaxed` | 0.13 gb/s, 20.7 ulp | 0.01 gb/s, 20.7 ulp | 0.10 gb/s, 20.7 ulp |
|
|
133
|
+
| `nk_each_cos_f32_v128relaxed` | 0.15 gb/s, 21.9 ulp | 0.32 gb/s, 21.9 ulp | 0.05 gb/s, 21.9 ulp |
|
|
134
|
+
| `nk_each_atan_f32_v128relaxed` | 0.45 gb/s, 0.4 ulp | 0.39 gb/s, 0.4 ulp | 0.15 gb/s, 0.4 ulp |
|
|
135
|
+
| __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
136
|
+
| `nk_each_sin_f16_serial` | 0.07 gb/s, 1.1 ulp | 0.07 gb/s, 1.1 ulp | 0.07 gb/s, 1.1 ulp |
|
|
137
|
+
| `nk_each_cos_f16_serial` | 0.07 gb/s, 11.8 ulp | 0.07 gb/s, 11.8 ulp | 0.07 gb/s, 11.8 ulp |
|
|
138
|
+
| `nk_each_atan_f16_serial` | 0.03 gb/s, 6.5 ulp | 0.03 gb/s, 6.5 ulp | 0.03 gb/s, 6.5 ulp |
|
|
139
|
+
|
|
140
|
+
### Apple M4
|
|
141
|
+
|
|
142
|
+
#### Native
|
|
143
|
+
|
|
144
|
+
| Kernel | 256 | 1024 | 4096 |
|
|
145
|
+
| :------------------------ | -----------------------: | -----------------------: | -----------------------: |
|
|
146
|
+
| __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
147
|
+
| `nk_each_sin_f64_serial` | 0.627 gb/s, 0.2 ulp | 0.634 gb/s, 0.2 ulp | 0.639 gb/s, 0.2 ulp |
|
|
148
|
+
| `nk_each_cos_f64_serial` | 0.621 gb/s, 0.3 ulp | 0.632 gb/s, 0.3 ulp | 0.619 gb/s, 0.3 ulp |
|
|
149
|
+
| `nk_each_atan_f64_serial` | 0.153 gb/s, 0.3 ulp | 0.154 gb/s, 0.3 ulp | 0.153 gb/s, 0.3 ulp |
|
|
150
|
+
| `nk_each_sin_f64_neon` | 5.94 gb/s, 0.2 ulp | 5.75 gb/s, 0.2 ulp | 5.82 gb/s, 0.2 ulp |
|
|
151
|
+
| `nk_each_cos_f64_neon` | 5.15 gb/s, 0.3 ulp | 5.36 gb/s, 0.3 ulp | 5.37 gb/s, 0.3 ulp |
|
|
152
|
+
| `nk_each_atan_f64_neon` | 3.53 gb/s, 0.3 ulp | 3.50 gb/s, 0.3 ulp | 3.50 gb/s, 0.3 ulp |
|
|
153
|
+
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
154
|
+
| `nk_each_sin_f32_serial` | 7.94 gb/s, 4.9 ulp | 7.94 gb/s, 4.9 ulp | 7.23 gb/s, 4.9 ulp |
|
|
155
|
+
| `nk_each_cos_f32_serial` | 7.26 gb/s, 14 ulp | 6.41 gb/s, 14 ulp | 6.52 gb/s, 14 ulp |
|
|
156
|
+
| `nk_each_atan_f32_serial` | 0.128 gb/s, 0.4 ulp | 0.129 gb/s, 0.4 ulp | 0.126 gb/s, 0.4 ulp |
|
|
157
|
+
| `nk_each_sin_f32_neon` | 9.75 gb/s, 4.9 ulp | 9.44 gb/s, 4.9 ulp | 8.13 gb/s, 4.9 ulp |
|
|
158
|
+
| `nk_each_cos_f32_neon` | 8.68 gb/s, 18 ulp | 7.77 gb/s, 18 ulp | 7.84 gb/s, 18 ulp |
|
|
159
|
+
| `nk_each_atan_f32_neon` | 5.57 gb/s, 0.4 ulp | 5.00 gb/s, 0.4 ulp | 5.10 gb/s, 0.4 ulp |
|
|
160
|
+
| __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
161
|
+
| `nk_each_sin_f16_serial` | 3.66 gb/s, 1.3 ulp | 3.71 gb/s, 1.3 ulp | 3.38 gb/s, 1.3 ulp |
|
|
162
|
+
| `nk_each_cos_f16_serial` | 3.28 gb/s, 12 ulp | 3.29 gb/s, 12 ulp | 3.15 gb/s, 12 ulp |
|
|
163
|
+
| `nk_each_atan_f16_serial` | 0.0639 gb/s, 6.5 ulp | 0.0626 gb/s, 6.5 ulp | 0.0627 gb/s, 6.5 ulp |
|
|
164
|
+
|
|
165
|
+
#### WASM
|
|
166
|
+
|
|
167
|
+
Measured with Wasmtime v42 (Cranelift backend).
|
|
168
|
+
|
|
169
|
+
| Kernel | 256 | 1024 | 4096 |
|
|
170
|
+
| :----------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
171
|
+
| __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
172
|
+
| `nk_each_sin_f64_serial` | 0.619 gb/s, 0.2 ulp | 0.517 gb/s, 0.2 ulp | 0.705 gb/s, 0.2 ulp |
|
|
173
|
+
| `nk_each_sin_f64_v128relaxed` | 9.10 gb/s, 0.2 ulp | 3.58 gb/s, 0.2 ulp | 8.93 gb/s, 0.2 ulp |
|
|
174
|
+
| `nk_each_cos_f64_serial` | 0.595 gb/s, 0.3 ulp | 0.501 gb/s, 0.3 ulp | 0.681 gb/s, 0.3 ulp |
|
|
175
|
+
| `nk_each_cos_f64_v128relaxed` | 9.35 gb/s, 0.3 ulp | 3.99 gb/s, 0.3 ulp | 9.16 gb/s, 0.3 ulp |
|
|
176
|
+
| `nk_each_atan_f64_serial` | 0.188 gb/s, 0.3 ulp | 0.157 gb/s, 0.3 ulp | 0.214 gb/s, 0.3 ulp |
|
|
177
|
+
| `nk_each_atan_f64_v128relaxed` | 6.22 gb/s, 0.3 ulp | 2.44 gb/s, 0.3 ulp | 6.06 gb/s, 0.3 ulp |
|
|
178
|
+
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
179
|
+
| `nk_each_sin_f32_serial` | 8.08 gb/s, 4.9 ulp | 4.26 gb/s, 4.9 ulp | 9.32 gb/s, 4.9 ulp |
|
|
180
|
+
| `nk_each_sin_f32_v128relaxed` | 19.1 gb/s, 20 ulp | 7.19 gb/s, 20 ulp | 17.1 gb/s, 20 ulp |
|
|
181
|
+
| `nk_each_cos_f32_serial` | 7.45 gb/s, 14 ulp | 3.88 gb/s, 14 ulp | 8.52 gb/s, 14 ulp |
|
|
182
|
+
| `nk_each_cos_f32_v128relaxed` | 17.8 gb/s, 21 ulp | 7.06 gb/s, 21 ulp | 16.2 gb/s, 21 ulp |
|
|
183
|
+
| `nk_each_atan_f32_serial` | 0.151 gb/s, 0.4 ulp | 0.0950 gb/s, 0.4 ulp | 0.175 gb/s, 0.4 ulp |
|
|
184
|
+
| `nk_each_atan_f32_v128relaxed` | 11.3 gb/s, 0.4 ulp | 4.22 gb/s, 0.4 ulp | 10.9 gb/s, 0.4 ulp |
|