numkong 7.4.1 → 7.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +86 -130
- package/binding.gyp +16 -0
- package/c/numkong.c +1 -1
- package/include/numkong/attention/sapphireamx.h +2 -2
- package/include/numkong/attention/sme.h +2 -2
- package/include/numkong/capabilities.h +47 -47
- package/include/numkong/cast/diamond.h +2 -2
- package/include/numkong/cast/haswell.h +2 -2
- package/include/numkong/cast/icelake.h +2 -2
- package/include/numkong/cast/loongsonasx.h +2 -2
- package/include/numkong/cast/neon.h +2 -2
- package/include/numkong/cast/powervsx.h +2 -2
- package/include/numkong/cast/rvv.h +2 -2
- package/include/numkong/cast/sapphire.h +2 -2
- package/include/numkong/cast/skylake.h +2 -2
- package/include/numkong/curved/genoa.h +2 -2
- package/include/numkong/curved/haswell.h +2 -2
- package/include/numkong/curved/neon.h +2 -2
- package/include/numkong/curved/neonbfdot.h +2 -2
- package/include/numkong/curved/rvv.h +2 -2
- package/include/numkong/curved/skylake.h +2 -2
- package/include/numkong/curved/smef64.h +2 -2
- package/include/numkong/dot/alder.h +2 -2
- package/include/numkong/dot/diamond.h +2 -2
- package/include/numkong/dot/genoa.h +2 -2
- package/include/numkong/dot/haswell.h +2 -2
- package/include/numkong/dot/icelake.h +2 -2
- package/include/numkong/dot/loongsonasx.h +2 -2
- package/include/numkong/dot/neon.h +2 -2
- package/include/numkong/dot/neonbfdot.h +2 -2
- package/include/numkong/dot/neonfhm.h +2 -2
- package/include/numkong/dot/neonfp8.h +2 -2
- package/include/numkong/dot/neonsdot.h +2 -2
- package/include/numkong/dot/rvv.h +2 -2
- package/include/numkong/dot/rvvbb.h +2 -2
- package/include/numkong/dot/rvvbf16.h +2 -2
- package/include/numkong/dot/rvvhalf.h +2 -2
- package/include/numkong/dot/sapphire.h +2 -2
- package/include/numkong/dot/sierra.h +2 -2
- package/include/numkong/dot/skylake.h +2 -2
- package/include/numkong/dot/sve.h +2 -2
- package/include/numkong/dot/svebfdot.h +2 -2
- package/include/numkong/dot/svehalf.h +2 -2
- package/include/numkong/dot/svesdot.h +2 -2
- package/include/numkong/dots/alder.h +2 -2
- package/include/numkong/dots/diamond.h +2 -2
- package/include/numkong/dots/genoa.h +2 -2
- package/include/numkong/dots/haswell.h +2 -2
- package/include/numkong/dots/icelake.h +2 -2
- package/include/numkong/dots/loongsonasx.h +2 -2
- package/include/numkong/dots/neon.h +2 -2
- package/include/numkong/dots/neonbfdot.h +2 -2
- package/include/numkong/dots/neonfhm.h +2 -2
- package/include/numkong/dots/neonfp8.h +2 -2
- package/include/numkong/dots/neonsdot.h +2 -2
- package/include/numkong/dots/powervsx.h +2 -2
- package/include/numkong/dots/rvv.h +2 -2
- package/include/numkong/dots/sapphireamx.h +2 -2
- package/include/numkong/dots/sierra.h +2 -2
- package/include/numkong/dots/skylake.h +2 -2
- package/include/numkong/dots/sme.h +10 -10
- package/include/numkong/dots/smebi32.h +2 -2
- package/include/numkong/dots/smef64.h +2 -2
- package/include/numkong/dots/smehalf.h +2 -2
- package/include/numkong/each/haswell.h +2 -2
- package/include/numkong/each/icelake.h +2 -2
- package/include/numkong/each/neon.h +2 -2
- package/include/numkong/each/neonbfdot.h +2 -2
- package/include/numkong/each/neonhalf.h +2 -2
- package/include/numkong/each/rvv.h +2 -2
- package/include/numkong/each/sapphire.h +2 -2
- package/include/numkong/each/skylake.h +2 -2
- package/include/numkong/geospatial/haswell.h +2 -2
- package/include/numkong/geospatial/neon.h +2 -2
- package/include/numkong/geospatial/rvv.h +2 -2
- package/include/numkong/geospatial/skylake.h +2 -2
- package/include/numkong/maxsim/alder.h +2 -2
- package/include/numkong/maxsim/genoa.h +2 -2
- package/include/numkong/maxsim/haswell.h +2 -2
- package/include/numkong/maxsim/icelake.h +2 -2
- package/include/numkong/maxsim/neonsdot.h +2 -2
- package/include/numkong/maxsim/sapphireamx.h +2 -2
- package/include/numkong/maxsim/sme.h +2 -2
- package/include/numkong/mesh/haswell.h +2 -2
- package/include/numkong/mesh/neon.h +2 -2
- package/include/numkong/mesh/neonbfdot.h +2 -2
- package/include/numkong/mesh/rvv.h +2 -2
- package/include/numkong/mesh/skylake.h +2 -2
- package/include/numkong/numkong.h +1 -1
- package/include/numkong/probability/haswell.h +2 -2
- package/include/numkong/probability/neon.h +2 -2
- package/include/numkong/probability/rvv.h +2 -2
- package/include/numkong/probability/skylake.h +2 -2
- package/include/numkong/reduce/alder.h +2 -2
- package/include/numkong/reduce/genoa.h +2 -2
- package/include/numkong/reduce/haswell.h +2 -2
- package/include/numkong/reduce/icelake.h +2 -2
- package/include/numkong/reduce/neon.h +2 -2
- package/include/numkong/reduce/neonbfdot.h +2 -2
- package/include/numkong/reduce/neonfhm.h +2 -2
- package/include/numkong/reduce/neonsdot.h +2 -2
- package/include/numkong/reduce/rvv.h +2 -2
- package/include/numkong/reduce/sierra.h +2 -2
- package/include/numkong/reduce/skylake.h +2 -2
- package/include/numkong/scalar/haswell.h +2 -2
- package/include/numkong/scalar/loongsonasx.h +2 -2
- package/include/numkong/scalar/neon.h +2 -2
- package/include/numkong/scalar/neonhalf.h +2 -2
- package/include/numkong/scalar/powervsx.h +2 -2
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +2 -2
- package/include/numkong/set/haswell.h +2 -2
- package/include/numkong/set/icelake.h +2 -2
- package/include/numkong/set/loongsonasx.h +2 -2
- package/include/numkong/set/neon.h +2 -2
- package/include/numkong/set/powervsx.h +2 -2
- package/include/numkong/set/rvv.h +2 -2
- package/include/numkong/set/rvvbb.h +2 -2
- package/include/numkong/set/sve.h +2 -2
- package/include/numkong/sets/haswell.h +2 -2
- package/include/numkong/sets/icelake.h +2 -2
- package/include/numkong/sets/loongsonasx.h +2 -2
- package/include/numkong/sets/neon.h +2 -2
- package/include/numkong/sets/powervsx.h +2 -2
- package/include/numkong/sets/smebi32.h +2 -2
- package/include/numkong/sparse/icelake.h +2 -2
- package/include/numkong/sparse/neon.h +2 -2
- package/include/numkong/sparse/sve2.h +2 -2
- package/include/numkong/sparse/turin.h +2 -2
- package/include/numkong/spatial/alder.h +2 -2
- package/include/numkong/spatial/diamond.h +2 -2
- package/include/numkong/spatial/genoa.h +2 -2
- package/include/numkong/spatial/haswell.h +2 -2
- package/include/numkong/spatial/icelake.h +2 -2
- package/include/numkong/spatial/loongsonasx.h +2 -2
- package/include/numkong/spatial/neon.h +2 -2
- package/include/numkong/spatial/neonbfdot.h +2 -2
- package/include/numkong/spatial/neonfp8.h +2 -2
- package/include/numkong/spatial/neonsdot.h +2 -2
- package/include/numkong/spatial/powervsx.h +2 -2
- package/include/numkong/spatial/rvv.h +2 -2
- package/include/numkong/spatial/rvvbf16.h +2 -2
- package/include/numkong/spatial/rvvhalf.h +2 -2
- package/include/numkong/spatial/sierra.h +2 -2
- package/include/numkong/spatial/skylake.h +2 -2
- package/include/numkong/spatial/sve.h +2 -2
- package/include/numkong/spatial/svebfdot.h +2 -2
- package/include/numkong/spatial/svehalf.h +2 -2
- package/include/numkong/spatial/svesdot.h +2 -2
- package/include/numkong/spatials/alder.h +2 -2
- package/include/numkong/spatials/diamond.h +2 -2
- package/include/numkong/spatials/genoa.h +2 -2
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/icelake.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +2 -2
- package/include/numkong/spatials/neon.h +2 -2
- package/include/numkong/spatials/neonbfdot.h +2 -2
- package/include/numkong/spatials/neonfhm.h +2 -2
- package/include/numkong/spatials/neonfp8.h +2 -2
- package/include/numkong/spatials/neonsdot.h +2 -2
- package/include/numkong/spatials/powervsx.h +2 -2
- package/include/numkong/spatials/rvv.h +2 -2
- package/include/numkong/spatials/sapphireamx.h +2 -2
- package/include/numkong/spatials/sierra.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +2 -2
- package/include/numkong/spatials/smef64.h +2 -2
- package/include/numkong/trigonometry/haswell.h +2 -2
- package/include/numkong/trigonometry/neon.h +2 -2
- package/include/numkong/trigonometry/rvv.h +2 -2
- package/include/numkong/trigonometry/skylake.h +2 -2
- package/include/numkong/types.h +88 -80
- package/package.json +7 -7
package/README.md
CHANGED
|
@@ -10,39 +10,39 @@ Most libraries return dot products in the __same type as the input__ — Float16
|
|
|
10
10
|
This leads to quiet overflow: a 2048-dimensional `i8` dot product can reach ±10 million, but `i8` maxes out at 127.
|
|
11
11
|
NumKong promotes to wider accumulators — Float16 → Float32, BFloat16 → Float32, Int8 → Int32, Float32 → Float64 — so results stay in range.
|
|
12
12
|
|
|
13
|
-
|
|
13
|
+
| Input | NumPy + OpenBLAS | PyTorch + MKL | JAX | NumKong |
|
|
14
|
+
| :----- | -------------------: | -------------------: | -------------------: | --------------------: |
|
|
15
|
+
| | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ |
|
|
16
|
+
| `f64` | 2.0 gso/s, 1e-15 err | 0.6 gso/s, 1e-15 err | 0.4 gso/s, 1e-14 err | 5.8 gso/s, 1e-16 err |
|
|
17
|
+
| `f32` | 1.5 gso/s, 2e-6 err | 0.6 gso/s, 2e-6 err | 0.4 gso/s, 5e-6 err | 7.1 gso/s, 2e-7 err |
|
|
18
|
+
| `bf16` | — | 0.5 gso/s, 1.9% err | 0.5 gso/s, 1.9% err | 9.7 gso/s, 1.8% err |
|
|
19
|
+
| `f16` | 0.2 gso/s, 0.25% err | 0.5 gso/s, 0.25% err | 0.4 gso/s, 0.25% err | 11.5 gso/s, 0.24% err |
|
|
20
|
+
| `e5m2` | — | 0.7 gso/s, 4.6% err | 0.5 gso/s, 4.6% err | 7.1 gso/s, 0% err |
|
|
21
|
+
| `i8` | 1.1 gso/s, overflow | 0.5 gso/s, overflow | 0.5 gso/s, overflow | 14.8 gso/s, 0% err |
|
|
22
|
+
|
|
23
|
+
> Single 2048-d dot product on Intel Sapphire Rapids, single-threaded.
|
|
14
24
|
> Each cell shows __gso/s, mean relative error__ vs higher-precision reference.
|
|
15
25
|
> gso/s = Giga Scalar Operations per Second — a more suitable name than GFLOP/s when counting both integer and floating-point work.
|
|
16
26
|
> NumPy 2.4, PyTorch 2.10, JAX 0.9.
|
|
17
27
|
|
|
18
|
-
| Input | NumPy + OpenBLAS | PyTorch + MKL | JAX | NumKong |
|
|
19
|
-
| :----- | ----------------------: | ----------------------: | ----------------------: | --------------------: |
|
|
20
|
-
| | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ |
|
|
21
|
-
| `f64` | 2.0 gso/s, 1e-15 err | 0.6 gso/s, 1e-15 err | 0.4 gso/s, 1e-14 err | 5.8 gso/s, 1e-16 err |
|
|
22
|
-
| `f32` | 1.5 gso/s, 2e-6 err | 0.6 gso/s, 2e-6 err | 0.4 gso/s, 5e-6 err | 7.1 gso/s, 2e-7 err |
|
|
23
|
-
| `bf16` | — | 0.5 gso/s, 1.9% err | 0.5 gso/s, 1.9% err | 9.7 gso/s, 1.8% err |
|
|
24
|
-
| `f16` | 0.2 gso/s, 0.25% err | 0.5 gso/s, 0.25% err | 0.4 gso/s, 0.25% err | 11.5 gso/s, 0.24% err |
|
|
25
|
-
| `e5m2` | — | 0.7 gso/s, 4.6% err | 0.5 gso/s, 4.6% err | 7.1 gso/s, 0% err |
|
|
26
|
-
| `i8` | 1.1 gso/s, __overflow__ | 0.5 gso/s, __overflow__ | 0.5 gso/s, __overflow__ | 14.8 gso/s, 0% err |
|
|
27
|
-
|
|
28
28
|
A fair objection: PyTorch and JAX are designed for throughput, not single-call latency.
|
|
29
29
|
They lower execution graphs through [XLA](https://openxla.org/) or vendored BLAS libraries like [Intel MKL](https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl.html) and Nvidia [cuBLAS](https://developer.nvidia.com/cublas).
|
|
30
30
|
So here's the same comparison on a throughput-oriented workload — matrix multiplication:
|
|
31
31
|
|
|
32
|
+
| Input | NumPy + OpenBLAS | PyTorch + MKL | JAX | NumKong |
|
|
33
|
+
| :----- | --------------------: | --------------------: | ---------------------: | -------------------: |
|
|
34
|
+
| | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ |
|
|
35
|
+
| `f64` | 65.5 gso/s, 1e-15 err | 68.2 gso/s, 1e-15 err | ~14.3 gso/s, 1e-15 err | 8.6 gso/s, 1e-16 err |
|
|
36
|
+
| `f32` | 140 gso/s, 9e-7 err | 145 gso/s, 1e-6 err | ~60.5 gso/s, 1e-6 err | 37.7 gso/s, 4e-7 err |
|
|
37
|
+
| `bf16` | — | 851 gso/s, 1.8% err | ~25.8 gso/s, 3.4% err | 458 gso/s, 3.6% err |
|
|
38
|
+
| `f16` | 0.3 gso/s, 0.25% err | 140 gso/s, 0.37% err | ~26.1 gso/s, 0.35% err | 103 gso/s, 0.26% err |
|
|
39
|
+
| `e5m2` | — | 0.4 gso/s, 4.6% err | ~26.4 gso/s, 4.6% err | 398 gso/s, 0% err |
|
|
40
|
+
| `i8` | 0.4 gso/s, overflow | 50.0 gso/s, overflow | ~0.0 gso/s, overflow | 1279 gso/s, 0% err |
|
|
41
|
+
|
|
32
42
|
> Matrix multiplication (2048 × 2048) × (2048 × 2048) on Intel Sapphire Rapids, single-threaded.
|
|
33
43
|
> gso/s = Giga Scalar Operations per Second, same format.
|
|
34
44
|
> NumPy 2.4, PyTorch 2.10, JAX 0.9, same versions.
|
|
35
45
|
|
|
36
|
-
| Input | NumPy + OpenBLAS | PyTorch + MKL | JAX | NumKong |
|
|
37
|
-
| :----- | ----------------------: | -----------------------: | -----------------------: | -------------------: |
|
|
38
|
-
| | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ |
|
|
39
|
-
| `f64` | 65.5 gso/s, 1e-15 err | 68.2 gso/s, 1e-15 err | ~14.3 gso/s, 1e-15 err | 8.6 gso/s, 1e-16 err |
|
|
40
|
-
| `f32` | 140 gso/s, 9e-7 err | 145 gso/s, 1e-6 err | ~60.5 gso/s, 1e-6 err | 37.7 gso/s, 4e-7 err |
|
|
41
|
-
| `bf16` | — | 851 gso/s, 1.8% err | ~25.8 gso/s, 3.4% err | 458 gso/s, 3.6% err |
|
|
42
|
-
| `f16` | 0.3 gso/s, 0.25% err | 140 gso/s, 0.37% err | ~26.1 gso/s, 0.35% err | 103 gso/s, 0.26% err |
|
|
43
|
-
| `e5m2` | — | 0.4 gso/s, 4.6% err | ~26.4 gso/s, 4.6% err | 398 gso/s, 0% err |
|
|
44
|
-
| `i8` | 0.4 gso/s, __overflow__ | 50.0 gso/s, __overflow__ | ~0.0 gso/s, __overflow__ | 1279 gso/s, 0% err |
|
|
45
|
-
|
|
46
46
|
For `f64`, compensated "Dot2" summation reduces error by 10–50× compared to naive Float64 accumulation, depending on vector length.
|
|
47
47
|
For `f32`, widening to Float64 gives 5–10× lower error.
|
|
48
48
|
The library ships as a relatively small binary:
|
|
@@ -77,27 +77,27 @@ NumKong covers 17 numeric types — from 6-bit floats to 64-bit complex numbers
|
|
|
77
77
|
|
|
78
78
|
### Language Bindings
|
|
79
79
|
|
|
80
|
-
| Operation | [C
|
|
81
|
-
| :-------------------------- |
|
|
82
|
-
| __Vector Ops__ |
|
|
83
|
-
| [Dot] Product |
|
|
84
|
-
| [Spatial] Metric |
|
|
85
|
-
| [Set] Similarity |
|
|
86
|
-
| [Geo]spatial |
|
|
87
|
-
| [Mesh] Alignment |
|
|
88
|
-
| [Sparse] Products |
|
|
89
|
-
| [
|
|
90
|
-
| [Curved] Spaces |
|
|
91
|
-
| __Many-to-Many Vector Ops__ |
|
|
92
|
-
| "[Dots]" Products |
|
|
93
|
-
| "[Spatials]" Metrics |
|
|
94
|
-
| "[Sets]" Similarities |
|
|
95
|
-
| [MaxSim] Scoring |
|
|
96
|
-
| __Scalar Ops__ |
|
|
97
|
-
| [Cast] |
|
|
98
|
-
| [Reduce] |
|
|
99
|
-
| [Each] |
|
|
100
|
-
| [
|
|
80
|
+
| Operation | [C 99 & C++ 23][c] | [Python][py] | [Rust][rs] | [JavaScript][js] | [Swift][swift] | [GoLang][go] |
|
|
81
|
+
| :-------------------------- | :----------------: | :----------: | :--------: | :--------------: | :------------: | :----------: |
|
|
82
|
+
| __Vector Ops__ | | | | | | |
|
|
83
|
+
| [Dot] Product | ● | ● | ● | ● | ● | ● |
|
|
84
|
+
| [Spatial] Metric | ● | ● | ● | ● | ● | ● |
|
|
85
|
+
| [Set] Similarity | ● | ● | ● | ● | ● | ● |
|
|
86
|
+
| [Geo]spatial | ● | ● | ● | · | ● | ● |
|
|
87
|
+
| [Mesh] Alignment | ● | ● | ● | · | · | · |
|
|
88
|
+
| [Sparse] Products | ● | ● | ● | · | · | · |
|
|
89
|
+
| [Probability] Divergences | ● | ● | ● | ● | · | ● |
|
|
90
|
+
| [Curved] Spaces | ● | ● | ● | · | · | · |
|
|
91
|
+
| __Many-to-Many Vector Ops__ | | | | | | |
|
|
92
|
+
| "[Dots]" Products | ● | ● | ● | ● | ● | ● |
|
|
93
|
+
| "[Spatials]" Metrics | ● | ● | ● | ● | ● | ● |
|
|
94
|
+
| "[Sets]" Similarities | ● | ● | ● | · | ● | ● |
|
|
95
|
+
| [MaxSim] Scoring | ● | ● | ● | · | ● | ● |
|
|
96
|
+
| __Scalar Ops__ | | | | | | |
|
|
97
|
+
| [Cast] | ● | ● | ● | ● | · | · |
|
|
98
|
+
| [Reduce] | ● | ● | ● | · | · | · |
|
|
99
|
+
| [Each] | ● | ● | ● | · | · | · |
|
|
100
|
+
| [Trigonometry] | ● | ● | ● | · | · | · |
|
|
101
101
|
|
|
102
102
|
[Dot]: include/numkong/dot/README.md
|
|
103
103
|
[Dots]: include/numkong/dots/README.md
|
|
@@ -107,12 +107,12 @@ NumKong covers 17 numeric types — from 6-bit floats to 64-bit complex numbers
|
|
|
107
107
|
[Sets]: include/numkong/sets/README.md
|
|
108
108
|
[Cast]: include/numkong/cast/README.md
|
|
109
109
|
[Reduce]: include/numkong/reduce/README.md
|
|
110
|
-
[
|
|
110
|
+
[Trigonometry]: include/numkong/trigonometry/README.md
|
|
111
111
|
[MaxSim]: include/numkong/maxsim/README.md
|
|
112
112
|
[Mesh]: include/numkong/mesh/README.md
|
|
113
113
|
[Each]: include/numkong/each/README.md
|
|
114
114
|
[Sparse]: include/numkong/sparse/README.md
|
|
115
|
-
[
|
|
115
|
+
[Probability]: include/numkong/probability/README.md
|
|
116
116
|
[Curved]: include/numkong/curved/README.md
|
|
117
117
|
[Geo]: include/numkong/geospatial/README.md
|
|
118
118
|
[c]: include/README.md
|
|
@@ -122,50 +122,6 @@ NumKong covers 17 numeric types — from 6-bit floats to 64-bit complex numbers
|
|
|
122
122
|
[swift]: swift/README.md
|
|
123
123
|
[go]: golang/README.md
|
|
124
124
|
|
|
125
|
-
### Numeric Types × Backend
|
|
126
|
-
|
|
127
|
-
| Backend | f64 | f32 | bf16 | f16 | e5m2 | e4m3 | e3m2 | e2m3 | i8 | u8 | i4 | u4 | u1 | f64c | f32c | bf16c | f16c |
|
|
128
|
-
| :------------- | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
|
|
129
|
-
| __x86__ | | | | | | | | | | | | | | | | | |
|
|
130
|
-
| Haswell | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● |
|
|
131
|
-
| Skylake | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | · | · |
|
|
132
|
-
| Ice Lake | · | · | · | ● | · | · | ● | ● | ● | ● | ● | ● | ● | · | · | · | · |
|
|
133
|
-
| Genoa | · | · | ● | · | ● | ● | · | · | · | · | · | · | · | · | · | ● | · |
|
|
134
|
-
| Sapphire | · | · | · | ● | · | ● | · | · | ● | ● | · | · | · | · | · | · | · |
|
|
135
|
-
| Sapphire AMX | · | · | ● | · | ● | ● | ● | ● | ● | ● | · | · | · | · | · | · | · |
|
|
136
|
-
| Alder Lake | · | · | · | · | · | · | ● | ● | ● | ● | · | · | · | · | · | · | · |
|
|
137
|
-
| Sierra Forest | · | · | · | · | · | · | ● | ● | ● | ● | · | · | · | · | · | · | · |
|
|
138
|
-
| Turin | · | ● | ● | · | · | · | · | · | · | · | · | · | · | · | · | · | · |
|
|
139
|
-
| Diamond | · | · | · | ● | ● | ● | · | · | · | · | · | · | · | · | · | · | · |
|
|
140
|
-
| __Arm__ | | | | | | | | | | | | | | | | | |
|
|
141
|
-
| NEON | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | · | · | ● | ● | ● | · | ● |
|
|
142
|
-
| NEON Half | · | · | · | ● | · | · | · | · | ● | ● | · | · | · | · | · | · | · |
|
|
143
|
-
| NEON FHM | · | · | · | ● | ● | ● | · | · | · | · | · | · | · | · | · | · | ● |
|
|
144
|
-
| NEON BF16 | · | · | ● | · | ● | ● | · | · | · | · | · | · | · | · | · | ● | · |
|
|
145
|
-
| NEON SDot | · | · | · | · | · | · | ● | ● | ● | ● | ● | ● | · | · | · | · | · |
|
|
146
|
-
| NEON FP8 | · | · | · | · | ● | ● | ● | ● | · | · | · | · | · | · | · | · | · |
|
|
147
|
-
| SVE | ● | ● | · | · | · | · | · | · | · | ● | · | · | ● | ● | ● | · | · |
|
|
148
|
-
| SVE Half | · | · | · | ● | · | · | · | · | · | · | · | · | · | · | · | · | ● |
|
|
149
|
-
| SVE BF16 | · | · | ● | · | · | · | · | · | · | · | · | · | · | · | · | · | · |
|
|
150
|
-
| SVE SDot | · | · | · | · | · | · | · | · | ● | ● | · | · | · | · | · | · | · |
|
|
151
|
-
| SVE2 | · | ● | ● | · | · | · | · | · | · | · | · | · | · | · | · | · | · |
|
|
152
|
-
| SME | · | · | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | · | · | · | · | · |
|
|
153
|
-
| SME F64 | ● | ● | · | · | · | · | · | · | · | · | · | · | · | ● | ● | · | · |
|
|
154
|
-
| SME BI32 | · | · | · | · | · | · | · | · | · | · | · | · | ● | · | · | · | · |
|
|
155
|
-
| __RISC-V__ | | | | | | | | | | | | | | | | | |
|
|
156
|
-
| RVV | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | · | · |
|
|
157
|
-
| RVV Half | · | · | · | ● | ● | ● | · | · | · | · | · | · | · | · | · | · | · |
|
|
158
|
-
| RVV BF16 | · | · | ● | · | ● | ● | · | · | · | · | · | · | · | · | · | · | · |
|
|
159
|
-
| RVV BB | · | · | · | · | · | · | · | · | · | · | · | · | ● | · | · | · | · |
|
|
160
|
-
| __Other__ | | | | | | | | | | | | | | | | | |
|
|
161
|
-
| Power VSX | ● | ● | ● | ● | · | · | · | · | ● | ● | · | · | ● | · | · | · | · |
|
|
162
|
-
| LoongArch LASX | ● | ● | ● | ● | · | · | · | · | ● | ● | · | · | ● | · | · | · | · |
|
|
163
|
-
| WASM V128 | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | · | · |
|
|
164
|
-
|
|
165
|
-
Not every combination is implemented — only the ones that unlock real performance gains.
|
|
166
|
-
The `icelake` level doesn't get a `dot_bf16` variant, for example, and falls through to `dot_bf16_skylake`.
|
|
167
|
-
Every operation has a `serial` fallback, but even types no CPU supports today get optimized via lookup tables and bit-twiddling hacks rather than scalar loops.
|
|
168
|
-
For details on compile-time and run-time [dispatch](#compile-time-and-run-time-dispatch), see the contributor guide.
|
|
169
125
|
|
|
170
126
|
## Design Decisions
|
|
171
127
|
|
|
@@ -436,16 +392,16 @@ On x86, older CPUs use __F16C extensions__ (Ivy Bridge+) for fast Float16 → Fl
|
|
|
436
392
|
On Arm, ARMv8.4-A adds __FMLAL/FMLAL2__ instructions for fused Float16 → Float32 widening multiply-accumulate, reducing the total latency from 7 cycles to 4 cycles and achieving 20–48% speedup over the separate convert-then-FMA path.
|
|
437
393
|
|
|
438
394
|
| Platform | BFloat16 Path | Elem/Op | Float16 Path | Elem/Op |
|
|
439
|
-
|
|
|
395
|
+
| :--------------------- | :------------------------- | ------: | :--------------------- | ------: |
|
|
440
396
|
| __x86__ | | | | |
|
|
441
|
-
| Diamond Rapids (
|
|
397
|
+
| Diamond Rapids (2026) | ↓ Genoa | 32 | `VDPPHPS` widening dot | 32 |
|
|
442
398
|
| Sapphire Rapids (2023) | ↓ Genoa | 32 | ↓ Skylake | 16 |
|
|
443
399
|
| Genoa (2022) | `VDPBF16PS` widening dot | 32 | ↓ Skylake | 16 |
|
|
444
400
|
| Skylake (2015) | `SLLI` + `VFMADD` | 16 | `VCVTPH2PS` + `VFMADD` | 16 |
|
|
445
401
|
| Haswell (2013) | `SLLI` + `VFMADD` | 8 | `VCVTPH2PS` + `VFMADD` | 8 |
|
|
446
402
|
| __Arm__ | | | | |
|
|
447
|
-
| Graviton 3 (2021) | `SVBFDOT` widening dot | 4–32 | `SVCVT` → `SVFMLA` | 4–32 |
|
|
448
403
|
| Apple M2+ (2022) | `BFDOT` widening dot | 8 | ↓ FP16FML | 8 |
|
|
404
|
+
| Graviton 3+ (2021) | `SVBFDOT` widening dot | 4–32 | `SVCVT` → `SVFMLA` | 4–32 |
|
|
449
405
|
| Apple M1 (2020) | ↓ NEON | 8 | `FMLAL` widening FMA | 8 |
|
|
450
406
|
| Graviton 2 (2019) | ↓ NEON | 8 | `FCVTL` + `FMLA` | 4 |
|
|
451
407
|
| Graviton 1 (2018) | `SHLL` + `FMLA` | 8 | bit-manip → `FMLA` | 8 |
|
|
@@ -464,14 +420,14 @@ On Arm, ARMv8.4-A adds __FMLAL/FMLAL2__ instructions for fused Float16 → Float
|
|
|
464
420
|
|
|
465
421
|
### Mini-Floats: E4M3, E5M2, E3M2, & E2M3
|
|
466
422
|
|
|
467
|
-
| Format
|
|
468
|
-
|
|
|
469
|
-
| E5M2FN
|
|
470
|
-
| E4M3FN
|
|
471
|
-
| E3M2FN
|
|
472
|
-
| E2M3FN
|
|
473
|
-
|
|
|
474
|
-
|
|
|
423
|
+
| Format | Bits | Range | NumKong Promotion Rules | Support in GPUs |
|
|
424
|
+
| :----------- | ----: | -----: | ------------------------------------- | ----------------- |
|
|
425
|
+
| E5M2FN | 8 | ±57344 | BFloat16 → Float32 | H100+, MI300+ |
|
|
426
|
+
| E4M3FN | 8 | ±448 | BFloat16 → Float32 | H100+, MI300+ |
|
|
427
|
+
| E3M2FN | 6 → 8 | ±28 | B- & Float16 → Float32, Int16 → Int32 | only block-scaled |
|
|
428
|
+
| E2M3FN | 6 → 8 | ±7.5 | B- & Float16 → Float32, Int8 → Int32 | only block-scaled |
|
|
429
|
+
| Scaled NVFP4 | 4 | ±6 | — | B200+ |
|
|
430
|
+
| Scaled MXFP4 | 4 | ±6 | — | B200+, MI325+ |
|
|
475
431
|
|
|
476
432
|
> __Block scaling.__
|
|
477
433
|
> NumKong does not implement block-scaled variants (MXFP4, NVFP4, or block-scaled E3M2/E2M3).
|
|
@@ -488,22 +444,22 @@ E4M3FN (no infinities, NaN only) is preferred for __training__ where precision n
|
|
|
488
444
|
On x86 Genoa/Sapphire Rapids, E4M3/E5M2 values upcast to BFloat16 via lookup tables, then use native __DPBF16PS__ for 2-per-lane dot products accumulating to Float32.
|
|
489
445
|
On Arm Graviton 3+, the same BFloat16 upcast happens via NEON table lookups, then __BFDOT__ instructions complete the computation.
|
|
490
446
|
|
|
491
|
-
| Platform
|
|
492
|
-
|
|
|
493
|
-
| __x86__
|
|
494
|
-
| Diamond Rapids (
|
|
495
|
-
| Genoa (2022)
|
|
496
|
-
| Ice Lake (2019)
|
|
497
|
-
| Skylake (2015)
|
|
498
|
-
| Haswell (2013)
|
|
499
|
-
| __Arm__
|
|
500
|
-
| NEON + FP8DOT (
|
|
501
|
-
| NEON + FP16FML (
|
|
502
|
-
| NEON (
|
|
503
|
-
| __RISC-V__
|
|
504
|
-
| RVV + Zvfbfwma
|
|
505
|
-
| RVV + Zvfh
|
|
506
|
-
| RVV
|
|
447
|
+
| Platform | E5M2 Path | Elem/Op | E4M3 Path | Elem/Op |
|
|
448
|
+
| :-------------------- | :----------------------------- | ------: | :----------------------------- | ------: |
|
|
449
|
+
| __x86__ | | | | |
|
|
450
|
+
| Diamond Rapids (2026) | `VCVTBF82PH` → F16 + `VDPPHPS` | 32 | `VCVTHF82PH` → F16 + `VDPPHPS` | 32 |
|
|
451
|
+
| Genoa (2022) | → BF16 + `VDPBF16PS` | 32 | ↓ Ice Lake | 64 |
|
|
452
|
+
| Ice Lake (2019) | ↓ Skylake | 16 | octave LUT + `VPDPBUSD` | 64 |
|
|
453
|
+
| Skylake (2015) | rebias → F32 FMA | 16 | rebias → F32 FMA | 16 |
|
|
454
|
+
| Haswell (2013) | rebias → F32 FMA | 8 | rebias → F32 FMA | 8 |
|
|
455
|
+
| __Arm__ | | | | |
|
|
456
|
+
| NEON + FP8DOT (2026) | native `FDOT` | 16 | native `FDOT` | 16 |
|
|
457
|
+
| NEON + FP16FML (2020) | SHL → F16 + `FMLAL` | 16 | LUT → F16 + `FMLAL` | 16 |
|
|
458
|
+
| NEON (2018) | SHL + `FCVTL` + FMA | 8 | → F16 + `FCVTL` + FMA | 8 |
|
|
459
|
+
| __RISC-V__ | | | | |
|
|
460
|
+
| RVV + Zvfbfwma | rebias → BF16 + `VFWMACCBF16` | 4–32 | LUT → BF16 + `VFWMACCBF16` | 4–32 |
|
|
461
|
+
| RVV + Zvfh | SHL → F16 + `VFWMACC` | 4–32 | LUT → F16 + `VFWMACC` | 4–32 |
|
|
462
|
+
| RVV | rebias → F32 + `VFMACC` | 4–32 | LUT → F32 + `VFMACC` | 4–32 |
|
|
507
463
|
|
|
508
464
|
> E5M2 shares Float16's exponent bias (15), so E5M2 → Float16 conversion is a single left-shift by 8 bits (`SHL 8`).
|
|
509
465
|
> E4M3 on Ice Lake uses "octave decomposition": the 4-bit exponent splits into 2 octave + 2 remainder bits, yielding 7 integer accumulators post-scaled by powers of 2.
|
|
@@ -513,20 +469,20 @@ Their smaller range allows scaling to exact integers that fit in `i8`/`i16`, ena
|
|
|
513
469
|
Float16 can also serve as an accumulator, accurately representing ~50 products of E3M2FN pairs or ~20 products of E2M3FN pairs before overflow.
|
|
514
470
|
On Arm, NEON FHM extensions bring widening `FMLAL` dot-products for Float16 — both faster and more widely available than `BFDOT` for BFloat16.
|
|
515
471
|
|
|
516
|
-
| Platform
|
|
517
|
-
|
|
|
518
|
-
| __x86__
|
|
519
|
-
|
|
|
520
|
-
|
|
|
521
|
-
|
|
|
522
|
-
| Skylake (2015)
|
|
523
|
-
| Haswell (2013)
|
|
524
|
-
| __Arm__
|
|
525
|
-
| NEON + FP8DOT (
|
|
526
|
-
| NEON + DotProd (
|
|
527
|
-
| NEON (
|
|
528
|
-
| __RISC-V__
|
|
529
|
-
| RVV
|
|
472
|
+
| Platform | E3M2 Path | Elem/Op | E2M3 Path | Elem/Op |
|
|
473
|
+
| :-------------------- | :------------------------- | ------: | :--------------------------- | ------: |
|
|
474
|
+
| __x86__ | | | | |
|
|
475
|
+
| Sierra Forest (2024) | ↓ Haswell | 32 | `VPSHUFB` LUT + `VPDPBSSD` | 32 |
|
|
476
|
+
| Alder Lake (2021) | ↓ Haswell | 32 | `VPSHUFB` LUT + `VPDPBUSD` | 32 |
|
|
477
|
+
| Ice Lake (2019) | `VPERMW` LUT + `VPMADDWD` | 32 | `VPERMB` LUT + `VPDPBUSD` | 64 |
|
|
478
|
+
| Skylake (2015) | `VPSHUFB` LUT + `VPMADDWD` | 64 | `VPSHUFB` LUT + `VPMADDUBSW` | 64 |
|
|
479
|
+
| Haswell (2013) | `VPSHUFB` LUT + `VPMADDWD` | 32 | `VPSHUFB` LUT + `VPMADDUBSW` | 32 |
|
|
480
|
+
| __Arm__ | | | | |
|
|
481
|
+
| NEON + FP8DOT (2026) | → E5M2 + `FDOT` | 16 | → E4M3 + `FDOT` | 16 |
|
|
482
|
+
| NEON + DotProd (2019) | `VQTBL2` LUT + `SMLAL` | 16 | `VQTBL2` LUT + `SDOT` | 16 |
|
|
483
|
+
| NEON (2018) | → F16 + `FCVTL` + FMA | 16 | → F16 + `FCVTL` + FMA | 16 |
|
|
484
|
+
| __RISC-V__ | | | | |
|
|
485
|
+
| RVV | I16 gather LUT + `VWMACC` | 4–32 | U8 gather LUT + `VWMACC` | 4–32 |
|
|
530
486
|
|
|
531
487
|
> E3M2/E2M3 values map to exact integers via 32-entry LUTs (magnitudes up to 448 for E3M2, 120 for E2M3), enabling integer accumulation with no rounding error.
|
|
532
488
|
> On NEON + FP8DOT, E3M2 is first promoted to E5M2 and E2M3 to E4M3 before the hardware `FDOT` instruction.
|
|
@@ -538,7 +494,7 @@ E5M2's range (±57,344) makes the scaled product exceed Int32 entirely.
|
|
|
538
494
|
Without the integer path, E5M2 falls back to Float32 accumulation — where its [2-bit mantissa (only 4 values per binade)](https://developer.nvidia.com/blog/floating-point-8-an-introduction-to-efficient-lower-precision-ai-training/) creates a [catastrophic cancellation risk](https://www.ac.uma.es/arith2024/papers/Fused%20FP8%204-Way%20Dot%20Product%20with%20Scaling%20and%20FP32%20Accumulation.pdf) that E2M3's integer path avoids completely:
|
|
539
495
|
|
|
540
496
|
| | _i_ = 0 | _i_ = 1 | _i_ = 2 | _i_ = 3 | _i_ = 4 | _i_ = 5 | _i_ = 6 |
|
|
541
|
-
|
|
|
497
|
+
| :------ | -------: | ------: | -------: | --------: | -------: | -------: | -------: |
|
|
542
498
|
| _aᵢ_ | 0.00122 | 20480 | −0.00122 | 1.5 | −3072 | −640 | 0.00146 |
|
|
543
499
|
| _bᵢ_    | −40      | 320     | −1280    | −7.63e−5  | 0.000427 | 10240    | −4.58e−5 |
|
|
544
500
|
| _aᵢ·bᵢ_ | −0.04883 | 6553600 | 1.5625 | −0.000114 | −1.3125 | −6553600 | ≈ 0 |
|
package/binding.gyp
CHANGED
package/c/numkong.c
CHANGED
|
@@ -943,7 +943,7 @@ BOOL WINAPI DllMain(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpReserved) {
|
|
|
943
943
|
// carries ZA state. So __arm_tpidr2_save is always a no-op and
|
|
944
944
|
// __arm_tpidr2_restore has nothing to restore.
|
|
945
945
|
// Weak linkage lets a real compiler-rt override these if available.
|
|
946
|
-
#if
|
|
946
|
+
#if NK_TARGET_ARM64_ && NK_TARGET_SME
|
|
947
947
|
// Empty stub for __arm_tpidr2_save: does nothing and returns. Declared weak with default visibility, so a strong definition supplied at link time replaces it.
__attribute__((weak, visibility("default"))) void __arm_tpidr2_save(void) {}
|
|
948
948
|
// Stub for __arm_tpidr2_restore: performs no restore work. `blk` is only handed to nk_unused_ (presumably to silence the unused-parameter warning — confirm against its definition). Weak, so a strong definition replaces it.
__attribute__((weak, visibility("default"))) void __arm_tpidr2_restore(void *blk) { nk_unused_(blk); }
|
|
949
949
|
#endif
|
|
@@ -62,7 +62,7 @@
|
|
|
62
62
|
#ifndef NK_ATTENTION_SAPPHIREAMX_H
|
|
63
63
|
#define NK_ATTENTION_SAPPHIREAMX_H
|
|
64
64
|
|
|
65
|
-
#if
|
|
65
|
+
#if NK_TARGET_X8664_
|
|
66
66
|
#if NK_TARGET_SAPPHIREAMX
|
|
67
67
|
|
|
68
68
|
#include "numkong/types.h"
|
|
@@ -1359,5 +1359,5 @@ NK_PUBLIC void nk_attention_causal_bf16_sapphireamx(nk_bf16_t const *q, void con
|
|
|
1359
1359
|
#endif
|
|
1360
1360
|
|
|
1361
1361
|
#endif // NK_TARGET_SAPPHIREAMX
|
|
1362
|
-
#endif //
|
|
1362
|
+
#endif // NK_TARGET_X8664_
|
|
1363
1363
|
#endif // NK_ATTENTION_SAPPHIREAMX_H
|
|
@@ -91,7 +91,7 @@
|
|
|
91
91
|
#ifndef NK_ATTENTION_SME_H
|
|
92
92
|
#define NK_ATTENTION_SME_H
|
|
93
93
|
|
|
94
|
-
#if
|
|
94
|
+
#if NK_TARGET_ARM64_
|
|
95
95
|
#if NK_TARGET_SME
|
|
96
96
|
|
|
97
97
|
#include "numkong/types.h"
|
|
@@ -2068,5 +2068,5 @@ NK_PUBLIC void nk_attention_causal_f16_sme(nk_f16_t const *q, void const *kv_pac
|
|
|
2068
2068
|
#endif
|
|
2069
2069
|
|
|
2070
2070
|
#endif // NK_TARGET_SME
|
|
2071
|
-
#endif //
|
|
2071
|
+
#endif // NK_TARGET_ARM64_
|
|
2072
2072
|
#endif // NK_ATTENTION_SME_H
|