numkong 7.4.2 → 7.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +84 -84
- package/c/numkong.c +1 -1
- package/include/numkong/attention/sapphireamx.h +2 -2
- package/include/numkong/attention/sme.h +2 -2
- package/include/numkong/capabilities.h +47 -47
- package/include/numkong/cast/diamond.h +2 -2
- package/include/numkong/cast/haswell.h +2 -2
- package/include/numkong/cast/icelake.h +2 -2
- package/include/numkong/cast/loongsonasx.h +2 -2
- package/include/numkong/cast/neon.h +2 -2
- package/include/numkong/cast/powervsx.h +2 -2
- package/include/numkong/cast/rvv.h +2 -2
- package/include/numkong/cast/sapphire.h +2 -2
- package/include/numkong/cast/skylake.h +2 -2
- package/include/numkong/curved/genoa.h +2 -2
- package/include/numkong/curved/haswell.h +2 -2
- package/include/numkong/curved/neon.h +2 -2
- package/include/numkong/curved/neonbfdot.h +2 -2
- package/include/numkong/curved/rvv.h +2 -2
- package/include/numkong/curved/skylake.h +2 -2
- package/include/numkong/curved/smef64.h +2 -2
- package/include/numkong/dot/alder.h +2 -2
- package/include/numkong/dot/diamond.h +2 -2
- package/include/numkong/dot/genoa.h +2 -2
- package/include/numkong/dot/haswell.h +2 -2
- package/include/numkong/dot/icelake.h +2 -2
- package/include/numkong/dot/loongsonasx.h +2 -2
- package/include/numkong/dot/neon.h +2 -2
- package/include/numkong/dot/neonbfdot.h +2 -2
- package/include/numkong/dot/neonfhm.h +2 -2
- package/include/numkong/dot/neonfp8.h +2 -2
- package/include/numkong/dot/neonsdot.h +2 -2
- package/include/numkong/dot/rvv.h +2 -2
- package/include/numkong/dot/rvvbb.h +2 -2
- package/include/numkong/dot/rvvbf16.h +2 -2
- package/include/numkong/dot/rvvhalf.h +2 -2
- package/include/numkong/dot/sapphire.h +2 -2
- package/include/numkong/dot/sierra.h +2 -2
- package/include/numkong/dot/skylake.h +2 -2
- package/include/numkong/dot/sve.h +2 -2
- package/include/numkong/dot/svebfdot.h +2 -2
- package/include/numkong/dot/svehalf.h +2 -2
- package/include/numkong/dot/svesdot.h +2 -2
- package/include/numkong/dots/alder.h +2 -2
- package/include/numkong/dots/diamond.h +2 -2
- package/include/numkong/dots/genoa.h +2 -2
- package/include/numkong/dots/haswell.h +2 -2
- package/include/numkong/dots/icelake.h +2 -2
- package/include/numkong/dots/loongsonasx.h +2 -2
- package/include/numkong/dots/neon.h +2 -2
- package/include/numkong/dots/neonbfdot.h +2 -2
- package/include/numkong/dots/neonfhm.h +2 -2
- package/include/numkong/dots/neonfp8.h +2 -2
- package/include/numkong/dots/neonsdot.h +2 -2
- package/include/numkong/dots/powervsx.h +2 -2
- package/include/numkong/dots/rvv.h +2 -2
- package/include/numkong/dots/sapphireamx.h +2 -2
- package/include/numkong/dots/sierra.h +2 -2
- package/include/numkong/dots/skylake.h +2 -2
- package/include/numkong/dots/sme.h +10 -10
- package/include/numkong/dots/smebi32.h +2 -2
- package/include/numkong/dots/smef64.h +2 -2
- package/include/numkong/dots/smehalf.h +2 -2
- package/include/numkong/each/haswell.h +2 -2
- package/include/numkong/each/icelake.h +2 -2
- package/include/numkong/each/neon.h +2 -2
- package/include/numkong/each/neonbfdot.h +2 -2
- package/include/numkong/each/neonhalf.h +2 -2
- package/include/numkong/each/rvv.h +2 -2
- package/include/numkong/each/sapphire.h +2 -2
- package/include/numkong/each/skylake.h +2 -2
- package/include/numkong/geospatial/haswell.h +2 -2
- package/include/numkong/geospatial/neon.h +2 -2
- package/include/numkong/geospatial/rvv.h +2 -2
- package/include/numkong/geospatial/skylake.h +2 -2
- package/include/numkong/maxsim/alder.h +2 -2
- package/include/numkong/maxsim/genoa.h +2 -2
- package/include/numkong/maxsim/haswell.h +2 -2
- package/include/numkong/maxsim/icelake.h +2 -2
- package/include/numkong/maxsim/neonsdot.h +2 -2
- package/include/numkong/maxsim/sapphireamx.h +2 -2
- package/include/numkong/maxsim/sme.h +2 -2
- package/include/numkong/mesh/haswell.h +2 -2
- package/include/numkong/mesh/neon.h +2 -2
- package/include/numkong/mesh/neonbfdot.h +2 -2
- package/include/numkong/mesh/rvv.h +2 -2
- package/include/numkong/mesh/skylake.h +2 -2
- package/include/numkong/numkong.h +1 -1
- package/include/numkong/probability/haswell.h +2 -2
- package/include/numkong/probability/neon.h +2 -2
- package/include/numkong/probability/rvv.h +2 -2
- package/include/numkong/probability/skylake.h +2 -2
- package/include/numkong/reduce/alder.h +2 -2
- package/include/numkong/reduce/genoa.h +2 -2
- package/include/numkong/reduce/haswell.h +2 -2
- package/include/numkong/reduce/icelake.h +2 -2
- package/include/numkong/reduce/neon.h +2 -2
- package/include/numkong/reduce/neonbfdot.h +2 -2
- package/include/numkong/reduce/neonfhm.h +2 -2
- package/include/numkong/reduce/neonsdot.h +2 -2
- package/include/numkong/reduce/rvv.h +2 -2
- package/include/numkong/reduce/sierra.h +2 -2
- package/include/numkong/reduce/skylake.h +2 -2
- package/include/numkong/scalar/haswell.h +2 -2
- package/include/numkong/scalar/loongsonasx.h +2 -2
- package/include/numkong/scalar/neon.h +2 -2
- package/include/numkong/scalar/neonhalf.h +2 -2
- package/include/numkong/scalar/powervsx.h +2 -2
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +2 -2
- package/include/numkong/set/haswell.h +2 -2
- package/include/numkong/set/icelake.h +2 -2
- package/include/numkong/set/loongsonasx.h +2 -2
- package/include/numkong/set/neon.h +2 -2
- package/include/numkong/set/powervsx.h +2 -2
- package/include/numkong/set/rvv.h +2 -2
- package/include/numkong/set/rvvbb.h +2 -2
- package/include/numkong/set/sve.h +2 -2
- package/include/numkong/sets/haswell.h +2 -2
- package/include/numkong/sets/icelake.h +2 -2
- package/include/numkong/sets/loongsonasx.h +2 -2
- package/include/numkong/sets/neon.h +2 -2
- package/include/numkong/sets/powervsx.h +2 -2
- package/include/numkong/sets/smebi32.h +2 -2
- package/include/numkong/sparse/icelake.h +2 -2
- package/include/numkong/sparse/neon.h +2 -2
- package/include/numkong/sparse/sve2.h +2 -2
- package/include/numkong/sparse/turin.h +2 -2
- package/include/numkong/spatial/alder.h +2 -2
- package/include/numkong/spatial/diamond.h +2 -2
- package/include/numkong/spatial/genoa.h +2 -2
- package/include/numkong/spatial/haswell.h +2 -2
- package/include/numkong/spatial/icelake.h +2 -2
- package/include/numkong/spatial/loongsonasx.h +2 -2
- package/include/numkong/spatial/neon.h +2 -2
- package/include/numkong/spatial/neonbfdot.h +2 -2
- package/include/numkong/spatial/neonfp8.h +2 -2
- package/include/numkong/spatial/neonsdot.h +2 -2
- package/include/numkong/spatial/powervsx.h +2 -2
- package/include/numkong/spatial/rvv.h +2 -2
- package/include/numkong/spatial/rvvbf16.h +2 -2
- package/include/numkong/spatial/rvvhalf.h +2 -2
- package/include/numkong/spatial/sierra.h +2 -2
- package/include/numkong/spatial/skylake.h +2 -2
- package/include/numkong/spatial/sve.h +2 -2
- package/include/numkong/spatial/svebfdot.h +2 -2
- package/include/numkong/spatial/svehalf.h +2 -2
- package/include/numkong/spatial/svesdot.h +2 -2
- package/include/numkong/spatials/alder.h +2 -2
- package/include/numkong/spatials/diamond.h +2 -2
- package/include/numkong/spatials/genoa.h +2 -2
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/icelake.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +2 -2
- package/include/numkong/spatials/neon.h +2 -2
- package/include/numkong/spatials/neonbfdot.h +2 -2
- package/include/numkong/spatials/neonfhm.h +2 -2
- package/include/numkong/spatials/neonfp8.h +2 -2
- package/include/numkong/spatials/neonsdot.h +2 -2
- package/include/numkong/spatials/powervsx.h +2 -2
- package/include/numkong/spatials/rvv.h +2 -2
- package/include/numkong/spatials/sapphireamx.h +2 -2
- package/include/numkong/spatials/sierra.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +2 -2
- package/include/numkong/spatials/smef64.h +2 -2
- package/include/numkong/trigonometry/haswell.h +2 -2
- package/include/numkong/trigonometry/neon.h +2 -2
- package/include/numkong/trigonometry/rvv.h +2 -2
- package/include/numkong/trigonometry/skylake.h +2 -2
- package/include/numkong/types.h +88 -80
- package/package.json +7 -7
package/README.md
CHANGED
|
@@ -10,39 +10,39 @@ Most libraries return dot products in the __same type as the input__ — Float16
|
|
|
10
10
|
This leads to quiet overflow: a 2048-dimensional `i8` dot product can reach ±10 million, but `i8` maxes out at 127.
|
|
11
11
|
NumKong promotes to wider accumulators — Float16 → Float32, BFloat16 → Float32, Int8 → Int32, Float32 → Float64 — so results stay in range.
|
|
12
12
|
|
|
13
|
-
|
|
13
|
+
| Input | NumPy + OpenBLAS | PyTorch + MKL | JAX | NumKong |
|
|
14
|
+
| :----- | -------------------: | -------------------: | -------------------: | --------------------: |
|
|
15
|
+
| | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ |
|
|
16
|
+
| `f64` | 2.0 gso/s, 1e-15 err | 0.6 gso/s, 1e-15 err | 0.4 gso/s, 1e-14 err | 5.8 gso/s, 1e-16 err |
|
|
17
|
+
| `f32` | 1.5 gso/s, 2e-6 err | 0.6 gso/s, 2e-6 err | 0.4 gso/s, 5e-6 err | 7.1 gso/s, 2e-7 err |
|
|
18
|
+
| `bf16` | — | 0.5 gso/s, 1.9% err | 0.5 gso/s, 1.9% err | 9.7 gso/s, 1.8% err |
|
|
19
|
+
| `f16` | 0.2 gso/s, 0.25% err | 0.5 gso/s, 0.25% err | 0.4 gso/s, 0.25% err | 11.5 gso/s, 0.24% err |
|
|
20
|
+
| `e5m2` | — | 0.7 gso/s, 4.6% err | 0.5 gso/s, 4.6% err | 7.1 gso/s, 0% err |
|
|
21
|
+
| `i8` | 1.1 gso/s, overflow | 0.5 gso/s, overflow | 0.5 gso/s, overflow | 14.8 gso/s, 0% err |
|
|
22
|
+
|
|
23
|
+
> Single 2048-d dot product on Intel Sapphire Rapids, single-threaded.
|
|
14
24
|
> Each cell shows __gso/s, mean relative error__ vs higher-precision reference.
|
|
15
25
|
> gso/s = Giga Scalar Operations per Second — a more suitable name than GFLOP/s when counting both integer and floating-point work.
|
|
16
26
|
> NumPy 2.4, PyTorch 2.10, JAX 0.9.
|
|
17
27
|
|
|
18
|
-
| Input | NumPy + OpenBLAS | PyTorch + MKL | JAX | NumKong |
|
|
19
|
-
| :----- | ----------------------: | ----------------------: | ----------------------: | --------------------: |
|
|
20
|
-
| | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ |
|
|
21
|
-
| `f64` | 2.0 gso/s, 1e-15 err | 0.6 gso/s, 1e-15 err | 0.4 gso/s, 1e-14 err | 5.8 gso/s, 1e-16 err |
|
|
22
|
-
| `f32` | 1.5 gso/s, 2e-6 err | 0.6 gso/s, 2e-6 err | 0.4 gso/s, 5e-6 err | 7.1 gso/s, 2e-7 err |
|
|
23
|
-
| `bf16` | — | 0.5 gso/s, 1.9% err | 0.5 gso/s, 1.9% err | 9.7 gso/s, 1.8% err |
|
|
24
|
-
| `f16` | 0.2 gso/s, 0.25% err | 0.5 gso/s, 0.25% err | 0.4 gso/s, 0.25% err | 11.5 gso/s, 0.24% err |
|
|
25
|
-
| `e5m2` | — | 0.7 gso/s, 4.6% err | 0.5 gso/s, 4.6% err | 7.1 gso/s, 0% err |
|
|
26
|
-
| `i8` | 1.1 gso/s, __overflow__ | 0.5 gso/s, __overflow__ | 0.5 gso/s, __overflow__ | 14.8 gso/s, 0% err |
|
|
27
|
-
|
|
28
28
|
A fair objection: PyTorch and JAX are designed for throughput, not single-call latency.
|
|
29
29
|
They lower execution graphs through [XLA](https://openxla.org/) or vendored BLAS libraries like [Intel MKL](https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl.html) and Nvidia [cuBLAS](https://developer.nvidia.com/cublas).
|
|
30
30
|
So here's the same comparison on a throughput-oriented workload — matrix multiplication:
|
|
31
31
|
|
|
32
|
+
| Input | NumPy + OpenBLAS | PyTorch + MKL | JAX | NumKong |
|
|
33
|
+
| :----- | --------------------: | --------------------: | ---------------------: | -------------------: |
|
|
34
|
+
| | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ |
|
|
35
|
+
| `f64` | 65.5 gso/s, 1e-15 err | 68.2 gso/s, 1e-15 err | ~14.3 gso/s, 1e-15 err | 8.6 gso/s, 1e-16 err |
|
|
36
|
+
| `f32` | 140 gso/s, 9e-7 err | 145 gso/s, 1e-6 err | ~60.5 gso/s, 1e-6 err | 37.7 gso/s, 4e-7 err |
|
|
37
|
+
| `bf16` | — | 851 gso/s, 1.8% err | ~25.8 gso/s, 3.4% err | 458 gso/s, 3.6% err |
|
|
38
|
+
| `f16` | 0.3 gso/s, 0.25% err | 140 gso/s, 0.37% err | ~26.1 gso/s, 0.35% err | 103 gso/s, 0.26% err |
|
|
39
|
+
| `e5m2` | — | 0.4 gso/s, 4.6% err | ~26.4 gso/s, 4.6% err | 398 gso/s, 0% err |
|
|
40
|
+
| `i8` | 0.4 gso/s, overflow | 50.0 gso/s, overflow | ~0.0 gso/s, overflow | 1279 gso/s, 0% err |
|
|
41
|
+
|
|
32
42
|
> Matrix multiplication (2048 × 2048) × (2048 × 2048) on Intel Sapphire Rapids, single-threaded.
|
|
33
43
|
> gso/s = Giga Scalar Operations per Second, same format.
|
|
34
44
|
> NumPy 2.4, PyTorch 2.10, JAX 0.9, same versions.
|
|
35
45
|
|
|
36
|
-
| Input | NumPy + OpenBLAS | PyTorch + MKL | JAX | NumKong |
|
|
37
|
-
| :----- | ----------------------: | -----------------------: | -----------------------: | -------------------: |
|
|
38
|
-
| | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ |
|
|
39
|
-
| `f64` | 65.5 gso/s, 1e-15 err | 68.2 gso/s, 1e-15 err | ~14.3 gso/s, 1e-15 err | 8.6 gso/s, 1e-16 err |
|
|
40
|
-
| `f32` | 140 gso/s, 9e-7 err | 145 gso/s, 1e-6 err | ~60.5 gso/s, 1e-6 err | 37.7 gso/s, 4e-7 err |
|
|
41
|
-
| `bf16` | — | 851 gso/s, 1.8% err | ~25.8 gso/s, 3.4% err | 458 gso/s, 3.6% err |
|
|
42
|
-
| `f16` | 0.3 gso/s, 0.25% err | 140 gso/s, 0.37% err | ~26.1 gso/s, 0.35% err | 103 gso/s, 0.26% err |
|
|
43
|
-
| `e5m2` | — | 0.4 gso/s, 4.6% err | ~26.4 gso/s, 4.6% err | 398 gso/s, 0% err |
|
|
44
|
-
| `i8` | 0.4 gso/s, __overflow__ | 50.0 gso/s, __overflow__ | ~0.0 gso/s, __overflow__ | 1279 gso/s, 0% err |
|
|
45
|
-
|
|
46
46
|
For `f64`, compensated "Dot2" summation reduces error by 10–50× compared to naive Float64 accumulation, depending on vector length.
|
|
47
47
|
For `f32`, widening to Float64 gives 5–10× lower error.
|
|
48
48
|
The library ships as a relatively small binary:
|
|
@@ -77,27 +77,27 @@ NumKong covers 17 numeric types — from 6-bit floats to 64-bit complex numbers
|
|
|
77
77
|
|
|
78
78
|
### Language Bindings
|
|
79
79
|
|
|
80
|
-
| Operation | [C
|
|
81
|
-
| :-------------------------- |
|
|
82
|
-
| __Vector Ops__ |
|
|
83
|
-
| [Dot] Product |
|
|
84
|
-
| [Spatial] Metric |
|
|
85
|
-
| [Set] Similarity |
|
|
86
|
-
| [Geo]spatial |
|
|
87
|
-
| [Mesh] Alignment |
|
|
88
|
-
| [Sparse] Products |
|
|
89
|
-
| [Probability] Divergences |
|
|
90
|
-
| [Curved] Spaces |
|
|
91
|
-
| __Many-to-Many Vector Ops__ |
|
|
92
|
-
| "[Dots]" Products |
|
|
93
|
-
| "[Spatials]" Metrics |
|
|
94
|
-
| "[Sets]" Similarities |
|
|
95
|
-
| [MaxSim] Scoring |
|
|
96
|
-
| __Scalar Ops__ |
|
|
97
|
-
| [Cast] |
|
|
98
|
-
| [Reduce] |
|
|
99
|
-
| [Each] |
|
|
100
|
-
| [Trigonometry] |
|
|
80
|
+
| Operation | [C 99 & C++ 23][c] | [Python][py] | [Rust][rs] | [JavaScript][js] | [Swift][swift] | [GoLang][go] |
|
|
81
|
+
| :-------------------------- | :----------------: | :----------: | :--------: | :--------------: | :------------: | :----------: |
|
|
82
|
+
| __Vector Ops__ | | | | | | |
|
|
83
|
+
| [Dot] Product | ● | ● | ● | ● | ● | ● |
|
|
84
|
+
| [Spatial] Metric | ● | ● | ● | ● | ● | ● |
|
|
85
|
+
| [Set] Similarity | ● | ● | ● | ● | ● | ● |
|
|
86
|
+
| [Geo]spatial | ● | ● | ● | · | ● | ● |
|
|
87
|
+
| [Mesh] Alignment | ● | ● | ● | · | · | · |
|
|
88
|
+
| [Sparse] Products | ● | ● | ● | · | · | · |
|
|
89
|
+
| [Probability] Divergences | ● | ● | ● | ● | · | ● |
|
|
90
|
+
| [Curved] Spaces | ● | ● | ● | · | · | · |
|
|
91
|
+
| __Many-to-Many Vector Ops__ | | | | | | |
|
|
92
|
+
| "[Dots]" Products | ● | ● | ● | ● | ● | ● |
|
|
93
|
+
| "[Spatials]" Metrics | ● | ● | ● | ● | ● | ● |
|
|
94
|
+
| "[Sets]" Similarities | ● | ● | ● | · | ● | ● |
|
|
95
|
+
| [MaxSim] Scoring | ● | ● | ● | · | ● | ● |
|
|
96
|
+
| __Scalar Ops__ | | | | | | |
|
|
97
|
+
| [Cast] | ● | ● | ● | ● | · | · |
|
|
98
|
+
| [Reduce] | ● | ● | ● | · | · | · |
|
|
99
|
+
| [Each] | ● | ● | ● | · | · | · |
|
|
100
|
+
| [Trigonometry] | ● | ● | ● | · | · | · |
|
|
101
101
|
|
|
102
102
|
[Dot]: include/numkong/dot/README.md
|
|
103
103
|
[Dots]: include/numkong/dots/README.md
|
|
@@ -392,16 +392,16 @@ On x86, older CPUs use __F16C extensions__ (Ivy Bridge+) for fast Float16 → Fl
|
|
|
392
392
|
On Arm, ARMv8.4-A adds __FMLAL/FMLAL2__ instructions for fused Float16 → Float32 widening multiply-accumulate, reducing the total latency from 7 cycles to 4 cycles and achieving 20–48% speedup over the separate convert-then-FMA path.
|
|
393
393
|
|
|
394
394
|
| Platform | BFloat16 Path | Elem/Op | Float16 Path | Elem/Op |
|
|
395
|
-
|
|
|
395
|
+
| :--------------------- | :------------------------- | ------: | :--------------------- | ------: |
|
|
396
396
|
| __x86__ | | | | |
|
|
397
|
-
| Diamond Rapids (
|
|
397
|
+
| Diamond Rapids (2026) | ↓ Genoa | 32 | `VDPPHPS` widening dot | 32 |
|
|
398
398
|
| Sapphire Rapids (2023) | ↓ Genoa | 32 | ↓ Skylake | 16 |
|
|
399
399
|
| Genoa (2022) | `VDPBF16PS` widening dot | 32 | ↓ Skylake | 16 |
|
|
400
400
|
| Skylake (2015) | `SLLI` + `VFMADD` | 16 | `VCVTPH2PS` + `VFMADD` | 16 |
|
|
401
401
|
| Haswell (2013) | `SLLI` + `VFMADD` | 8 | `VCVTPH2PS` + `VFMADD` | 8 |
|
|
402
402
|
| __Arm__ | | | | |
|
|
403
|
-
| Graviton 3 (2021) | `SVBFDOT` widening dot | 4–32 | `SVCVT` → `SVFMLA` | 4–32 |
|
|
404
403
|
| Apple M2+ (2022) | `BFDOT` widening dot | 8 | ↓ FP16FML | 8 |
|
|
404
|
+
| Graviton 3+ (2021) | `SVBFDOT` widening dot | 4–32 | `SVCVT` → `SVFMLA` | 4–32 |
|
|
405
405
|
| Apple M1 (2020) | ↓ NEON | 8 | `FMLAL` widening FMA | 8 |
|
|
406
406
|
| Graviton 2 (2019) | ↓ NEON | 8 | `FCVTL` + `FMLA` | 4 |
|
|
407
407
|
| Graviton 1 (2018) | `SHLL` + `FMLA` | 8 | bit-manip → `FMLA` | 8 |
|
|
@@ -420,14 +420,14 @@ On Arm, ARMv8.4-A adds __FMLAL/FMLAL2__ instructions for fused Float16 → Float
|
|
|
420
420
|
|
|
421
421
|
### Mini-Floats: E4M3, E5M2, E3M2, & E2M3
|
|
422
422
|
|
|
423
|
-
| Format
|
|
424
|
-
|
|
|
425
|
-
| E5M2FN
|
|
426
|
-
| E4M3FN
|
|
427
|
-
| E3M2FN
|
|
428
|
-
| E2M3FN
|
|
429
|
-
|
|
|
430
|
-
|
|
|
423
|
+
| Format | Bits | Range | NumKong Promotion Rules | Support in GPUs |
|
|
424
|
+
| :----------- | ----: | -----: | ------------------------------------- | ----------------- |
|
|
425
|
+
| E5M2FN | 8 | ±57344 | BFloat16 → Float32 | H100+, MI300+ |
|
|
426
|
+
| E4M3FN | 8 | ±448 | BFloat16 → Float32 | H100+, MI300+ |
|
|
427
|
+
| E3M2FN | 6 → 8 | ±28 | B- & Float16 → Float32, Int16 → Int32 | only block-scaled |
|
|
428
|
+
| E2M3FN | 6 → 8 | ±7.5 | B- & Float16 → Float32, Int8 → Int32 | only block-scaled |
|
|
429
|
+
| Scaled NVFP4 | 4 | ±6 | — | B200+ |
|
|
430
|
+
| Scaled MXFP4 | 4 | ±6 | — | B200+, MI325+ |
|
|
431
431
|
|
|
432
432
|
> __Block scaling.__
|
|
433
433
|
> NumKong does not implement block-scaled variants (MXFP4, NVFP4, or block-scaled E3M2/E2M3).
|
|
@@ -444,22 +444,22 @@ E4M3FN (no infinities, NaN only) is preferred for __training__ where precision n
|
|
|
444
444
|
On x86 Genoa/Sapphire Rapids, E4M3/E5M2 values upcast to BFloat16 via lookup tables, then use native __DPBF16PS__ for 2-per-lane dot products accumulating to Float32.
|
|
445
445
|
On Arm Graviton 3+, the same BFloat16 upcast happens via NEON table lookups, then __BFDOT__ instructions complete the computation.
|
|
446
446
|
|
|
447
|
-
| Platform
|
|
448
|
-
|
|
|
449
|
-
| __x86__
|
|
450
|
-
| Diamond Rapids (
|
|
451
|
-
| Genoa (2022)
|
|
452
|
-
| Ice Lake (2019)
|
|
453
|
-
| Skylake (2015)
|
|
454
|
-
| Haswell (2013)
|
|
455
|
-
| __Arm__
|
|
456
|
-
| NEON + FP8DOT (
|
|
457
|
-
| NEON + FP16FML (
|
|
458
|
-
| NEON (
|
|
459
|
-
| __RISC-V__
|
|
460
|
-
| RVV + Zvfbfwma
|
|
461
|
-
| RVV + Zvfh
|
|
462
|
-
| RVV
|
|
447
|
+
| Platform | E5M2 Path | Elem/Op | E4M3 Path | Elem/Op |
|
|
448
|
+
| :-------------------- | :----------------------------- | ------: | :----------------------------- | ------: |
|
|
449
|
+
| __x86__ | | | | |
|
|
450
|
+
| Diamond Rapids (2026) | `VCVTBF82PH` → F16 + `VDPPHPS` | 32 | `VCVTHF82PH` → F16 + `VDPPHPS` | 32 |
|
|
451
|
+
| Genoa (2022) | → BF16 + `VDPBF16PS` | 32 | ↓ Ice Lake | 64 |
|
|
452
|
+
| Ice Lake (2019) | ↓ Skylake | 16 | octave LUT + `VPDPBUSD` | 64 |
|
|
453
|
+
| Skylake (2015) | rebias → F32 FMA | 16 | rebias → F32 FMA | 16 |
|
|
454
|
+
| Haswell (2013) | rebias → F32 FMA | 8 | rebias → F32 FMA | 8 |
|
|
455
|
+
| __Arm__ | | | | |
|
|
456
|
+
| NEON + FP8DOT (2026) | native `FDOT` | 16 | native `FDOT` | 16 |
|
|
457
|
+
| NEON + FP16FML (2020) | SHL → F16 + `FMLAL` | 16 | LUT → F16 + `FMLAL` | 16 |
|
|
458
|
+
| NEON (2018) | SHL + `FCVTL` + FMA | 8 | → F16 + `FCVTL` + FMA | 8 |
|
|
459
|
+
| __RISC-V__ | | | | |
|
|
460
|
+
| RVV + Zvfbfwma | rebias → BF16 + `VFWMACCBF16` | 4–32 | LUT → BF16 + `VFWMACCBF16` | 4–32 |
|
|
461
|
+
| RVV + Zvfh | SHL → F16 + `VFWMACC` | 4–32 | LUT → F16 + `VFWMACC` | 4–32 |
|
|
462
|
+
| RVV | rebias → F32 + `VFMACC` | 4–32 | LUT → F32 + `VFMACC` | 4–32 |
|
|
463
463
|
|
|
464
464
|
> E5M2 shares Float16's exponent bias (15), so E5M2 → Float16 conversion is a single left-shift by 8 bits (`SHL 8`).
|
|
465
465
|
> E4M3 on Ice Lake uses "octave decomposition": the 4-bit exponent splits into 2 octave + 2 remainder bits, yielding 7 integer accumulators post-scaled by powers of 2.
|
|
@@ -469,20 +469,20 @@ Their smaller range allows scaling to exact integers that fit in `i8`/`i16`, ena
|
|
|
469
469
|
Float16 can also serve as an accumulator, accurately representing ~50 products of E3M2FN pairs or ~20 products of E2M3FN pairs before overflow.
|
|
470
470
|
On Arm, NEON FHM extensions bring widening `FMLAL` dot-products for Float16 — both faster and more widely available than `BFDOT` for BFloat16.
|
|
471
471
|
|
|
472
|
-
| Platform
|
|
473
|
-
|
|
|
474
|
-
| __x86__
|
|
475
|
-
|
|
|
476
|
-
|
|
|
477
|
-
|
|
|
478
|
-
| Skylake (2015)
|
|
479
|
-
| Haswell (2013)
|
|
480
|
-
| __Arm__
|
|
481
|
-
| NEON + FP8DOT (
|
|
482
|
-
| NEON + DotProd (
|
|
483
|
-
| NEON (
|
|
484
|
-
| __RISC-V__
|
|
485
|
-
| RVV
|
|
472
|
+
| Platform | E3M2 Path | Elem/Op | E2M3 Path | Elem/Op |
|
|
473
|
+
| :-------------------- | :------------------------- | ------: | :--------------------------- | ------: |
|
|
474
|
+
| __x86__ | | | | |
|
|
475
|
+
| Sierra Forest (2024) | ↓ Haswell | 32 | `VPSHUFB` LUT + `VPDPBSSD` | 32 |
|
|
476
|
+
| Alder Lake (2021) | ↓ Haswell | 32 | `VPSHUFB` LUT + `VPDPBUSD` | 32 |
|
|
477
|
+
| Ice Lake (2019) | `VPERMW` LUT + `VPMADDWD` | 32 | `VPERMB` LUT + `VPDPBUSD` | 64 |
|
|
478
|
+
| Skylake (2015) | `VPSHUFB` LUT + `VPMADDWD` | 64 | `VPSHUFB` LUT + `VPMADDUBSW` | 64 |
|
|
479
|
+
| Haswell (2013) | `VPSHUFB` LUT + `VPMADDWD` | 32 | `VPSHUFB` LUT + `VPMADDUBSW` | 32 |
|
|
480
|
+
| __Arm__ | | | | |
|
|
481
|
+
| NEON + FP8DOT (2026) | → E5M2 + `FDOT` | 16 | → E4M3 + `FDOT` | 16 |
|
|
482
|
+
| NEON + DotProd (2019) | `VQTBL2` LUT + `SMLAL` | 16 | `VQTBL2` LUT + `SDOT` | 16 |
|
|
483
|
+
| NEON (2018) | → F16 + `FCVTL` + FMA | 16 | → F16 + `FCVTL` + FMA | 16 |
|
|
484
|
+
| __RISC-V__ | | | | |
|
|
485
|
+
| RVV | I16 gather LUT + `VWMACC` | 4–32 | U8 gather LUT + `VWMACC` | 4–32 |
|
|
486
486
|
|
|
487
487
|
> E3M2/E2M3 values map to exact integers via 32-entry LUTs (magnitudes up to 448 for E3M2, 120 for E2M3), enabling integer accumulation with no rounding error.
|
|
488
488
|
> On NEON + FP8DOT, E3M2 is first promoted to E5M2 and E2M3 to E4M3 before the hardware `FDOT` instruction.
|
|
@@ -494,7 +494,7 @@ E5M2's range (±57,344) makes the scaled product exceed Int32 entirely.
|
|
|
494
494
|
Without the integer path, E5M2 falls back to Float32 accumulation — where its [2-bit mantissa (only 4 values per binade)](https://developer.nvidia.com/blog/floating-point-8-an-introduction-to-efficient-lower-precision-ai-training/) creates a [catastrophic cancellation risk](https://www.ac.uma.es/arith2024/papers/Fused%20FP8%204-Way%20Dot%20Product%20with%20Scaling%20and%20FP32%20Accumulation.pdf) that E2M3's integer path avoids completely:
|
|
495
495
|
|
|
496
496
|
| | _i_ = 0 | _i_ = 1 | _i_ = 2 | _i_ = 3 | _i_ = 4 | _i_ = 5 | _i_ = 6 |
|
|
497
|
-
|
|
|
497
|
+
| :------ | -------: | ------: | -------: | --------: | -------: | -------: | -------: |
|
|
498
498
|
| _aᵢ_ | 0.00122 | 20480 | −0.00122 | 1.5 | −3072 | −640 | 0.00146 |
|
|
499
499
|
| _bᵢ_ | −40 | 320 | −1280 | −7.63e⁻⁵ | 0.000427 | 10240 | −4.58e⁻⁵ |
|
|
500
500
|
| _aᵢ·bᵢ_ | −0.04883 | 6553600 | 1.5625 | −0.000114 | −1.3125 | −6553600 | ≈ 0 |
|
package/c/numkong.c
CHANGED
|
@@ -943,7 +943,7 @@ BOOL WINAPI DllMain(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpReserved) {
|
|
|
943
943
|
// carries ZA state. So __arm_tpidr2_save is always a no-op and
|
|
944
944
|
// __arm_tpidr2_restore has nothing to restore.
|
|
945
945
|
// Weak linkage lets a real compiler-rt override these if available.
|
|
946
|
-
#if
|
|
946
|
+
#if NK_TARGET_ARM64_ && NK_TARGET_SME
|
|
947
947
|
__attribute__((weak, visibility("default"))) void __arm_tpidr2_save(void) {}
|
|
948
948
|
__attribute__((weak, visibility("default"))) void __arm_tpidr2_restore(void *blk) { nk_unused_(blk); }
|
|
949
949
|
#endif
|
|
package/include/numkong/attention/sapphireamx.h
CHANGED
|
@@ -62,7 +62,7 @@
|
|
|
62
62
|
#ifndef NK_ATTENTION_SAPPHIREAMX_H
|
|
63
63
|
#define NK_ATTENTION_SAPPHIREAMX_H
|
|
64
64
|
|
|
65
|
-
#if
|
|
65
|
+
#if NK_TARGET_X8664_
|
|
66
66
|
#if NK_TARGET_SAPPHIREAMX
|
|
67
67
|
|
|
68
68
|
#include "numkong/types.h"
|
|
@@ -1359,5 +1359,5 @@ NK_PUBLIC void nk_attention_causal_bf16_sapphireamx(nk_bf16_t const *q, void con
|
|
|
1359
1359
|
#endif
|
|
1360
1360
|
|
|
1361
1361
|
#endif // NK_TARGET_SAPPHIREAMX
|
|
1362
|
-
#endif //
|
|
1362
|
+
#endif // NK_TARGET_X8664_
|
|
1363
1363
|
#endif // NK_ATTENTION_SAPPHIREAMX_H
|
|
package/include/numkong/attention/sme.h
CHANGED
|
@@ -91,7 +91,7 @@
|
|
|
91
91
|
#ifndef NK_ATTENTION_SME_H
|
|
92
92
|
#define NK_ATTENTION_SME_H
|
|
93
93
|
|
|
94
|
-
#if
|
|
94
|
+
#if NK_TARGET_ARM64_
|
|
95
95
|
#if NK_TARGET_SME
|
|
96
96
|
|
|
97
97
|
#include "numkong/types.h"
|
|
@@ -2068,5 +2068,5 @@ NK_PUBLIC void nk_attention_causal_f16_sme(nk_f16_t const *q, void const *kv_pac
|
|
|
2068
2068
|
#endif
|
|
2069
2069
|
|
|
2070
2070
|
#endif // NK_TARGET_SME
|
|
2071
|
-
#endif //
|
|
2071
|
+
#endif // NK_TARGET_ARM64_
|
|
2072
2072
|
#endif // NK_ATTENTION_SME_H
|
|
package/include/numkong/capabilities.h
CHANGED
|
@@ -96,7 +96,7 @@
|
|
|
96
96
|
|
|
97
97
|
#define NK_VERSION_MAJOR 7
|
|
98
98
|
#define NK_VERSION_MINOR 4
|
|
99
|
-
#define NK_VERSION_PATCH
|
|
99
|
+
#define NK_VERSION_PATCH 3
|
|
100
100
|
|
|
101
101
|
/**
|
|
102
102
|
* @brief Removes compile-time dispatching, and replaces it with runtime dispatching.
|
|
@@ -132,33 +132,33 @@
|
|
|
132
132
|
// With `-std=c11` glibc hides `syscall()` behind `_GNU_SOURCE`, but if any
|
|
133
133
|
// system header was included before us, `<features.h>` is already locked.
|
|
134
134
|
// Forward-declare `syscall` directly — it always exists in glibc.
|
|
135
|
-
#if defined(NK_DEFINED_LINUX_) && (
|
|
135
|
+
#if defined(NK_DEFINED_LINUX_) && (NK_TARGET_X8664_ || NK_TARGET_RISCV64_)
|
|
136
136
|
#include <sys/syscall.h> // `SYS_arch_prctl`, `SYS_riscv_hwprobe`
|
|
137
137
|
#ifdef __cplusplus
|
|
138
138
|
extern "C" long syscall(long, ...) noexcept;
|
|
139
139
|
#else
|
|
140
140
|
extern long syscall(long, ...);
|
|
141
141
|
#endif
|
|
142
|
-
#if
|
|
142
|
+
#if NK_TARGET_RISCV64_
|
|
143
143
|
#include <sys/auxv.h> // `getauxval`, `AT_HWCAP`
|
|
144
144
|
#endif
|
|
145
145
|
#endif
|
|
146
146
|
|
|
147
|
-
#if defined(NK_DEFINED_LINUX_) &&
|
|
147
|
+
#if defined(NK_DEFINED_LINUX_) && NK_TARGET_LOONGARCH64_
|
|
148
148
|
#include <sys/auxv.h> // `getauxval`, `AT_HWCAP`
|
|
149
149
|
#endif
|
|
150
150
|
|
|
151
|
-
#if defined(NK_DEFINED_LINUX_) &&
|
|
151
|
+
#if defined(NK_DEFINED_LINUX_) && NK_TARGET_POWER64_
|
|
152
152
|
#include <sys/auxv.h> // `getauxval`, `AT_HWCAP`
|
|
153
153
|
#endif
|
|
154
154
|
|
|
155
155
|
// On FreeBSD RISC-V, we use elf_aux_info for capability detection
|
|
156
|
-
#if defined(NK_DEFINED_FREEBSD_) &&
|
|
156
|
+
#if defined(NK_DEFINED_FREEBSD_) && NK_TARGET_RISCV64_
|
|
157
157
|
#include <sys/auxv.h> // `elf_aux_info`, `AT_HWCAP`
|
|
158
158
|
#endif
|
|
159
159
|
|
|
160
160
|
// On Windows ARM, we use IsProcessorFeaturePresent API for capability detection
|
|
161
|
-
#if defined(NK_DEFINED_WINDOWS_) &&
|
|
161
|
+
#if defined(NK_DEFINED_WINDOWS_) && NK_TARGET_ARM64_
|
|
162
162
|
#include <processthreadsapi.h> // `IsProcessorFeaturePresent`
|
|
163
163
|
#endif
|
|
164
164
|
|
|
@@ -388,7 +388,7 @@ typedef void (*nk_kernel_cast_punned_t)(void const *from, nk_dtype_t from_type,
|
|
|
388
388
|
|
|
389
389
|
typedef void (*nk_kernel_punned_t)(void *);
|
|
390
390
|
|
|
391
|
-
#if
|
|
391
|
+
#if NK_TARGET_X8664_
|
|
392
392
|
|
|
393
393
|
NK_PUBLIC int nk_configure_thread_x86_(nk_capability_t capabilities) {
|
|
394
394
|
#if NK_TARGET_SAPPHIREAMX
|
|
@@ -409,7 +409,7 @@ NK_PUBLIC int nk_configure_thread_x86_(nk_capability_t capabilities) {
|
|
|
409
409
|
return 1;
|
|
410
410
|
}
|
|
411
411
|
|
|
412
|
-
NK_PUBLIC nk_capability_t nk_capabilities_x86_(void) {
|
|
412
|
+
NK_PUBLIC nk_capability_t nk_capabilities_x8664_(void) {
|
|
413
413
|
union four_registers_t {
|
|
414
414
|
int array[4];
|
|
415
415
|
struct separate_t {
|
|
@@ -496,9 +496,9 @@ NK_PUBLIC nk_capability_t nk_capabilities_x86_(void) {
|
|
|
496
496
|
(nk_cap_graniteamx_k * supports_graniteamx) | (nk_cap_serial_k));
|
|
497
497
|
}
|
|
498
498
|
|
|
499
|
-
#endif //
|
|
499
|
+
#endif // NK_TARGET_X8664_
|
|
500
500
|
|
|
501
|
-
#if
|
|
501
|
+
#if NK_TARGET_ARM64_
|
|
502
502
|
|
|
503
503
|
#if defined(__clang__)
|
|
504
504
|
#pragma clang attribute push(__attribute__((target("arch=armv8.5-a+sve"))), apply_to = function)
|
|
@@ -508,14 +508,14 @@ NK_PUBLIC nk_capability_t nk_capabilities_x86_(void) {
|
|
|
508
508
|
#endif
|
|
509
509
|
|
|
510
510
|
#if NK_HAS_POSIX_EXTENSIONS_
|
|
511
|
-
static sigjmp_buf
|
|
512
|
-
static void
|
|
511
|
+
static sigjmp_buf nk_mrs_arm64_jump_buffer_;
|
|
512
|
+
static void nk_mrs_arm64_sigill_handler_(int sig) {
|
|
513
513
|
nk_unused_(sig);
|
|
514
|
-
siglongjmp(
|
|
514
|
+
siglongjmp(nk_mrs_arm64_jump_buffer_, 1);
|
|
515
515
|
}
|
|
516
516
|
#endif
|
|
517
517
|
|
|
518
|
-
NK_PUBLIC int nk_configure_thread_arm_(nk_capability_t capabilities) {
|
|
518
|
+
NK_PUBLIC int nk_configure_thread_arm64_(nk_capability_t capabilities) {
|
|
519
519
|
#if defined(_MSC_VER)
|
|
520
520
|
nk_unused_(capabilities);
|
|
521
521
|
return 1;
|
|
@@ -546,7 +546,7 @@ NK_PUBLIC int nk_configure_thread_arm_(nk_capability_t capabilities) {
|
|
|
546
546
|
|
|
547
547
|
#elif defined(NK_DEFINED_LINUX_) || defined(NK_DEFINED_FREEBSD_)
|
|
548
548
|
// Read ID registers via MRS. Only safe if MRS is known to work — indicated by
|
|
549
|
-
// capabilities beyond basic NEON (
|
|
549
|
+
// capabilities beyond basic NEON (nk_capabilities_arm64_ validated MRS via sigaction probe).
|
|
550
550
|
if (capabilities & ~(nk_cap_neon_k | nk_cap_serial_k)) {
|
|
551
551
|
// FEAT_EBF16: ID_AA64ISAR1_EL1.BF16 bits [47:44] >= 0b0010
|
|
552
552
|
register unsigned long isar1_val __asm__("x0");
|
|
@@ -570,7 +570,7 @@ NK_PUBLIC int nk_configure_thread_arm_(nk_capability_t capabilities) {
|
|
|
570
570
|
#endif // _MSC_VER
|
|
571
571
|
}
|
|
572
572
|
|
|
573
|
-
NK_PUBLIC nk_capability_t nk_capabilities_arm_(void) {
|
|
573
|
+
NK_PUBLIC nk_capability_t nk_capabilities_arm64_(void) {
|
|
574
574
|
#if defined(NK_DEFINED_APPLE_)
|
|
575
575
|
size_t size = sizeof(unsigned);
|
|
576
576
|
unsigned supports_neon = 0, supports_fp16 = 0, supports_fhm = 0, supports_bf16 = 0, supports_i8mm = 0;
|
|
@@ -602,13 +602,13 @@ NK_PUBLIC nk_capability_t nk_capabilities_arm_(void) {
|
|
|
602
602
|
|
|
603
603
|
#if NK_HAS_POSIX_EXTENSIONS_
|
|
604
604
|
struct sigaction action_new, action_old;
|
|
605
|
-
action_new.sa_handler =
|
|
605
|
+
action_new.sa_handler = nk_mrs_arm64_sigill_handler_;
|
|
606
606
|
sigemptyset(&action_new.sa_mask);
|
|
607
607
|
action_new.sa_flags = 0;
|
|
608
608
|
|
|
609
609
|
int mrs_works = 0;
|
|
610
610
|
if (sigaction(SIGILL, &action_new, &action_old) == 0) {
|
|
611
|
-
if (sigsetjmp(
|
|
611
|
+
if (sigsetjmp(nk_mrs_arm64_jump_buffer_, 1) == 0) {
|
|
612
612
|
register unsigned long midr_value __asm__("x0");
|
|
613
613
|
__asm__ __volatile__(".inst 0xD5380000" : "=r"(midr_value)); // MRS x0, MIDR_EL1
|
|
614
614
|
mrs_works = 1;
|
|
@@ -722,11 +722,11 @@ NK_PUBLIC nk_capability_t nk_capabilities_arm_(void) {
|
|
|
722
722
|
#pragma GCC pop_options
|
|
723
723
|
#endif
|
|
724
724
|
|
|
725
|
-
#endif //
|
|
725
|
+
#endif // NK_TARGET_ARM64_
|
|
726
726
|
|
|
727
|
-
#if
|
|
727
|
+
#if NK_TARGET_RISCV64_
|
|
728
728
|
|
|
729
|
-
NK_PUBLIC nk_capability_t
|
|
729
|
+
NK_PUBLIC nk_capability_t nk_capabilities_riscv64_(void) {
|
|
730
730
|
#if defined(NK_DEFINED_LINUX_)
|
|
731
731
|
unsigned long hwcap = getauxval(AT_HWCAP);
|
|
732
732
|
nk_capability_t caps = nk_cap_serial_k;
|
|
@@ -758,11 +758,11 @@ NK_PUBLIC nk_capability_t nk_capabilities_riscv_(void) {
|
|
|
758
758
|
#endif
|
|
759
759
|
}
|
|
760
760
|
|
|
761
|
-
#endif //
|
|
761
|
+
#endif // NK_TARGET_RISCV64_
|
|
762
762
|
|
|
763
|
-
#if
|
|
763
|
+
#if NK_TARGET_LOONGARCH64_
|
|
764
764
|
|
|
765
|
-
NK_PUBLIC nk_capability_t
|
|
765
|
+
NK_PUBLIC nk_capability_t nk_capabilities_loongarch64_(void) {
|
|
766
766
|
#if defined(NK_DEFINED_LINUX_)
|
|
767
767
|
unsigned long hwcap = getauxval(AT_HWCAP);
|
|
768
768
|
nk_capability_t caps = nk_cap_serial_k;
|
|
@@ -774,11 +774,11 @@ NK_PUBLIC nk_capability_t nk_capabilities_loongarch_(void) {
|
|
|
774
774
|
#endif
|
|
775
775
|
}
|
|
776
776
|
|
|
777
|
-
#endif //
|
|
777
|
+
#endif // NK_TARGET_LOONGARCH64_
|
|
778
778
|
|
|
779
|
-
#if
|
|
779
|
+
#if NK_TARGET_POWER64_
|
|
780
780
|
|
|
781
|
-
NK_PUBLIC nk_capability_t
|
|
781
|
+
NK_PUBLIC nk_capability_t nk_capabilities_power64_(void) {
|
|
782
782
|
#if defined(NK_DEFINED_LINUX_)
|
|
783
783
|
unsigned long hwcap = getauxval(AT_HWCAP);
|
|
784
784
|
unsigned long hwcap2 = getauxval(AT_HWCAP2);
|
|
@@ -792,7 +792,7 @@ NK_PUBLIC nk_capability_t nk_capabilities_power_(void) {
|
|
|
792
792
|
#endif
|
|
793
793
|
}
|
|
794
794
|
|
|
795
|
-
#endif //
|
|
795
|
+
#endif // NK_TARGET_POWER64_
|
|
796
796
|
|
|
797
797
|
#if NK_TARGET_WASM_
|
|
798
798
|
|
|
@@ -826,27 +826,27 @@ NK_PUBLIC nk_capability_t nk_capabilities_v128relaxed_(void) {
|
|
|
826
826
|
#endif // NK_TARGET_WASM_
|
|
827
827
|
|
|
828
828
|
NK_PUBLIC int nk_configure_thread_(nk_capability_t capabilities) {
|
|
829
|
-
#if
|
|
829
|
+
#if NK_TARGET_X8664_
|
|
830
830
|
return nk_configure_thread_x86_(capabilities);
|
|
831
831
|
#endif
|
|
832
|
-
#if
|
|
833
|
-
return
|
|
832
|
+
#if NK_TARGET_ARM64_
|
|
833
|
+
return nk_configure_thread_arm64_(capabilities);
|
|
834
834
|
#endif
|
|
835
835
|
nk_unused_(capabilities);
|
|
836
836
|
return 1; // success — no platform-specific thread configuration needed
|
|
837
837
|
}
|
|
838
838
|
|
|
839
839
|
NK_PUBLIC nk_capability_t nk_capabilities_(void) {
|
|
840
|
-
#if
|
|
841
|
-
return
|
|
842
|
-
#elif
|
|
843
|
-
return
|
|
844
|
-
#elif
|
|
845
|
-
return
|
|
846
|
-
#elif
|
|
847
|
-
return
|
|
848
|
-
#elif
|
|
849
|
-
return
|
|
840
|
+
#if NK_TARGET_X8664_
|
|
841
|
+
return nk_capabilities_x8664_();
|
|
842
|
+
#elif NK_TARGET_ARM64_
|
|
843
|
+
return nk_capabilities_arm64_();
|
|
844
|
+
#elif NK_TARGET_RISCV64_
|
|
845
|
+
return nk_capabilities_riscv64_();
|
|
846
|
+
#elif NK_TARGET_LOONGARCH64_
|
|
847
|
+
return nk_capabilities_loongarch64_();
|
|
848
|
+
#elif NK_TARGET_POWER64_
|
|
849
|
+
return nk_capabilities_power64_();
|
|
850
850
|
#elif NK_TARGET_WASM_
|
|
851
851
|
return nk_capabilities_v128relaxed_();
|
|
852
852
|
#else
|
|
@@ -860,7 +860,7 @@ NK_PUBLIC nk_capability_t nk_capabilities_(void) {
|
|
|
860
860
|
*/
|
|
861
861
|
NK_PUBLIC nk_capability_t nk_capabilities_compiled_(void) {
|
|
862
862
|
nk_capability_t caps = nk_cap_serial_k;
|
|
863
|
-
#if
|
|
863
|
+
#if NK_TARGET_X8664_
|
|
864
864
|
caps |= nk_cap_haswell_k * NK_TARGET_HASWELL;
|
|
865
865
|
caps |= nk_cap_skylake_k * NK_TARGET_SKYLAKE;
|
|
866
866
|
caps |= nk_cap_icelake_k * NK_TARGET_ICELAKE;
|
|
@@ -873,7 +873,7 @@ NK_PUBLIC nk_capability_t nk_capabilities_compiled_(void) {
|
|
|
873
873
|
caps |= nk_cap_alder_k * NK_TARGET_ALDER;
|
|
874
874
|
caps |= nk_cap_sierra_k * NK_TARGET_SIERRA;
|
|
875
875
|
#endif
|
|
876
|
-
#if
|
|
876
|
+
#if NK_TARGET_ARM64_
|
|
877
877
|
caps |= nk_cap_neon_k * NK_TARGET_NEON;
|
|
878
878
|
caps |= nk_cap_neonhalf_k * NK_TARGET_NEONHALF;
|
|
879
879
|
caps |= nk_cap_neonsdot_k * NK_TARGET_NEONSDOT;
|
|
@@ -896,16 +896,16 @@ NK_PUBLIC nk_capability_t nk_capabilities_compiled_(void) {
|
|
|
896
896
|
caps |= nk_cap_smelut2_k * NK_TARGET_SMELUT2;
|
|
897
897
|
caps |= nk_cap_smefa64_k * NK_TARGET_SMEFA64;
|
|
898
898
|
#endif
|
|
899
|
-
#if
|
|
899
|
+
#if NK_TARGET_RISCV64_
|
|
900
900
|
caps |= nk_cap_rvv_k * NK_TARGET_RVV;
|
|
901
901
|
caps |= nk_cap_rvvhalf_k * NK_TARGET_RVVHALF;
|
|
902
902
|
caps |= nk_cap_rvvbf16_k * NK_TARGET_RVVBF16;
|
|
903
903
|
caps |= nk_cap_rvvbb_k * NK_TARGET_RVVBB;
|
|
904
904
|
#endif
|
|
905
|
-
#if
|
|
905
|
+
#if NK_TARGET_LOONGARCH64_
|
|
906
906
|
caps |= nk_cap_loongsonasx_k * NK_TARGET_LOONGSONASX;
|
|
907
907
|
#endif
|
|
908
|
-
#if
|
|
908
|
+
#if NK_TARGET_POWER64_
|
|
909
909
|
caps |= nk_cap_powervsx_k * NK_TARGET_POWERVSX;
|
|
910
910
|
#endif
|
|
911
911
|
#if NK_TARGET_WASM_
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
#ifndef NK_CAST_DIAMOND_H
|
|
13
13
|
#define NK_CAST_DIAMOND_H
|
|
14
14
|
|
|
15
|
-
#if
|
|
15
|
+
#if NK_TARGET_X8664_
|
|
16
16
|
#if NK_TARGET_DIAMOND
|
|
17
17
|
|
|
18
18
|
#include "numkong/types.h"
|
|
@@ -60,5 +60,5 @@ NK_INTERNAL void nk_partial_load_e5m2x32_to_f16x32_diamond_(nk_e5m2_t const *src
|
|
|
60
60
|
#endif
|
|
61
61
|
|
|
62
62
|
#endif // NK_TARGET_DIAMOND
|
|
63
|
-
#endif //
|
|
63
|
+
#endif // NK_TARGET_X8664_
|
|
64
64
|
#endif // NK_CAST_DIAMOND_H
|
|
@@ -20,7 +20,7 @@
|
|
|
20
20
|
#ifndef NK_CAST_HASWELL_H
|
|
21
21
|
#define NK_CAST_HASWELL_H
|
|
22
22
|
|
|
23
|
-
#if
|
|
23
|
+
#if NK_TARGET_X8664_
|
|
24
24
|
#if NK_TARGET_HASWELL
|
|
25
25
|
|
|
26
26
|
#include "numkong/types.h"
|
|
@@ -819,5 +819,5 @@ NK_PUBLIC void nk_cast_haswell(void const *from, nk_dtype_t from_type, nk_size_t
|
|
|
819
819
|
#endif
|
|
820
820
|
|
|
821
821
|
#endif // NK_TARGET_HASWELL
|
|
822
|
-
#endif //
|
|
822
|
+
#endif // NK_TARGET_X8664_
|
|
823
823
|
#endif // NK_CAST_HASWELL_H
|
|
@@ -19,7 +19,7 @@
|
|
|
19
19
|
#ifndef NK_CAST_ICELAKE_H
|
|
20
20
|
#define NK_CAST_ICELAKE_H
|
|
21
21
|
|
|
22
|
-
#if
|
|
22
|
+
#if NK_TARGET_X8664_
|
|
23
23
|
#if NK_TARGET_ICELAKE
|
|
24
24
|
|
|
25
25
|
#include "numkong/types.h"
|
|
@@ -471,5 +471,5 @@ NK_PUBLIC void nk_cast_icelake(void const *from, nk_dtype_t from_type, nk_size_t
|
|
|
471
471
|
#endif
|
|
472
472
|
|
|
473
473
|
#endif // NK_TARGET_ICELAKE
|
|
474
|
-
#endif //
|
|
474
|
+
#endif // NK_TARGET_X8664_
|
|
475
475
|
#endif // NK_CAST_ICELAKE_H
|
|
@@ -27,7 +27,7 @@
|
|
|
27
27
|
#ifndef NK_CAST_LOONGSONASX_H
|
|
28
28
|
#define NK_CAST_LOONGSONASX_H
|
|
29
29
|
|
|
30
|
-
#if
|
|
30
|
+
#if NK_TARGET_LOONGARCH64_
|
|
31
31
|
#if NK_TARGET_LOONGSONASX
|
|
32
32
|
|
|
33
33
|
#include "numkong/types.h"
|
|
@@ -248,5 +248,5 @@ NK_INTERNAL void nk_euclidean_through_u32_from_dot_loongsonasx_(nk_b128_vec_t do
|
|
|
248
248
|
#endif
|
|
249
249
|
|
|
250
250
|
#endif // NK_TARGET_LOONGSONASX
|
|
251
|
-
#endif //
|
|
251
|
+
#endif // NK_TARGET_LOONGARCH64_
|
|
252
252
|
#endif // NK_CAST_LOONGSONASX_H
|
|
@@ -49,7 +49,7 @@
|
|
|
49
49
|
#ifndef NK_CAST_NEON_H
|
|
50
50
|
#define NK_CAST_NEON_H
|
|
51
51
|
|
|
52
|
-
#if
|
|
52
|
+
#if NK_TARGET_ARM64_
|
|
53
53
|
#if NK_TARGET_NEON
|
|
54
54
|
|
|
55
55
|
#include "numkong/types.h"
|
|
@@ -1155,5 +1155,5 @@ NK_PUBLIC void nk_cast_neon(void const *from, nk_dtype_t from_type, nk_size_t n,
|
|
|
1155
1155
|
#endif
|
|
1156
1156
|
|
|
1157
1157
|
#endif // NK_TARGET_NEON
|
|
1158
|
-
#endif //
|
|
1158
|
+
#endif // NK_TARGET_ARM64_
|
|
1159
1159
|
#endif // NK_CAST_NEON_H
|