npm - numkong - Versions diffs - 7.4.2 → 7.4.3 - Mend

numkong 7.4.2 → 7.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (172)

package/README.md +84 -84
package/c/numkong.c +1 -1
package/include/numkong/attention/sapphireamx.h +2 -2
package/include/numkong/attention/sme.h +2 -2
package/include/numkong/capabilities.h +47 -47
package/include/numkong/cast/diamond.h +2 -2
package/include/numkong/cast/haswell.h +2 -2
package/include/numkong/cast/icelake.h +2 -2
package/include/numkong/cast/loongsonasx.h +2 -2
package/include/numkong/cast/neon.h +2 -2
package/include/numkong/cast/powervsx.h +2 -2
package/include/numkong/cast/rvv.h +2 -2
package/include/numkong/cast/sapphire.h +2 -2
package/include/numkong/cast/skylake.h +2 -2
package/include/numkong/curved/genoa.h +2 -2
package/include/numkong/curved/haswell.h +2 -2
package/include/numkong/curved/neon.h +2 -2
package/include/numkong/curved/neonbfdot.h +2 -2
package/include/numkong/curved/rvv.h +2 -2
package/include/numkong/curved/skylake.h +2 -2
package/include/numkong/curved/smef64.h +2 -2
package/include/numkong/dot/alder.h +2 -2
package/include/numkong/dot/diamond.h +2 -2
package/include/numkong/dot/genoa.h +2 -2
package/include/numkong/dot/haswell.h +2 -2
package/include/numkong/dot/icelake.h +2 -2
package/include/numkong/dot/loongsonasx.h +2 -2
package/include/numkong/dot/neon.h +2 -2
package/include/numkong/dot/neonbfdot.h +2 -2
package/include/numkong/dot/neonfhm.h +2 -2
package/include/numkong/dot/neonfp8.h +2 -2
package/include/numkong/dot/neonsdot.h +2 -2
package/include/numkong/dot/rvv.h +2 -2
package/include/numkong/dot/rvvbb.h +2 -2
package/include/numkong/dot/rvvbf16.h +2 -2
package/include/numkong/dot/rvvhalf.h +2 -2
package/include/numkong/dot/sapphire.h +2 -2
package/include/numkong/dot/sierra.h +2 -2
package/include/numkong/dot/skylake.h +2 -2
package/include/numkong/dot/sve.h +2 -2
package/include/numkong/dot/svebfdot.h +2 -2
package/include/numkong/dot/svehalf.h +2 -2
package/include/numkong/dot/svesdot.h +2 -2
package/include/numkong/dots/alder.h +2 -2
package/include/numkong/dots/diamond.h +2 -2
package/include/numkong/dots/genoa.h +2 -2
package/include/numkong/dots/haswell.h +2 -2
package/include/numkong/dots/icelake.h +2 -2
package/include/numkong/dots/loongsonasx.h +2 -2
package/include/numkong/dots/neon.h +2 -2
package/include/numkong/dots/neonbfdot.h +2 -2
package/include/numkong/dots/neonfhm.h +2 -2
package/include/numkong/dots/neonfp8.h +2 -2
package/include/numkong/dots/neonsdot.h +2 -2
package/include/numkong/dots/powervsx.h +2 -2
package/include/numkong/dots/rvv.h +2 -2
package/include/numkong/dots/sapphireamx.h +2 -2
package/include/numkong/dots/sierra.h +2 -2
package/include/numkong/dots/skylake.h +2 -2
package/include/numkong/dots/sme.h +10 -10
package/include/numkong/dots/smebi32.h +2 -2
package/include/numkong/dots/smef64.h +2 -2
package/include/numkong/dots/smehalf.h +2 -2
package/include/numkong/each/haswell.h +2 -2
package/include/numkong/each/icelake.h +2 -2
package/include/numkong/each/neon.h +2 -2
package/include/numkong/each/neonbfdot.h +2 -2
package/include/numkong/each/neonhalf.h +2 -2
package/include/numkong/each/rvv.h +2 -2
package/include/numkong/each/sapphire.h +2 -2
package/include/numkong/each/skylake.h +2 -2
package/include/numkong/geospatial/haswell.h +2 -2
package/include/numkong/geospatial/neon.h +2 -2
package/include/numkong/geospatial/rvv.h +2 -2
package/include/numkong/geospatial/skylake.h +2 -2
package/include/numkong/maxsim/alder.h +2 -2
package/include/numkong/maxsim/genoa.h +2 -2
package/include/numkong/maxsim/haswell.h +2 -2
package/include/numkong/maxsim/icelake.h +2 -2
package/include/numkong/maxsim/neonsdot.h +2 -2
package/include/numkong/maxsim/sapphireamx.h +2 -2
package/include/numkong/maxsim/sme.h +2 -2
package/include/numkong/mesh/haswell.h +2 -2
package/include/numkong/mesh/neon.h +2 -2
package/include/numkong/mesh/neonbfdot.h +2 -2
package/include/numkong/mesh/rvv.h +2 -2
package/include/numkong/mesh/skylake.h +2 -2
package/include/numkong/numkong.h +1 -1
package/include/numkong/probability/haswell.h +2 -2
package/include/numkong/probability/neon.h +2 -2
package/include/numkong/probability/rvv.h +2 -2
package/include/numkong/probability/skylake.h +2 -2
package/include/numkong/reduce/alder.h +2 -2
package/include/numkong/reduce/genoa.h +2 -2
package/include/numkong/reduce/haswell.h +2 -2
package/include/numkong/reduce/icelake.h +2 -2
package/include/numkong/reduce/neon.h +2 -2
package/include/numkong/reduce/neonbfdot.h +2 -2
package/include/numkong/reduce/neonfhm.h +2 -2
package/include/numkong/reduce/neonsdot.h +2 -2
package/include/numkong/reduce/rvv.h +2 -2
package/include/numkong/reduce/sierra.h +2 -2
package/include/numkong/reduce/skylake.h +2 -2
package/include/numkong/scalar/haswell.h +2 -2
package/include/numkong/scalar/loongsonasx.h +2 -2
package/include/numkong/scalar/neon.h +2 -2
package/include/numkong/scalar/neonhalf.h +2 -2
package/include/numkong/scalar/powervsx.h +2 -2
package/include/numkong/scalar/rvv.h +2 -2
package/include/numkong/scalar/sapphire.h +2 -2
package/include/numkong/set/haswell.h +2 -2
package/include/numkong/set/icelake.h +2 -2
package/include/numkong/set/loongsonasx.h +2 -2
package/include/numkong/set/neon.h +2 -2
package/include/numkong/set/powervsx.h +2 -2
package/include/numkong/set/rvv.h +2 -2
package/include/numkong/set/rvvbb.h +2 -2
package/include/numkong/set/sve.h +2 -2
package/include/numkong/sets/haswell.h +2 -2
package/include/numkong/sets/icelake.h +2 -2
package/include/numkong/sets/loongsonasx.h +2 -2
package/include/numkong/sets/neon.h +2 -2
package/include/numkong/sets/powervsx.h +2 -2
package/include/numkong/sets/smebi32.h +2 -2
package/include/numkong/sparse/icelake.h +2 -2
package/include/numkong/sparse/neon.h +2 -2
package/include/numkong/sparse/sve2.h +2 -2
package/include/numkong/sparse/turin.h +2 -2
package/include/numkong/spatial/alder.h +2 -2
package/include/numkong/spatial/diamond.h +2 -2
package/include/numkong/spatial/genoa.h +2 -2
package/include/numkong/spatial/haswell.h +2 -2
package/include/numkong/spatial/icelake.h +2 -2
package/include/numkong/spatial/loongsonasx.h +2 -2
package/include/numkong/spatial/neon.h +2 -2
package/include/numkong/spatial/neonbfdot.h +2 -2
package/include/numkong/spatial/neonfp8.h +2 -2
package/include/numkong/spatial/neonsdot.h +2 -2
package/include/numkong/spatial/powervsx.h +2 -2
package/include/numkong/spatial/rvv.h +2 -2
package/include/numkong/spatial/rvvbf16.h +2 -2
package/include/numkong/spatial/rvvhalf.h +2 -2
package/include/numkong/spatial/sierra.h +2 -2
package/include/numkong/spatial/skylake.h +2 -2
package/include/numkong/spatial/sve.h +2 -2
package/include/numkong/spatial/svebfdot.h +2 -2
package/include/numkong/spatial/svehalf.h +2 -2
package/include/numkong/spatial/svesdot.h +2 -2
package/include/numkong/spatials/alder.h +2 -2
package/include/numkong/spatials/diamond.h +2 -2
package/include/numkong/spatials/genoa.h +2 -2
package/include/numkong/spatials/haswell.h +2 -2
package/include/numkong/spatials/icelake.h +2 -2
package/include/numkong/spatials/loongsonasx.h +2 -2
package/include/numkong/spatials/neon.h +2 -2
package/include/numkong/spatials/neonbfdot.h +2 -2
package/include/numkong/spatials/neonfhm.h +2 -2
package/include/numkong/spatials/neonfp8.h +2 -2
package/include/numkong/spatials/neonsdot.h +2 -2
package/include/numkong/spatials/powervsx.h +2 -2
package/include/numkong/spatials/rvv.h +2 -2
package/include/numkong/spatials/sapphireamx.h +2 -2
package/include/numkong/spatials/sierra.h +2 -2
package/include/numkong/spatials/skylake.h +2 -2
package/include/numkong/spatials/sme.h +2 -2
package/include/numkong/spatials/smef64.h +2 -2
package/include/numkong/trigonometry/haswell.h +2 -2
package/include/numkong/trigonometry/neon.h +2 -2
package/include/numkong/trigonometry/rvv.h +2 -2
package/include/numkong/trigonometry/skylake.h +2 -2
package/include/numkong/types.h +88 -80
package/package.json +7 -7

package/README.md CHANGED Viewed

@@ -10,39 +10,39 @@ Most libraries return dot products in the __same type as the input__ — Float16
 This leads to quiet overflow: a 2048-dimensional `i8` dot product can reach ±10 million, but `i8` maxes out at 127.
 NumKong promotes to wider accumulators — Float16 → Float32, BFloat16 → Float32, Int8 → Int32, Float32 → Float64 — so results stay in range.
-> Single 2048-d dot product on Intel [Sapphire Rapids](https://en.wikipedia.org/wiki/Sapphire_Rapids), single-threaded.
+| Input  |     NumPy + OpenBLAS |        PyTorch + MKL |                  JAX |               NumKong |
+| :----- | -------------------: | -------------------: | -------------------: | --------------------: |
+|        |       ░░░░░░░░░░░░░░ |       ░░░░░░░░░░░░░░ |       ░░░░░░░░░░░░░░ |        ░░░░░░░░░░░░░░ |
+| `f64`  | 2.0 gso/s, 1e-15 err | 0.6 gso/s, 1e-15 err | 0.4 gso/s, 1e-14 err |  5.8 gso/s, 1e-16 err |
+| `f32`  |  1.5 gso/s, 2e-6 err |  0.6 gso/s, 2e-6 err |  0.4 gso/s, 5e-6 err |   7.1 gso/s, 2e-7 err |
+| `bf16` |                    — |  0.5 gso/s, 1.9% err |  0.5 gso/s, 1.9% err |   9.7 gso/s, 1.8% err |
+| `f16`  | 0.2 gso/s, 0.25% err | 0.5 gso/s, 0.25% err | 0.4 gso/s, 0.25% err | 11.5 gso/s, 0.24% err |
+| `e5m2` |                    — |  0.7 gso/s, 4.6% err |  0.5 gso/s, 4.6% err |     7.1 gso/s, 0% err |
+| `i8`   |  1.1 gso/s, overflow |  0.5 gso/s, overflow |  0.5 gso/s, overflow |    14.8 gso/s, 0% err |
+> Single 2048-d dot product on Intel Sapphire Rapids, single-threaded.
 > Each cell shows __gso/s, mean relative error__ vs higher-precision reference.
 > gso/s = Giga Scalar Operations per Second — a more suitable name than GFLOP/s when counting both integer and floating-point work.
 > NumPy 2.4, PyTorch 2.10, JAX 0.9.
-| Input  |        NumPy + OpenBLAS |           PyTorch + MKL |                     JAX |               NumKong |
-| :----- | ----------------------: | ----------------------: | ----------------------: | --------------------: |
-|        |          ░░░░░░░░░░░░░░ |          ░░░░░░░░░░░░░░ |          ░░░░░░░░░░░░░░ |        ░░░░░░░░░░░░░░ |
-| `f64`  |    2.0 gso/s, 1e-15 err |    0.6 gso/s, 1e-15 err |    0.4 gso/s, 1e-14 err |  5.8 gso/s, 1e-16 err |
-| `f32`  |     1.5 gso/s, 2e-6 err |     0.6 gso/s, 2e-6 err |     0.4 gso/s, 5e-6 err |   7.1 gso/s, 2e-7 err |
-| `bf16` |                       — |     0.5 gso/s, 1.9% err |     0.5 gso/s, 1.9% err |   9.7 gso/s, 1.8% err |
-| `f16`  |    0.2 gso/s, 0.25% err |    0.5 gso/s, 0.25% err |    0.4 gso/s, 0.25% err | 11.5 gso/s, 0.24% err |
-| `e5m2` |                       — |     0.7 gso/s, 4.6% err |     0.5 gso/s, 4.6% err |     7.1 gso/s, 0% err |
-| `i8`   | 1.1 gso/s, __overflow__ | 0.5 gso/s, __overflow__ | 0.5 gso/s, __overflow__ |    14.8 gso/s, 0% err |
 A fair objection: PyTorch and JAX are designed for throughput, not single-call latency.
 They lower execution graphs through [XLA](https://openxla.org/) or vendored BLAS libraries like [Intel MKL](https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl.html) and Nvidia [cuBLAS](https://developer.nvidia.com/cublas).
 So here's the same comparison on a throughput-oriented workload — matrix multiplication:
+| Input  |      NumPy + OpenBLAS |         PyTorch + MKL |                    JAX |              NumKong |
+| :----- | --------------------: | --------------------: | ---------------------: | -------------------: |
+|        |        ░░░░░░░░░░░░░░ |        ░░░░░░░░░░░░░░ |         ░░░░░░░░░░░░░░ |       ░░░░░░░░░░░░░░ |
+| `f64`  | 65.5 gso/s, 1e-15 err | 68.2 gso/s, 1e-15 err | ~14.3 gso/s, 1e-15 err | 8.6 gso/s, 1e-16 err |
+| `f32`  |   140 gso/s, 9e-7 err |   145 gso/s, 1e-6 err |  ~60.5 gso/s, 1e-6 err | 37.7 gso/s, 4e-7 err |
+| `bf16` |                     — |   851 gso/s, 1.8% err |  ~25.8 gso/s, 3.4% err |  458 gso/s, 3.6% err |
+| `f16`  |  0.3 gso/s, 0.25% err |  140 gso/s, 0.37% err | ~26.1 gso/s, 0.35% err | 103 gso/s, 0.26% err |
+| `e5m2` |                     — |   0.4 gso/s, 4.6% err |  ~26.4 gso/s, 4.6% err |    398 gso/s, 0% err |
+| `i8`   |   0.4 gso/s, overflow |  50.0 gso/s, overflow |   ~0.0 gso/s, overflow |   1279 gso/s, 0% err |
 > Matrix multiplication (2048 × 2048) × (2048 × 2048) on Intel Sapphire Rapids, single-threaded.
 > gso/s = Giga Scalar Operations per Second, same format.
 > NumPy 2.4, PyTorch 2.10, JAX 0.9, same versions.
-| Input  |        NumPy + OpenBLAS |            PyTorch + MKL |                      JAX |              NumKong |
-| :----- | ----------------------: | -----------------------: | -----------------------: | -------------------: |
-|        |          ░░░░░░░░░░░░░░ |           ░░░░░░░░░░░░░░ |           ░░░░░░░░░░░░░░ |       ░░░░░░░░░░░░░░ |
-| `f64`  |   65.5 gso/s, 1e-15 err |    68.2 gso/s, 1e-15 err |   ~14.3 gso/s, 1e-15 err | 8.6 gso/s, 1e-16 err |
-| `f32`  |     140 gso/s, 9e-7 err |      145 gso/s, 1e-6 err |    ~60.5 gso/s, 1e-6 err | 37.7 gso/s, 4e-7 err |
-| `bf16` |                       — |      851 gso/s, 1.8% err |    ~25.8 gso/s, 3.4% err |  458 gso/s, 3.6% err |
-| `f16`  |    0.3 gso/s, 0.25% err |     140 gso/s, 0.37% err |   ~26.1 gso/s, 0.35% err | 103 gso/s, 0.26% err |
-| `e5m2` |                       — |      0.4 gso/s, 4.6% err |    ~26.4 gso/s, 4.6% err |    398 gso/s, 0% err |
-| `i8`   | 0.4 gso/s, __overflow__ | 50.0 gso/s, __overflow__ | ~0.0 gso/s, __overflow__ |   1279 gso/s, 0% err |
 For `f64`, compensated "Dot2" summation reduces error by 10–50× compared to naive Float64 accumulation, depending on vector length.
 For `f32`, widening to Float64 gives 5–10× lower error.
 The library ships as a relatively small binary:
@@ -77,27 +77,27 @@ NumKong covers 17 numeric types — from 6-bit floats to 64-bit complex numbers
 ### Language Bindings
-| Operation                   | [C and C++][c] | [Python][py] | [Rust][rs] | [JavaScript][js] | [Swift][swift] | [GoLang][go] |
-| :-------------------------- | :------------: | :----------: | :--------: | :--------------: | :------------: | :----------: |
-| __Vector Ops__              |                |              |            |                  |                |              |
-| [Dot] Product               |       ●        |      ●       |     ●      |        ●         |       ●        |      ●       |
-| [Spatial] Metric            |       ●        |      ●       |     ●      |        ●         |       ●        |      ●       |
-| [Set] Similarity            |       ●        |      ●       |     ●      |        ●         |       ●        |      ●       |
-| [Geo]spatial                |       ●        |      ●       |     ●      |        ·         |       ●        |      ●       |
-| [Mesh] Alignment            |       ●        |      ●       |     ●      |        ·         |       ·        |      ·       |
-| [Sparse] Products           |       ●        |      ●       |     ●      |        ·         |       ·        |      ·       |
-| [Probability] Divergences   |       ●        |      ●       |     ●      |        ●         |       ·        |      ●       |
-| [Curved] Spaces             |       ●        |      ●       |     ●      |        ·         |       ·        |      ·       |
-| __Many-to-Many Vector Ops__ |                |              |            |                  |                |              |
-| "[Dots]" Products           |       ●        |      ●       |     ●      |        ●         |       ●        |      ●       |
-| "[Spatials]" Metrics        |       ●        |      ●       |     ●      |        ●         |       ●        |      ●       |
-| "[Sets]" Similarities       |       ●        |      ●       |     ●      |        ·         |       ●        |      ●       |
-| [MaxSim] Scoring            |       ●        |      ●       |     ●      |        ·         |       ●        |      ●       |
-| __Scalar Ops__              |                |              |            |                  |                |              |
-| [Cast]                      |       ●        |      ●       |     ●      |        ●         |       ·        |      ·       |
-| [Reduce]                    |       ●        |      ●       |     ●      |        ·         |       ·        |      ·       |
-| [Each]                      |       ●        |      ●       |     ●      |        ·         |       ·        |      ·       |
-| [Trigonometry]              |       ●        |      ●       |     ●      |        ·         |       ·        |      ·       |
+| Operation                   | [C 99 & C++ 23][c] | [Python][py] | [Rust][rs] | [JavaScript][js] | [Swift][swift] | [GoLang][go] |
+| :-------------------------- | :----------------: | :----------: | :--------: | :--------------: | :------------: | :----------: |
+| __Vector Ops__              |                    |              |            |                  |                |              |
+| [Dot] Product               |         ●          |      ●       |     ●      |        ●         |       ●        |      ●       |
+| [Spatial] Metric            |         ●          |      ●       |     ●      |        ●         |       ●        |      ●       |
+| [Set] Similarity            |         ●          |      ●       |     ●      |        ●         |       ●        |      ●       |
+| [Geo]spatial                |         ●          |      ●       |     ●      |        ·         |       ●        |      ●       |
+| [Mesh] Alignment            |         ●          |      ●       |     ●      |        ·         |       ·        |      ·       |
+| [Sparse] Products           |         ●          |      ●       |     ●      |        ·         |       ·        |      ·       |
+| [Probability] Divergences   |         ●          |      ●       |     ●      |        ●         |       ·        |      ●       |
+| [Curved] Spaces             |         ●          |      ●       |     ●      |        ·         |       ·        |      ·       |
+| __Many-to-Many Vector Ops__ |                    |              |            |                  |                |              |
+| "[Dots]" Products           |         ●          |      ●       |     ●      |        ●         |       ●        |      ●       |
+| "[Spatials]" Metrics        |         ●          |      ●       |     ●      |        ●         |       ●        |      ●       |
+| "[Sets]" Similarities       |         ●          |      ●       |     ●      |        ·         |       ●        |      ●       |
+| [MaxSim] Scoring            |         ●          |      ●       |     ●      |        ·         |       ●        |      ●       |
+| __Scalar Ops__              |                    |              |            |                  |                |              |
+| [Cast]                      |         ●          |      ●       |     ●      |        ●         |       ·        |      ·       |
+| [Reduce]                    |         ●          |      ●       |     ●      |        ·         |       ·        |      ·       |
+| [Each]                      |         ●          |      ●       |     ●      |        ·         |       ·        |      ·       |
+| [Trigonometry]              |         ●          |      ●       |     ●      |        ·         |       ·        |      ·       |
 [Dot]: include/numkong/dot/README.md
 [Dots]: include/numkong/dots/README.md
@@ -392,16 +392,16 @@ On x86, older CPUs use __F16C extensions__ (Ivy Bridge+) for fast Float16 → Fl
 On Arm, ARMv8.4-A adds __FMLAL/FMLAL2__ instructions for fused Float16 → Float32 widening multiply-accumulate, reducing the total latency from 7 cycles to 4 cycles and achieving 20–48% speedup over the separate convert-then-FMA path.
 | Platform               | BFloat16 Path              | Elem/Op | Float16 Path           | Elem/Op |
-| ---------------------- | -------------------------- | ------: | ---------------------- | ------: |
+| :--------------------- | :------------------------- | ------: | :--------------------- | ------: |
 | __x86__                |                            |         |                        |         |
-| Diamond Rapids (2025)  | ↓ Genoa                    |      32 | `VDPPHPS` widening dot |      32 |
+| Diamond Rapids (2026)  | ↓ Genoa                    |      32 | `VDPPHPS` widening dot |      32 |
 | Sapphire Rapids (2023) | ↓ Genoa                    |      32 | ↓ Skylake              |      16 |
 | Genoa (2022)           | `VDPBF16PS` widening dot   |      32 | ↓ Skylake              |      16 |
 | Skylake (2015)         | `SLLI` + `VFMADD`          |      16 | `VCVTPH2PS` + `VFMADD` |      16 |
 | Haswell (2013)         | `SLLI` + `VFMADD`          |       8 | `VCVTPH2PS` + `VFMADD` |       8 |
 | __Arm__                |                            |         |                        |         |
-| Graviton 3 (2021)      | `SVBFDOT` widening dot     |    4–32 | `SVCVT` → `SVFMLA`     |    4–32 |
 | Apple M2+ (2022)       | `BFDOT` widening dot       |       8 | ↓ FP16FML              |       8 |
+| Graviton 3+ (2021)     | `SVBFDOT` widening dot     |    4–32 | `SVCVT` → `SVFMLA`     |    4–32 |
 | Apple M1 (2020)        | ↓ NEON                     |       8 | `FMLAL` widening FMA   |       8 |
 | Graviton 2 (2019)      | ↓ NEON                     |       8 | `FCVTL` + `FMLA`       |       4 |
 | Graviton 1 (2018)      | `SHLL` + `FMLA`            |       8 | bit-manip → `FMLA`     |       8 |
@@ -420,14 +420,14 @@ On Arm, ARMv8.4-A adds __FMLAL/FMLAL2__ instructions for fused Float16 → Float
 ### Mini-Floats: E4M3, E5M2, E3M2, & E2M3
-| Format                    |  Bits |  Range | NumKong Promotion Rules                         | Support in GPUs   |
-| ------------------------- | ----: | -----: | ----------------------------------------------- | ----------------- |
-| E5M2FN                    |     8 | ±57344 | BFloat16 → Float32                              | H100+, MI300+     |
-| E4M3FN                    |     8 |   ±448 | BFloat16 → Float32                              | H100+, MI300+     |
-| E3M2FN                    | 6 → 8 |    ±28 | BFloat16 & Float16 → Float32,<br/>Int16 → Int32 | only block-scaled |
-| E2M3FN                    | 6 → 8 |   ±7.5 | BFloat16 & Float16 → Float32,<br/>Int8 → Int32  | only block-scaled |
-| Block-scaled NVFP4        |     4 |     ±6 | —                                               | B200+             |
-| Block-scaled MXFP4 / E2M1 |     4 |     ±6 | —                                               | B200+, MI325+     |
+| Format       |  Bits |  Range | NumKong Promotion Rules               | Support in GPUs   |
+| :----------- | ----: | -----: | ------------------------------------- | ----------------- |
+| E5M2FN       |     8 | ±57344 | BFloat16 → Float32                    | H100+, MI300+     |
+| E4M3FN       |     8 |   ±448 | BFloat16 → Float32                    | H100+, MI300+     |
+| E3M2FN       | 6 → 8 |    ±28 | B- & Float16 → Float32, Int16 → Int32 | only block-scaled |
+| E2M3FN       | 6 → 8 |   ±7.5 | B- & Float16 → Float32, Int8 → Int32  | only block-scaled |
+| Scaled NVFP4 |     4 |     ±6 | —                                     | B200+             |
+| Scaled MXFP4 |     4 |     ±6 | —                                     | B200+, MI325+     |
 > __Block scaling.__
 > NumKong does not implement block-scaled variants (MXFP4, NVFP4, or block-scaled E3M2/E2M3).
@@ -444,22 +444,22 @@ E4M3FN (no infinities, NaN only) is preferred for __training__ where precision n
 On x86 Genoa/Sapphire Rapids, E4M3/E5M2 values upcast to BFloat16 via lookup tables, then use native __DPBF16PS__ for 2-per-lane dot products accumulating to Float32.
 On Arm Graviton 3+, the same BFloat16 upcast happens via NEON table lookups, then __BFDOT__ instructions complete the computation.
-| Platform                   | E5M2 Path                      | Elem/Op | E4M3 Path                      | Elem/Op |
-| -------------------------- | ------------------------------ | ------: | ------------------------------ | ------: |
-| __x86__                    |                                |         |                                |         |
-| Diamond Rapids (2025)      | `VCVTBF82PH` → F16 + `VDPPHPS` |      32 | `VCVTHF82PH` → F16 + `VDPPHPS` |      32 |
-| Genoa (2022)               | → BF16 + `VDPBF16PS`           |      32 | ↓ Ice Lake                     |      64 |
-| Ice Lake (2019)            | ↓ Skylake                      |      16 | octave LUT + `VPDPBUSD`        |      64 |
-| Skylake (2015)             | rebias → F32 FMA               |      16 | rebias → F32 FMA               |      16 |
-| Haswell (2013)             | rebias → F32 FMA               |       8 | rebias → F32 FMA               |       8 |
-| __Arm__                    |                                |         |                                |         |
-| NEON + FP8DOT (Olympus)    | native `FDOT`                  |      16 | native `FDOT`                  |      16 |
-| NEON + FP16FML (Apple M1+) | SHL → F16 + `FMLAL`            |      16 | LUT → F16 + `FMLAL`            |      16 |
-| NEON (Graviton 1+)         | SHL + `FCVTL` + FMA            |       8 | → F16 + `FCVTL` + FMA          |       8 |
-| __RISC-V__                 |                                |         |                                |         |
-| RVV + Zvfbfwma             | rebias → BF16 + `VFWMACCBF16`  |    4–32 | LUT → BF16 + `VFWMACCBF16`     |    4–32 |
-| RVV + Zvfh                 | SHL → F16 + `VFWMACC`          |    4–32 | LUT → F16 + `VFWMACC`          |    4–32 |
-| RVV                        | rebias → F32 + `VFMACC`        |    4–32 | LUT → F32 + `VFMACC`           |    4–32 |
+| Platform              | E5M2 Path                      | Elem/Op | E4M3 Path                      | Elem/Op |
+| :-------------------- | :----------------------------- | ------: | :----------------------------- | ------: |
+| __x86__               |                                |         |                                |         |
+| Diamond Rapids (2026) | `VCVTBF82PH` → F16 + `VDPPHPS` |      32 | `VCVTHF82PH` → F16 + `VDPPHPS` |      32 |
+| Genoa (2022)          | → BF16 + `VDPBF16PS`           |      32 | ↓ Ice Lake                     |      64 |
+| Ice Lake (2019)       | ↓ Skylake                      |      16 | octave LUT + `VPDPBUSD`        |      64 |
+| Skylake (2015)        | rebias → F32 FMA               |      16 | rebias → F32 FMA               |      16 |
+| Haswell (2013)        | rebias → F32 FMA               |       8 | rebias → F32 FMA               |       8 |
+| __Arm__               |                                |         |                                |         |
+| NEON + FP8DOT (2026)  | native `FDOT`                  |      16 | native `FDOT`                  |      16 |
+| NEON + FP16FML (2020) | SHL → F16 + `FMLAL`            |      16 | LUT → F16 + `FMLAL`            |      16 |
+| NEON (2018)           | SHL + `FCVTL` + FMA            |       8 | → F16 + `FCVTL` + FMA          |       8 |
+| __RISC-V__            |                                |         |                                |         |
+| RVV + Zvfbfwma        | rebias → BF16 + `VFWMACCBF16`  |    4–32 | LUT → BF16 + `VFWMACCBF16`     |    4–32 |
+| RVV + Zvfh            | SHL → F16 + `VFWMACC`          |    4–32 | LUT → F16 + `VFWMACC`          |    4–32 |
+| RVV                   | rebias → F32 + `VFMACC`        |    4–32 | LUT → F32 + `VFMACC`           |    4–32 |
 > E5M2 shares Float16's exponent bias (15), so E5M2 → Float16 conversion is a single left-shift by 8 bits (`SHL 8`).
 > E4M3 on Ice Lake uses "octave decomposition": the 4-bit exponent splits into 2 octave + 2 remainder bits, yielding 7 integer accumulators post-scaled by powers of 2.
@@ -469,20 +469,20 @@ Their smaller range allows scaling to exact integers that fit in `i8`/`i16`, ena
 Float16 can also serve as an accumulator, accurately representing ~50 products of E3M2FN pairs or ~20 products of E2M3FN pairs before overflow.
 On Arm, NEON FHM extensions bring widening `FMLAL` dot-products for Float16 — both faster and more widely available than `BFDOT` for BFloat16.
-| Platform                     | E3M2 Path                  | Elem/Op | E2M3 Path                    | Elem/Op |
-| ---------------------------- | -------------------------- | ------: | ---------------------------- | ------: |
-| __x86__                      |                            |         |                              |         |
-| Ice Lake (2019)              | `VPERMW` LUT + `VPMADDWD`  |      32 | `VPERMB` LUT + `VPDPBUSD`    |      64 |
-| Sierra Forest (2024)         | ↓ Haswell                  |      32 | `VPSHUFB` LUT + `VPDPBSSD`   |      32 |
-| Alder Lake (2021)            | ↓ Haswell                  |      32 | `VPSHUFB` LUT + `VPDPBUSD`   |      32 |
-| Skylake (2015)               | `VPSHUFB` LUT + `VPMADDWD` |      64 | `VPSHUFB` LUT + `VPMADDUBSW` |      64 |
-| Haswell (2013)               | `VPSHUFB` LUT + `VPMADDWD` |      32 | `VPSHUFB` LUT + `VPMADDUBSW` |      32 |
-| __Arm__                      |                            |         |                              |         |
-| NEON + FP8DOT (Olympus)      | → E5M2 + `FDOT`            |      16 | → E4M3 + `FDOT`              |      16 |
-| NEON + DotProd (Graviton 2+) | `VQTBL2` LUT + `SMLAL`     |      16 | `VQTBL2` LUT + `SDOT`        |      16 |
-| NEON (Graviton 1+)           | → F16 + `FCVTL` + FMA      |      16 | → F16 + `FCVTL` + FMA        |      16 |
-| __RISC-V__                   |                            |         |                              |         |
-| RVV                          | I16 gather LUT + `VWMACC`  |    4–32 | U8 gather LUT + `VWMACC`     |    4–32 |
+| Platform              | E3M2 Path                  | Elem/Op | E2M3 Path                    | Elem/Op |
+| :-------------------- | :------------------------- | ------: | :--------------------------- | ------: |
+| __x86__               |                            |         |                              |         |
+| Sierra Forest (2024)  | ↓ Haswell                  |      32 | `VPSHUFB` LUT + `VPDPBSSD`   |      32 |
+| Alder Lake (2021)     | ↓ Haswell                  |      32 | `VPSHUFB` LUT + `VPDPBUSD`   |      32 |
+| Ice Lake (2019)       | `VPERMW` LUT + `VPMADDWD`  |      32 | `VPERMB` LUT + `VPDPBUSD`    |      64 |
+| Skylake (2015)        | `VPSHUFB` LUT + `VPMADDWD` |      64 | `VPSHUFB` LUT + `VPMADDUBSW` |      64 |
+| Haswell (2013)        | `VPSHUFB` LUT + `VPMADDWD` |      32 | `VPSHUFB` LUT + `VPMADDUBSW` |      32 |
+| __Arm__               |                            |         |                              |         |
+| NEON + FP8DOT (2026)  | → E5M2 + `FDOT`            |      16 | → E4M3 + `FDOT`              |      16 |
+| NEON + DotProd (2019) | `VQTBL2` LUT + `SMLAL`     |      16 | `VQTBL2` LUT + `SDOT`        |      16 |
+| NEON (2018)           | → F16 + `FCVTL` + FMA      |      16 | → F16 + `FCVTL` + FMA        |      16 |
+| __RISC-V__            |                            |         |                              |         |
+| RVV                   | I16 gather LUT + `VWMACC`  |    4–32 | U8 gather LUT + `VWMACC`     |    4–32 |
 > E3M2/E2M3 values map to exact integers via 32-entry LUTs (magnitudes up to 448 for E3M2, 120 for E2M3), enabling integer accumulation with no rounding error.
 > On NEON + FP8DOT, E3M2 is first promoted to E5M2 and E2M3 to E4M3 before the hardware `FDOT` instruction.
@@ -494,7 +494,7 @@ E5M2's range (±57,344) makes the scaled product exceed Int32 entirely.
 Without the integer path, E5M2 falls back to Float32 accumulation — where its [2-bit mantissa (only 4 values per binade)](https://developer.nvidia.com/blog/floating-point-8-an-introduction-to-efficient-lower-precision-ai-training/) creates a [catastrophic cancellation risk](https://www.ac.uma.es/arith2024/papers/Fused%20FP8%204-Way%20Dot%20Product%20with%20Scaling%20and%20FP32%20Accumulation.pdf) that E2M3's integer path avoids completely:
 |         |  _i_ = 0 | _i_ = 1 |  _i_ = 2 |   _i_ = 3 |  _i_ = 4 |  _i_ = 5 |  _i_ = 6 |
-| ------- | -------: | ------: | -------: | --------: | -------: | -------: | -------: |
+| :------ | -------: | ------: | -------: | --------: | -------: | -------: | -------: |
 | _aᵢ_    |  0.00122 |   20480 | −0.00122 |       1.5 |    −3072 |     −640 |  0.00146 |
 | _bᵢ_    |      −40 |     320 |    −1280 |  −7.63e⁻⁵ | 0.000427 |    10240 | −4.58e⁻⁵ |
 | _aᵢ·bᵢ_ | −0.04883 | 6553600 |   1.5625 | −0.000114 |  −1.3125 | −6553600 |      ≈ 0 |

package/c/numkong.c CHANGED Viewed

@@ -943,7 +943,7 @@ BOOL WINAPI DllMain(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpReserved) {
 // carries ZA state. So __arm_tpidr2_save is always a no-op and
 // __arm_tpidr2_restore has nothing to restore.
 // Weak linkage lets a real compiler-rt override these if available.
-#if NK_TARGET_ARM_ && NK_TARGET_SME
+#if NK_TARGET_ARM64_ && NK_TARGET_SME
 __attribute__((weak, visibility("default"))) void __arm_tpidr2_save(void) {}
 __attribute__((weak, visibility("default"))) void __arm_tpidr2_restore(void *blk) { nk_unused_(blk); }
 #endif

package/include/numkong/attention/sapphireamx.h CHANGED Viewed

@@ -62,7 +62,7 @@
 #ifndef NK_ATTENTION_SAPPHIREAMX_H
 #define NK_ATTENTION_SAPPHIREAMX_H
-#if NK_TARGET_X86_
+#if NK_TARGET_X8664_
 #if NK_TARGET_SAPPHIREAMX
 #include "numkong/types.h"
@@ -1359,5 +1359,5 @@ NK_PUBLIC void nk_attention_causal_bf16_sapphireamx(nk_bf16_t const *q, void con
 #endif
 #endif // NK_TARGET_SAPPHIREAMX
-#endif // NK_TARGET_X86_
+#endif // NK_TARGET_X8664_
 #endif // NK_ATTENTION_SAPPHIREAMX_H

package/include/numkong/attention/sme.h CHANGED Viewed

@@ -91,7 +91,7 @@
 #ifndef NK_ATTENTION_SME_H
 #define NK_ATTENTION_SME_H
-#if NK_TARGET_ARM_
+#if NK_TARGET_ARM64_
 #if NK_TARGET_SME
 #include "numkong/types.h"
@@ -2068,5 +2068,5 @@ NK_PUBLIC void nk_attention_causal_f16_sme(nk_f16_t const *q, void const *kv_pac
 #endif
 #endif // NK_TARGET_SME
-#endif // NK_TARGET_ARM_
+#endif // NK_TARGET_ARM64_
 #endif // NK_ATTENTION_SME_H

package/include/numkong/capabilities.h CHANGED Viewed

@@ -96,7 +96,7 @@
 #define NK_VERSION_MAJOR 7
 #define NK_VERSION_MINOR 4
-#define NK_VERSION_PATCH 2
+#define NK_VERSION_PATCH 3
 /**
  *  @brief  Removes compile-time dispatching, and replaces it with runtime dispatching.
@@ -132,33 +132,33 @@
 // With `-std=c11` glibc hides `syscall()` behind `_GNU_SOURCE`, but if any
 // system header was included before us, `<features.h>` is already locked.
 // Forward-declare `syscall` directly — it always exists in glibc.
-#if defined(NK_DEFINED_LINUX_) && (NK_TARGET_X86_ || NK_TARGET_RISCV_)
+#if defined(NK_DEFINED_LINUX_) && (NK_TARGET_X8664_ || NK_TARGET_RISCV64_)
 #include <sys/syscall.h> // `SYS_arch_prctl`, `SYS_riscv_hwprobe`
 #ifdef __cplusplus
 extern "C" long syscall(long, ...) noexcept;
 #else
 extern long syscall(long, ...);
 #endif
-#if NK_TARGET_RISCV_
+#if NK_TARGET_RISCV64_
 #include <sys/auxv.h> // `getauxval`, `AT_HWCAP`
 #endif
 #endif
-#if defined(NK_DEFINED_LINUX_) && NK_TARGET_LOONGARCH_
+#if defined(NK_DEFINED_LINUX_) && NK_TARGET_LOONGARCH64_
 #include <sys/auxv.h> // `getauxval`, `AT_HWCAP`
 #endif
-#if defined(NK_DEFINED_LINUX_) && NK_TARGET_POWER_
+#if defined(NK_DEFINED_LINUX_) && NK_TARGET_POWER64_
 #include <sys/auxv.h> // `getauxval`, `AT_HWCAP`
 #endif
 // On FreeBSD RISC-V, we use elf_aux_info for capability detection
-#if defined(NK_DEFINED_FREEBSD_) && NK_TARGET_RISCV_
+#if defined(NK_DEFINED_FREEBSD_) && NK_TARGET_RISCV64_
 #include <sys/auxv.h> // `elf_aux_info`, `AT_HWCAP`
 #endif
 // On Windows ARM, we use IsProcessorFeaturePresent API for capability detection
-#if defined(NK_DEFINED_WINDOWS_) && NK_TARGET_ARM_
+#if defined(NK_DEFINED_WINDOWS_) && NK_TARGET_ARM64_
 #include <processthreadsapi.h> // `IsProcessorFeaturePresent`
 #endif
@@ -388,7 +388,7 @@ typedef void (*nk_kernel_cast_punned_t)(void const *from, nk_dtype_t from_type,
 typedef void (*nk_kernel_punned_t)(void *);
-#if NK_TARGET_X86_
+#if NK_TARGET_X8664_
 NK_PUBLIC int nk_configure_thread_x86_(nk_capability_t capabilities) {
 #if NK_TARGET_SAPPHIREAMX
@@ -409,7 +409,7 @@ NK_PUBLIC int nk_configure_thread_x86_(nk_capability_t capabilities) {
     return 1;
 }
-NK_PUBLIC nk_capability_t nk_capabilities_x86_(void) {
+NK_PUBLIC nk_capability_t nk_capabilities_x8664_(void) {
     union four_registers_t {
         int array[4];
         struct separate_t {
@@ -496,9 +496,9 @@ NK_PUBLIC nk_capability_t nk_capabilities_x86_(void) {
                              (nk_cap_graniteamx_k * supports_graniteamx) | (nk_cap_serial_k));
 }
-#endif // NK_TARGET_X86_
+#endif // NK_TARGET_X8664_
-#if NK_TARGET_ARM_
+#if NK_TARGET_ARM64_
 #if defined(__clang__)
 #pragma clang attribute push(__attribute__((target("arch=armv8.5-a+sve"))), apply_to = function)
@@ -508,14 +508,14 @@ NK_PUBLIC nk_capability_t nk_capabilities_x86_(void) {
 #endif
 #if NK_HAS_POSIX_EXTENSIONS_
-static sigjmp_buf nk_mrs_test_jump_buffer_;
-static void nk_mrs_test_sigill_handler_(int sig) {
+static sigjmp_buf nk_mrs_arm64_jump_buffer_;
+static void nk_mrs_arm64_sigill_handler_(int sig) {
     nk_unused_(sig);
-    siglongjmp(nk_mrs_test_jump_buffer_, 1);
+    siglongjmp(nk_mrs_arm64_jump_buffer_, 1);
 }
 #endif
-NK_PUBLIC int nk_configure_thread_arm_(nk_capability_t capabilities) {
+NK_PUBLIC int nk_configure_thread_arm64_(nk_capability_t capabilities) {
 #if defined(_MSC_VER)
     nk_unused_(capabilities);
     return 1;
@@ -546,7 +546,7 @@ NK_PUBLIC int nk_configure_thread_arm_(nk_capability_t capabilities) {
 #elif defined(NK_DEFINED_LINUX_) || defined(NK_DEFINED_FREEBSD_)
     // Read ID registers via MRS. Only safe if MRS is known to work — indicated by
-    // capabilities beyond basic NEON (nk_capabilities_arm_ validated MRS via sigaction probe).
+    // capabilities beyond basic NEON (nk_capabilities_arm64_ validated MRS via sigaction probe).
     if (capabilities & ~(nk_cap_neon_k | nk_cap_serial_k)) {
         // FEAT_EBF16: ID_AA64ISAR1_EL1.BF16 bits [47:44] >= 0b0010
         register unsigned long isar1_val __asm__("x0");
@@ -570,7 +570,7 @@ NK_PUBLIC int nk_configure_thread_arm_(nk_capability_t capabilities) {
 #endif // _MSC_VER
 }
-NK_PUBLIC nk_capability_t nk_capabilities_arm_(void) {
+NK_PUBLIC nk_capability_t nk_capabilities_arm64_(void) {
 #if defined(NK_DEFINED_APPLE_)
     size_t size = sizeof(unsigned);
     unsigned supports_neon = 0, supports_fp16 = 0, supports_fhm = 0, supports_bf16 = 0, supports_i8mm = 0;
@@ -602,13 +602,13 @@ NK_PUBLIC nk_capability_t nk_capabilities_arm_(void) {
 #if NK_HAS_POSIX_EXTENSIONS_
     struct sigaction action_new, action_old;
-    action_new.sa_handler = nk_mrs_test_sigill_handler_;
+    action_new.sa_handler = nk_mrs_arm64_sigill_handler_;
     sigemptyset(&action_new.sa_mask);
     action_new.sa_flags = 0;
     int mrs_works = 0;
     if (sigaction(SIGILL, &action_new, &action_old) == 0) {
-        if (sigsetjmp(nk_mrs_test_jump_buffer_, 1) == 0) {
+        if (sigsetjmp(nk_mrs_arm64_jump_buffer_, 1) == 0) {
             register unsigned long midr_value __asm__("x0");
             __asm__ __volatile__(".inst 0xD5380000" : "=r"(midr_value)); // MRS x0, MIDR_EL1
             mrs_works = 1;
@@ -722,11 +722,11 @@ NK_PUBLIC nk_capability_t nk_capabilities_arm_(void) {
 #pragma GCC pop_options
 #endif
-#endif // NK_TARGET_ARM_
+#endif // NK_TARGET_ARM64_
-#if NK_TARGET_RISCV_
+#if NK_TARGET_RISCV64_
-NK_PUBLIC nk_capability_t nk_capabilities_riscv_(void) {
+NK_PUBLIC nk_capability_t nk_capabilities_riscv64_(void) {
 #if defined(NK_DEFINED_LINUX_)
     unsigned long hwcap = getauxval(AT_HWCAP);
     nk_capability_t caps = nk_cap_serial_k;
@@ -758,11 +758,11 @@ NK_PUBLIC nk_capability_t nk_capabilities_riscv_(void) {
 #endif
 }
-#endif // NK_TARGET_RISCV_
+#endif // NK_TARGET_RISCV64_
-#if NK_TARGET_LOONGARCH_
+#if NK_TARGET_LOONGARCH64_
-NK_PUBLIC nk_capability_t nk_capabilities_loongarch_(void) {
+NK_PUBLIC nk_capability_t nk_capabilities_loongarch64_(void) {
 #if defined(NK_DEFINED_LINUX_)
     unsigned long hwcap = getauxval(AT_HWCAP);
     nk_capability_t caps = nk_cap_serial_k;
@@ -774,11 +774,11 @@ NK_PUBLIC nk_capability_t nk_capabilities_loongarch_(void) {
 #endif
 }
-#endif // NK_TARGET_LOONGARCH_
+#endif // NK_TARGET_LOONGARCH64_
-#if NK_TARGET_POWER_
+#if NK_TARGET_POWER64_
-NK_PUBLIC nk_capability_t nk_capabilities_power_(void) {
+NK_PUBLIC nk_capability_t nk_capabilities_power64_(void) {
 #if defined(NK_DEFINED_LINUX_)
     unsigned long hwcap = getauxval(AT_HWCAP);
     unsigned long hwcap2 = getauxval(AT_HWCAP2);
@@ -792,7 +792,7 @@ NK_PUBLIC nk_capability_t nk_capabilities_power_(void) {
 #endif
 }
-#endif // NK_TARGET_POWER_
+#endif // NK_TARGET_POWER64_
 #if NK_TARGET_WASM_
@@ -826,27 +826,27 @@ NK_PUBLIC nk_capability_t nk_capabilities_v128relaxed_(void) {
 #endif // NK_TARGET_WASM_
 NK_PUBLIC int nk_configure_thread_(nk_capability_t capabilities) {
-#if NK_TARGET_X86_
+#if NK_TARGET_X8664_
     return nk_configure_thread_x86_(capabilities);
 #endif
-#if NK_TARGET_ARM_
-    return nk_configure_thread_arm_(capabilities);
+#if NK_TARGET_ARM64_
+    return nk_configure_thread_arm64_(capabilities);
 #endif
     nk_unused_(capabilities);
     return 1; // success — no platform-specific thread configuration needed
 }
 NK_PUBLIC nk_capability_t nk_capabilities_(void) {
-#if NK_TARGET_X86_
-    return nk_capabilities_x86_();
-#elif NK_TARGET_ARM_
-    return nk_capabilities_arm_();
-#elif NK_TARGET_RISCV_
-    return nk_capabilities_riscv_();
-#elif NK_TARGET_LOONGARCH_
-    return nk_capabilities_loongarch_();
-#elif NK_TARGET_POWER_
-    return nk_capabilities_power_();
+#if NK_TARGET_X8664_
+    return nk_capabilities_x8664_();
+#elif NK_TARGET_ARM64_
+    return nk_capabilities_arm64_();
+#elif NK_TARGET_RISCV64_
+    return nk_capabilities_riscv64_();
+#elif NK_TARGET_LOONGARCH64_
+    return nk_capabilities_loongarch64_();
+#elif NK_TARGET_POWER64_
+    return nk_capabilities_power64_();
 #elif NK_TARGET_WASM_
     return nk_capabilities_v128relaxed_();
 #else
@@ -860,7 +860,7 @@ NK_PUBLIC nk_capability_t nk_capabilities_(void) {
  */
 NK_PUBLIC nk_capability_t nk_capabilities_compiled_(void) {
     nk_capability_t caps = nk_cap_serial_k;
-#if NK_TARGET_X86_
+#if NK_TARGET_X8664_
     caps |= nk_cap_haswell_k * NK_TARGET_HASWELL;
     caps |= nk_cap_skylake_k * NK_TARGET_SKYLAKE;
     caps |= nk_cap_icelake_k * NK_TARGET_ICELAKE;
@@ -873,7 +873,7 @@ NK_PUBLIC nk_capability_t nk_capabilities_compiled_(void) {
     caps |= nk_cap_alder_k * NK_TARGET_ALDER;
     caps |= nk_cap_sierra_k * NK_TARGET_SIERRA;
 #endif
-#if NK_TARGET_ARM_
+#if NK_TARGET_ARM64_
     caps |= nk_cap_neon_k * NK_TARGET_NEON;
     caps |= nk_cap_neonhalf_k * NK_TARGET_NEONHALF;
     caps |= nk_cap_neonsdot_k * NK_TARGET_NEONSDOT;
@@ -896,16 +896,16 @@ NK_PUBLIC nk_capability_t nk_capabilities_compiled_(void) {
     caps |= nk_cap_smelut2_k * NK_TARGET_SMELUT2;
     caps |= nk_cap_smefa64_k * NK_TARGET_SMEFA64;
 #endif
-#if NK_TARGET_RISCV_
+#if NK_TARGET_RISCV64_
     caps |= nk_cap_rvv_k * NK_TARGET_RVV;
     caps |= nk_cap_rvvhalf_k * NK_TARGET_RVVHALF;
     caps |= nk_cap_rvvbf16_k * NK_TARGET_RVVBF16;
     caps |= nk_cap_rvvbb_k * NK_TARGET_RVVBB;
 #endif
-#if NK_TARGET_LOONGARCH_
+#if NK_TARGET_LOONGARCH64_
     caps |= nk_cap_loongsonasx_k * NK_TARGET_LOONGSONASX;
 #endif
-#if NK_TARGET_POWER_
+#if NK_TARGET_POWER64_
     caps |= nk_cap_powervsx_k * NK_TARGET_POWERVSX;
 #endif
 #if NK_TARGET_WASM_

package/include/numkong/cast/diamond.h CHANGED Viewed

@@ -12,7 +12,7 @@
 #ifndef NK_CAST_DIAMOND_H
 #define NK_CAST_DIAMOND_H
-#if NK_TARGET_X86_
+#if NK_TARGET_X8664_
 #if NK_TARGET_DIAMOND
 #include "numkong/types.h"
@@ -60,5 +60,5 @@ NK_INTERNAL void nk_partial_load_e5m2x32_to_f16x32_diamond_(nk_e5m2_t const *src
 #endif
 #endif // NK_TARGET_DIAMOND
-#endif // NK_TARGET_X86_
+#endif // NK_TARGET_X8664_
 #endif // NK_CAST_DIAMOND_H

package/include/numkong/cast/haswell.h CHANGED Viewed

@@ -20,7 +20,7 @@
 #ifndef NK_CAST_HASWELL_H
 #define NK_CAST_HASWELL_H
-#if NK_TARGET_X86_
+#if NK_TARGET_X8664_
 #if NK_TARGET_HASWELL
 #include "numkong/types.h"
@@ -819,5 +819,5 @@ NK_PUBLIC void nk_cast_haswell(void const *from, nk_dtype_t from_type, nk_size_t
 #endif
 #endif // NK_TARGET_HASWELL
-#endif // NK_TARGET_X86_
+#endif // NK_TARGET_X8664_
 #endif // NK_CAST_HASWELL_H

package/include/numkong/cast/icelake.h CHANGED Viewed

@@ -19,7 +19,7 @@
 #ifndef NK_CAST_ICELAKE_H
 #define NK_CAST_ICELAKE_H
-#if NK_TARGET_X86_
+#if NK_TARGET_X8664_
 #if NK_TARGET_ICELAKE
 #include "numkong/types.h"
@@ -471,5 +471,5 @@ NK_PUBLIC void nk_cast_icelake(void const *from, nk_dtype_t from_type, nk_size_t
 #endif
 #endif // NK_TARGET_ICELAKE
-#endif // NK_TARGET_X86_
+#endif // NK_TARGET_X8664_
 #endif // NK_CAST_ICELAKE_H

package/include/numkong/cast/loongsonasx.h CHANGED Viewed

@@ -27,7 +27,7 @@
 #ifndef NK_CAST_LOONGSONASX_H
 #define NK_CAST_LOONGSONASX_H
-#if NK_TARGET_LOONGARCH_
+#if NK_TARGET_LOONGARCH64_
 #if NK_TARGET_LOONGSONASX
 #include "numkong/types.h"
@@ -248,5 +248,5 @@ NK_INTERNAL void nk_euclidean_through_u32_from_dot_loongsonasx_(nk_b128_vec_t do
 #endif
 #endif // NK_TARGET_LOONGSONASX
-#endif // NK_TARGET_LOONGARCH_
+#endif // NK_TARGET_LOONGARCH64_
 #endif // NK_CAST_LOONGSONASX_H

package/include/numkong/cast/neon.h CHANGED Viewed

@@ -49,7 +49,7 @@
 #ifndef NK_CAST_NEON_H
 #define NK_CAST_NEON_H
-#if NK_TARGET_ARM_
+#if NK_TARGET_ARM64_
 #if NK_TARGET_NEON
 #include "numkong/types.h"
@@ -1155,5 +1155,5 @@ NK_PUBLIC void nk_cast_neon(void const *from, nk_dtype_t from_type, nk_size_t n,
 #endif
 #endif // NK_TARGET_NEON
-#endif // NK_TARGET_ARM_
+#endif // NK_TARGET_ARM64_
 #endif // NK_CAST_NEON_H