numkong 7.4.2 → 7.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. package/README.md +84 -84
  2. package/c/numkong.c +1 -1
  3. package/include/numkong/attention/sapphireamx.h +2 -2
  4. package/include/numkong/attention/sme.h +2 -2
  5. package/include/numkong/capabilities.h +47 -47
  6. package/include/numkong/cast/diamond.h +2 -2
  7. package/include/numkong/cast/haswell.h +2 -2
  8. package/include/numkong/cast/icelake.h +2 -2
  9. package/include/numkong/cast/loongsonasx.h +2 -2
  10. package/include/numkong/cast/neon.h +2 -2
  11. package/include/numkong/cast/powervsx.h +2 -2
  12. package/include/numkong/cast/rvv.h +2 -2
  13. package/include/numkong/cast/sapphire.h +2 -2
  14. package/include/numkong/cast/skylake.h +2 -2
  15. package/include/numkong/curved/genoa.h +2 -2
  16. package/include/numkong/curved/haswell.h +2 -2
  17. package/include/numkong/curved/neon.h +2 -2
  18. package/include/numkong/curved/neonbfdot.h +2 -2
  19. package/include/numkong/curved/rvv.h +2 -2
  20. package/include/numkong/curved/skylake.h +2 -2
  21. package/include/numkong/curved/smef64.h +2 -2
  22. package/include/numkong/dot/alder.h +2 -2
  23. package/include/numkong/dot/diamond.h +2 -2
  24. package/include/numkong/dot/genoa.h +2 -2
  25. package/include/numkong/dot/haswell.h +2 -2
  26. package/include/numkong/dot/icelake.h +2 -2
  27. package/include/numkong/dot/loongsonasx.h +2 -2
  28. package/include/numkong/dot/neon.h +2 -2
  29. package/include/numkong/dot/neonbfdot.h +2 -2
  30. package/include/numkong/dot/neonfhm.h +2 -2
  31. package/include/numkong/dot/neonfp8.h +2 -2
  32. package/include/numkong/dot/neonsdot.h +2 -2
  33. package/include/numkong/dot/rvv.h +2 -2
  34. package/include/numkong/dot/rvvbb.h +2 -2
  35. package/include/numkong/dot/rvvbf16.h +2 -2
  36. package/include/numkong/dot/rvvhalf.h +2 -2
  37. package/include/numkong/dot/sapphire.h +2 -2
  38. package/include/numkong/dot/sierra.h +2 -2
  39. package/include/numkong/dot/skylake.h +2 -2
  40. package/include/numkong/dot/sve.h +2 -2
  41. package/include/numkong/dot/svebfdot.h +2 -2
  42. package/include/numkong/dot/svehalf.h +2 -2
  43. package/include/numkong/dot/svesdot.h +2 -2
  44. package/include/numkong/dots/alder.h +2 -2
  45. package/include/numkong/dots/diamond.h +2 -2
  46. package/include/numkong/dots/genoa.h +2 -2
  47. package/include/numkong/dots/haswell.h +2 -2
  48. package/include/numkong/dots/icelake.h +2 -2
  49. package/include/numkong/dots/loongsonasx.h +2 -2
  50. package/include/numkong/dots/neon.h +2 -2
  51. package/include/numkong/dots/neonbfdot.h +2 -2
  52. package/include/numkong/dots/neonfhm.h +2 -2
  53. package/include/numkong/dots/neonfp8.h +2 -2
  54. package/include/numkong/dots/neonsdot.h +2 -2
  55. package/include/numkong/dots/powervsx.h +2 -2
  56. package/include/numkong/dots/rvv.h +2 -2
  57. package/include/numkong/dots/sapphireamx.h +2 -2
  58. package/include/numkong/dots/sierra.h +2 -2
  59. package/include/numkong/dots/skylake.h +2 -2
  60. package/include/numkong/dots/sme.h +10 -10
  61. package/include/numkong/dots/smebi32.h +2 -2
  62. package/include/numkong/dots/smef64.h +2 -2
  63. package/include/numkong/dots/smehalf.h +2 -2
  64. package/include/numkong/each/haswell.h +2 -2
  65. package/include/numkong/each/icelake.h +2 -2
  66. package/include/numkong/each/neon.h +2 -2
  67. package/include/numkong/each/neonbfdot.h +2 -2
  68. package/include/numkong/each/neonhalf.h +2 -2
  69. package/include/numkong/each/rvv.h +2 -2
  70. package/include/numkong/each/sapphire.h +2 -2
  71. package/include/numkong/each/skylake.h +2 -2
  72. package/include/numkong/geospatial/haswell.h +2 -2
  73. package/include/numkong/geospatial/neon.h +2 -2
  74. package/include/numkong/geospatial/rvv.h +2 -2
  75. package/include/numkong/geospatial/skylake.h +2 -2
  76. package/include/numkong/maxsim/alder.h +2 -2
  77. package/include/numkong/maxsim/genoa.h +2 -2
  78. package/include/numkong/maxsim/haswell.h +2 -2
  79. package/include/numkong/maxsim/icelake.h +2 -2
  80. package/include/numkong/maxsim/neonsdot.h +2 -2
  81. package/include/numkong/maxsim/sapphireamx.h +2 -2
  82. package/include/numkong/maxsim/sme.h +2 -2
  83. package/include/numkong/mesh/haswell.h +2 -2
  84. package/include/numkong/mesh/neon.h +2 -2
  85. package/include/numkong/mesh/neonbfdot.h +2 -2
  86. package/include/numkong/mesh/rvv.h +2 -2
  87. package/include/numkong/mesh/skylake.h +2 -2
  88. package/include/numkong/numkong.h +1 -1
  89. package/include/numkong/probability/haswell.h +2 -2
  90. package/include/numkong/probability/neon.h +2 -2
  91. package/include/numkong/probability/rvv.h +2 -2
  92. package/include/numkong/probability/skylake.h +2 -2
  93. package/include/numkong/reduce/alder.h +2 -2
  94. package/include/numkong/reduce/genoa.h +2 -2
  95. package/include/numkong/reduce/haswell.h +2 -2
  96. package/include/numkong/reduce/icelake.h +2 -2
  97. package/include/numkong/reduce/neon.h +2 -2
  98. package/include/numkong/reduce/neonbfdot.h +2 -2
  99. package/include/numkong/reduce/neonfhm.h +2 -2
  100. package/include/numkong/reduce/neonsdot.h +2 -2
  101. package/include/numkong/reduce/rvv.h +2 -2
  102. package/include/numkong/reduce/sierra.h +2 -2
  103. package/include/numkong/reduce/skylake.h +2 -2
  104. package/include/numkong/scalar/haswell.h +2 -2
  105. package/include/numkong/scalar/loongsonasx.h +2 -2
  106. package/include/numkong/scalar/neon.h +2 -2
  107. package/include/numkong/scalar/neonhalf.h +2 -2
  108. package/include/numkong/scalar/powervsx.h +2 -2
  109. package/include/numkong/scalar/rvv.h +2 -2
  110. package/include/numkong/scalar/sapphire.h +2 -2
  111. package/include/numkong/set/haswell.h +2 -2
  112. package/include/numkong/set/icelake.h +2 -2
  113. package/include/numkong/set/loongsonasx.h +2 -2
  114. package/include/numkong/set/neon.h +2 -2
  115. package/include/numkong/set/powervsx.h +2 -2
  116. package/include/numkong/set/rvv.h +2 -2
  117. package/include/numkong/set/rvvbb.h +2 -2
  118. package/include/numkong/set/sve.h +2 -2
  119. package/include/numkong/sets/haswell.h +2 -2
  120. package/include/numkong/sets/icelake.h +2 -2
  121. package/include/numkong/sets/loongsonasx.h +2 -2
  122. package/include/numkong/sets/neon.h +2 -2
  123. package/include/numkong/sets/powervsx.h +2 -2
  124. package/include/numkong/sets/smebi32.h +2 -2
  125. package/include/numkong/sparse/icelake.h +2 -2
  126. package/include/numkong/sparse/neon.h +2 -2
  127. package/include/numkong/sparse/sve2.h +2 -2
  128. package/include/numkong/sparse/turin.h +2 -2
  129. package/include/numkong/spatial/alder.h +2 -2
  130. package/include/numkong/spatial/diamond.h +2 -2
  131. package/include/numkong/spatial/genoa.h +2 -2
  132. package/include/numkong/spatial/haswell.h +2 -2
  133. package/include/numkong/spatial/icelake.h +2 -2
  134. package/include/numkong/spatial/loongsonasx.h +2 -2
  135. package/include/numkong/spatial/neon.h +2 -2
  136. package/include/numkong/spatial/neonbfdot.h +2 -2
  137. package/include/numkong/spatial/neonfp8.h +2 -2
  138. package/include/numkong/spatial/neonsdot.h +2 -2
  139. package/include/numkong/spatial/powervsx.h +2 -2
  140. package/include/numkong/spatial/rvv.h +2 -2
  141. package/include/numkong/spatial/rvvbf16.h +2 -2
  142. package/include/numkong/spatial/rvvhalf.h +2 -2
  143. package/include/numkong/spatial/sierra.h +2 -2
  144. package/include/numkong/spatial/skylake.h +2 -2
  145. package/include/numkong/spatial/sve.h +2 -2
  146. package/include/numkong/spatial/svebfdot.h +2 -2
  147. package/include/numkong/spatial/svehalf.h +2 -2
  148. package/include/numkong/spatial/svesdot.h +2 -2
  149. package/include/numkong/spatials/alder.h +2 -2
  150. package/include/numkong/spatials/diamond.h +2 -2
  151. package/include/numkong/spatials/genoa.h +2 -2
  152. package/include/numkong/spatials/haswell.h +2 -2
  153. package/include/numkong/spatials/icelake.h +2 -2
  154. package/include/numkong/spatials/loongsonasx.h +2 -2
  155. package/include/numkong/spatials/neon.h +2 -2
  156. package/include/numkong/spatials/neonbfdot.h +2 -2
  157. package/include/numkong/spatials/neonfhm.h +2 -2
  158. package/include/numkong/spatials/neonfp8.h +2 -2
  159. package/include/numkong/spatials/neonsdot.h +2 -2
  160. package/include/numkong/spatials/powervsx.h +2 -2
  161. package/include/numkong/spatials/rvv.h +2 -2
  162. package/include/numkong/spatials/sapphireamx.h +2 -2
  163. package/include/numkong/spatials/sierra.h +2 -2
  164. package/include/numkong/spatials/skylake.h +2 -2
  165. package/include/numkong/spatials/sme.h +2 -2
  166. package/include/numkong/spatials/smef64.h +2 -2
  167. package/include/numkong/trigonometry/haswell.h +2 -2
  168. package/include/numkong/trigonometry/neon.h +2 -2
  169. package/include/numkong/trigonometry/rvv.h +2 -2
  170. package/include/numkong/trigonometry/skylake.h +2 -2
  171. package/include/numkong/types.h +88 -80
  172. package/package.json +7 -7
package/README.md CHANGED
@@ -10,39 +10,39 @@ Most libraries return dot products in the __same type as the input__ — Float16
10
10
  This leads to quiet overflow: a 2048-dimensional `i8` dot product can reach ±10 million, but `i8` maxes out at 127.
11
11
  NumKong promotes to wider accumulators — Float16 → Float32, BFloat16 → Float32, Int8 → Int32, Float32 → Float64 — so results stay in range.
12
12
 
13
- > Single 2048-d dot product on Intel [Sapphire Rapids](https://en.wikipedia.org/wiki/Sapphire_Rapids), single-threaded.
13
+ | Input | NumPy + OpenBLAS | PyTorch + MKL | JAX | NumKong |
14
+ | :----- | -------------------: | -------------------: | -------------------: | --------------------: |
15
+ | | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ |
16
+ | `f64` | 2.0 gso/s, 1e-15 err | 0.6 gso/s, 1e-15 err | 0.4 gso/s, 1e-14 err | 5.8 gso/s, 1e-16 err |
17
+ | `f32` | 1.5 gso/s, 2e-6 err | 0.6 gso/s, 2e-6 err | 0.4 gso/s, 5e-6 err | 7.1 gso/s, 2e-7 err |
18
+ | `bf16` | — | 0.5 gso/s, 1.9% err | 0.5 gso/s, 1.9% err | 9.7 gso/s, 1.8% err |
19
+ | `f16` | 0.2 gso/s, 0.25% err | 0.5 gso/s, 0.25% err | 0.4 gso/s, 0.25% err | 11.5 gso/s, 0.24% err |
20
+ | `e5m2` | — | 0.7 gso/s, 4.6% err | 0.5 gso/s, 4.6% err | 7.1 gso/s, 0% err |
21
+ | `i8` | 1.1 gso/s, overflow | 0.5 gso/s, overflow | 0.5 gso/s, overflow | 14.8 gso/s, 0% err |
22
+
23
+ > Single 2048-d dot product on Intel Sapphire Rapids, single-threaded.
14
24
  > Each cell shows __gso/s, mean relative error__ vs higher-precision reference.
15
25
  > gso/s = Giga Scalar Operations per Second — a more suitable name than GFLOP/s when counting both integer and floating-point work.
16
26
  > NumPy 2.4, PyTorch 2.10, JAX 0.9.
17
27
 
18
- | Input | NumPy + OpenBLAS | PyTorch + MKL | JAX | NumKong |
19
- | :----- | ----------------------: | ----------------------: | ----------------------: | --------------------: |
20
- | | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ |
21
- | `f64` | 2.0 gso/s, 1e-15 err | 0.6 gso/s, 1e-15 err | 0.4 gso/s, 1e-14 err | 5.8 gso/s, 1e-16 err |
22
- | `f32` | 1.5 gso/s, 2e-6 err | 0.6 gso/s, 2e-6 err | 0.4 gso/s, 5e-6 err | 7.1 gso/s, 2e-7 err |
23
- | `bf16` | — | 0.5 gso/s, 1.9% err | 0.5 gso/s, 1.9% err | 9.7 gso/s, 1.8% err |
24
- | `f16` | 0.2 gso/s, 0.25% err | 0.5 gso/s, 0.25% err | 0.4 gso/s, 0.25% err | 11.5 gso/s, 0.24% err |
25
- | `e5m2` | — | 0.7 gso/s, 4.6% err | 0.5 gso/s, 4.6% err | 7.1 gso/s, 0% err |
26
- | `i8` | 1.1 gso/s, __overflow__ | 0.5 gso/s, __overflow__ | 0.5 gso/s, __overflow__ | 14.8 gso/s, 0% err |
27
-
28
28
  A fair objection: PyTorch and JAX are designed for throughput, not single-call latency.
29
29
  They lower execution graphs through [XLA](https://openxla.org/) or vendored BLAS libraries like [Intel MKL](https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl.html) and Nvidia [cuBLAS](https://developer.nvidia.com/cublas).
30
30
  So here's the same comparison on a throughput-oriented workload — matrix multiplication:
31
31
 
32
+ | Input | NumPy + OpenBLAS | PyTorch + MKL | JAX | NumKong |
33
+ | :----- | --------------------: | --------------------: | ---------------------: | -------------------: |
34
+ | | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ |
35
+ | `f64` | 65.5 gso/s, 1e-15 err | 68.2 gso/s, 1e-15 err | ~14.3 gso/s, 1e-15 err | 8.6 gso/s, 1e-16 err |
36
+ | `f32` | 140 gso/s, 9e-7 err | 145 gso/s, 1e-6 err | ~60.5 gso/s, 1e-6 err | 37.7 gso/s, 4e-7 err |
37
+ | `bf16` | — | 851 gso/s, 1.8% err | ~25.8 gso/s, 3.4% err | 458 gso/s, 3.6% err |
38
+ | `f16` | 0.3 gso/s, 0.25% err | 140 gso/s, 0.37% err | ~26.1 gso/s, 0.35% err | 103 gso/s, 0.26% err |
39
+ | `e5m2` | — | 0.4 gso/s, 4.6% err | ~26.4 gso/s, 4.6% err | 398 gso/s, 0% err |
40
+ | `i8` | 0.4 gso/s, overflow | 50.0 gso/s, overflow | ~0.0 gso/s, overflow | 1279 gso/s, 0% err |
41
+
32
42
  > Matrix multiplication (2048 × 2048) × (2048 × 2048) on Intel Sapphire Rapids, single-threaded.
33
43
  > gso/s = Giga Scalar Operations per Second, same format.
34
44
  > NumPy 2.4, PyTorch 2.10, JAX 0.9, same versions.
35
45
 
36
- | Input | NumPy + OpenBLAS | PyTorch + MKL | JAX | NumKong |
37
- | :----- | ----------------------: | -----------------------: | -----------------------: | -------------------: |
38
- | | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░ |
39
- | `f64` | 65.5 gso/s, 1e-15 err | 68.2 gso/s, 1e-15 err | ~14.3 gso/s, 1e-15 err | 8.6 gso/s, 1e-16 err |
40
- | `f32` | 140 gso/s, 9e-7 err | 145 gso/s, 1e-6 err | ~60.5 gso/s, 1e-6 err | 37.7 gso/s, 4e-7 err |
41
- | `bf16` | — | 851 gso/s, 1.8% err | ~25.8 gso/s, 3.4% err | 458 gso/s, 3.6% err |
42
- | `f16` | 0.3 gso/s, 0.25% err | 140 gso/s, 0.37% err | ~26.1 gso/s, 0.35% err | 103 gso/s, 0.26% err |
43
- | `e5m2` | — | 0.4 gso/s, 4.6% err | ~26.4 gso/s, 4.6% err | 398 gso/s, 0% err |
44
- | `i8` | 0.4 gso/s, __overflow__ | 50.0 gso/s, __overflow__ | ~0.0 gso/s, __overflow__ | 1279 gso/s, 0% err |
45
-
46
46
  For `f64`, compensated "Dot2" summation reduces error by 10–50× compared to naive Float64 accumulation, depending on vector length.
47
47
  For `f32`, widening to Float64 gives 5–10× lower error.
48
48
  The library ships as a relatively small binary:
@@ -77,27 +77,27 @@ NumKong covers 17 numeric types — from 6-bit floats to 64-bit complex numbers
77
77
 
78
78
  ### Language Bindings
79
79
 
80
- | Operation | [C and C++][c] | [Python][py] | [Rust][rs] | [JavaScript][js] | [Swift][swift] | [GoLang][go] |
81
- | :-------------------------- | :------------: | :----------: | :--------: | :--------------: | :------------: | :----------: |
82
- | __Vector Ops__ | | | | | | |
83
- | [Dot] Product | | ● | ● | ● | ● | ● |
84
- | [Spatial] Metric | | ● | ● | ● | ● | ● |
85
- | [Set] Similarity | | ● | ● | ● | ● | ● |
86
- | [Geo]spatial | | ● | ● | · | ● | ● |
87
- | [Mesh] Alignment | | ● | ● | · | · | · |
88
- | [Sparse] Products | | ● | ● | · | · | · |
89
- | [Probability] Divergences | | ● | ● | ● | · | ● |
90
- | [Curved] Spaces | | ● | ● | · | · | · |
91
- | __Many-to-Many Vector Ops__ | | | | | | |
92
- | "[Dots]" Products | | ● | ● | ● | ● | ● |
93
- | "[Spatials]" Metrics | | ● | ● | ● | ● | ● |
94
- | "[Sets]" Similarities | | ● | ● | · | ● | ● |
95
- | [MaxSim] Scoring | | ● | ● | · | ● | ● |
96
- | __Scalar Ops__ | | | | | | |
97
- | [Cast] | | ● | ● | ● | · | · |
98
- | [Reduce] | | ● | ● | · | · | · |
99
- | [Each] | | ● | ● | · | · | · |
100
- | [Trigonometry] | | ● | ● | · | · | · |
80
+ | Operation | [C 99 & C++ 23][c] | [Python][py] | [Rust][rs] | [JavaScript][js] | [Swift][swift] | [GoLang][go] |
81
+ | :-------------------------- | :----------------: | :----------: | :--------: | :--------------: | :------------: | :----------: |
82
+ | __Vector Ops__ | | | | | | |
83
+ | [Dot] Product | | ● | ● | ● | ● | ● |
84
+ | [Spatial] Metric | | ● | ● | ● | ● | ● |
85
+ | [Set] Similarity | | ● | ● | ● | ● | ● |
86
+ | [Geo]spatial | | ● | ● | · | ● | ● |
87
+ | [Mesh] Alignment | | ● | ● | · | · | · |
88
+ | [Sparse] Products | | ● | ● | · | · | · |
89
+ | [Probability] Divergences | | ● | ● | ● | · | ● |
90
+ | [Curved] Spaces | | ● | ● | · | · | · |
91
+ | __Many-to-Many Vector Ops__ | | | | | | |
92
+ | "[Dots]" Products | | ● | ● | ● | ● | ● |
93
+ | "[Spatials]" Metrics | | ● | ● | ● | ● | ● |
94
+ | "[Sets]" Similarities | | ● | ● | · | ● | ● |
95
+ | [MaxSim] Scoring | | ● | ● | · | ● | ● |
96
+ | __Scalar Ops__ | | | | | | |
97
+ | [Cast] | | ● | ● | ● | · | · |
98
+ | [Reduce] | | ● | ● | · | · | · |
99
+ | [Each] | | ● | ● | · | · | · |
100
+ | [Trigonometry] | | ● | ● | · | · | · |
101
101
 
102
102
  [Dot]: include/numkong/dot/README.md
103
103
  [Dots]: include/numkong/dots/README.md
@@ -392,16 +392,16 @@ On x86, older CPUs use __F16C extensions__ (Ivy Bridge+) for fast Float16 → Fl
392
392
  On Arm, ARMv8.4-A adds __FMLAL/FMLAL2__ instructions for fused Float16 → Float32 widening multiply-accumulate, reducing the total latency from 7 cycles to 4 cycles and achieving 20–48% speedup over the separate convert-then-FMA path.
393
393
 
394
394
  | Platform | BFloat16 Path | Elem/Op | Float16 Path | Elem/Op |
395
- | ---------------------- | -------------------------- | ------: | ---------------------- | ------: |
395
+ | :--------------------- | :------------------------- | ------: | :--------------------- | ------: |
396
396
  | __x86__ | | | | |
397
- | Diamond Rapids (2025) | ↓ Genoa | 32 | `VDPPHPS` widening dot | 32 |
397
+ | Diamond Rapids (2026) | ↓ Genoa | 32 | `VDPPHPS` widening dot | 32 |
398
398
  | Sapphire Rapids (2023) | ↓ Genoa | 32 | ↓ Skylake | 16 |
399
399
  | Genoa (2022) | `VDPBF16PS` widening dot | 32 | ↓ Skylake | 16 |
400
400
  | Skylake (2015) | `SLLI` + `VFMADD` | 16 | `VCVTPH2PS` + `VFMADD` | 16 |
401
401
  | Haswell (2013) | `SLLI` + `VFMADD` | 8 | `VCVTPH2PS` + `VFMADD` | 8 |
402
402
  | __Arm__ | | | | |
403
- | Graviton 3 (2021) | `SVBFDOT` widening dot | 4–32 | `SVCVT` → `SVFMLA` | 4–32 |
404
403
  | Apple M2+ (2022) | `BFDOT` widening dot | 8 | ↓ FP16FML | 8 |
404
+ | Graviton 3+ (2021) | `SVBFDOT` widening dot | 4–32 | `SVCVT` → `SVFMLA` | 4–32 |
405
405
  | Apple M1 (2020) | ↓ NEON | 8 | `FMLAL` widening FMA | 8 |
406
406
  | Graviton 2 (2019) | ↓ NEON | 8 | `FCVTL` + `FMLA` | 4 |
407
407
  | Graviton 1 (2018) | `SHLL` + `FMLA` | 8 | bit-manip → `FMLA` | 8 |
@@ -420,14 +420,14 @@ On Arm, ARMv8.4-A adds __FMLAL/FMLAL2__ instructions for fused Float16 → Float
420
420
 
421
421
  ### Mini-Floats: E4M3, E5M2, E3M2, & E2M3
422
422
 
423
- | Format | Bits | Range | NumKong Promotion Rules | Support in GPUs |
424
- | ------------------------- | ----: | -----: | ----------------------------------------------- | ----------------- |
425
- | E5M2FN | 8 | ±57344 | BFloat16 → Float32 | H100+, MI300+ |
426
- | E4M3FN | 8 | ±448 | BFloat16 → Float32 | H100+, MI300+ |
427
- | E3M2FN | 6 → 8 | ±28 | BFloat16 & Float16 → Float32,<br/>Int16 → Int32 | only block-scaled |
428
- | E2M3FN | 6 → 8 | ±7.5 | BFloat16 & Float16 → Float32,<br/>Int8 → Int32 | only block-scaled |
429
- | Block-scaled NVFP4 | 4 | ±6 | — | B200+ |
430
- | Block-scaled MXFP4 / E2M1 | 4 | ±6 | — | B200+, MI325+ |
423
+ | Format | Bits | Range | NumKong Promotion Rules | Support in GPUs |
424
+ | :----------- | ----: | -----: | ------------------------------------- | ----------------- |
425
+ | E5M2FN | 8 | ±57344 | BFloat16 → Float32 | H100+, MI300+ |
426
+ | E4M3FN | 8 | ±448 | BFloat16 → Float32 | H100+, MI300+ |
427
+ | E3M2FN | 6 → 8 | ±28 | BFloat16 & Float16 → Float32, Int16 → Int32 | only block-scaled |
428
+ | E2M3FN | 6 → 8 | ±7.5 | BFloat16 & Float16 → Float32, Int8 → Int32 | only block-scaled |
429
+ | Scaled NVFP4 | 4 | ±6 | — | B200+ |
430
+ | Scaled MXFP4 | 4 | ±6 | — | B200+, MI325+ |
431
431
 
432
432
  > __Block scaling.__
433
433
  > NumKong does not implement block-scaled variants (MXFP4, NVFP4, or block-scaled E3M2/E2M3).
@@ -444,22 +444,22 @@ E4M3FN (no infinities, NaN only) is preferred for __training__ where precision n
444
444
  On x86 Genoa/Sapphire Rapids, E4M3/E5M2 values upcast to BFloat16 via lookup tables, then use native __DPBF16PS__ for 2-per-lane dot products accumulating to Float32.
445
445
  On Arm Graviton 3+, the same BFloat16 upcast happens via NEON table lookups, then __BFDOT__ instructions complete the computation.
446
446
 
447
- | Platform | E5M2 Path | Elem/Op | E4M3 Path | Elem/Op |
448
- | -------------------------- | ------------------------------ | ------: | ------------------------------ | ------: |
449
- | __x86__ | | | | |
450
- | Diamond Rapids (2025) | `VCVTBF82PH` → F16 + `VDPPHPS` | 32 | `VCVTHF82PH` → F16 + `VDPPHPS` | 32 |
451
- | Genoa (2022) | → BF16 + `VDPBF16PS` | 32 | ↓ Ice Lake | 64 |
452
- | Ice Lake (2019) | ↓ Skylake | 16 | octave LUT + `VPDPBUSD` | 64 |
453
- | Skylake (2015) | rebias → F32 FMA | 16 | rebias → F32 FMA | 16 |
454
- | Haswell (2013) | rebias → F32 FMA | 8 | rebias → F32 FMA | 8 |
455
- | __Arm__ | | | | |
456
- | NEON + FP8DOT (Olympus) | native `FDOT` | 16 | native `FDOT` | 16 |
457
- | NEON + FP16FML (Apple M1+) | SHL → F16 + `FMLAL` | 16 | LUT → F16 + `FMLAL` | 16 |
458
- | NEON (Graviton 1+) | SHL + `FCVTL` + FMA | 8 | → F16 + `FCVTL` + FMA | 8 |
459
- | __RISC-V__ | | | | |
460
- | RVV + Zvfbfwma | rebias → BF16 + `VFWMACCBF16` | 4–32 | LUT → BF16 + `VFWMACCBF16` | 4–32 |
461
- | RVV + Zvfh | SHL → F16 + `VFWMACC` | 4–32 | LUT → F16 + `VFWMACC` | 4–32 |
462
- | RVV | rebias → F32 + `VFMACC` | 4–32 | LUT → F32 + `VFMACC` | 4–32 |
447
+ | Platform | E5M2 Path | Elem/Op | E4M3 Path | Elem/Op |
448
+ | :-------------------- | :----------------------------- | ------: | :----------------------------- | ------: |
449
+ | __x86__ | | | | |
450
+ | Diamond Rapids (2026) | `VCVTBF82PH` → F16 + `VDPPHPS` | 32 | `VCVTHF82PH` → F16 + `VDPPHPS` | 32 |
451
+ | Genoa (2022) | → BF16 + `VDPBF16PS` | 32 | ↓ Ice Lake | 64 |
452
+ | Ice Lake (2019) | ↓ Skylake | 16 | octave LUT + `VPDPBUSD` | 64 |
453
+ | Skylake (2015) | rebias → F32 FMA | 16 | rebias → F32 FMA | 16 |
454
+ | Haswell (2013) | rebias → F32 FMA | 8 | rebias → F32 FMA | 8 |
455
+ | __Arm__ | | | | |
456
+ | NEON + FP8DOT (2026) | native `FDOT` | 16 | native `FDOT` | 16 |
457
+ | NEON + FP16FML (2020) | SHL → F16 + `FMLAL` | 16 | LUT → F16 + `FMLAL` | 16 |
458
+ | NEON (2018) | SHL + `FCVTL` + FMA | 8 | → F16 + `FCVTL` + FMA | 8 |
459
+ | __RISC-V__ | | | | |
460
+ | RVV + Zvfbfwma | rebias → BF16 + `VFWMACCBF16` | 4–32 | LUT → BF16 + `VFWMACCBF16` | 4–32 |
461
+ | RVV + Zvfh | SHL → F16 + `VFWMACC` | 4–32 | LUT → F16 + `VFWMACC` | 4–32 |
462
+ | RVV | rebias → F32 + `VFMACC` | 4–32 | LUT → F32 + `VFMACC` | 4–32 |
463
463
 
464
464
  > E5M2 shares Float16's exponent bias (15), so E5M2 → Float16 conversion is a single left-shift by 8 bits (`SHL 8`).
465
465
  > E4M3 on Ice Lake uses "octave decomposition": the 4-bit exponent splits into 2 octave + 2 remainder bits, yielding 7 integer accumulators post-scaled by powers of 2.
@@ -469,20 +469,20 @@ Their smaller range allows scaling to exact integers that fit in `i8`/`i16`, ena
469
469
  Float16 can also serve as an accumulator, accurately representing ~50 products of E3M2FN pairs or ~20 products of E2M3FN pairs before overflow.
470
470
  On Arm, NEON FHM extensions bring widening `FMLAL` dot-products for Float16 — both faster and more widely available than `BFDOT` for BFloat16.
471
471
 
472
- | Platform | E3M2 Path | Elem/Op | E2M3 Path | Elem/Op |
473
- | ---------------------------- | -------------------------- | ------: | ---------------------------- | ------: |
474
- | __x86__ | | | | |
475
- | Ice Lake (2019) | `VPERMW` LUT + `VPMADDWD` | 32 | `VPERMB` LUT + `VPDPBUSD` | 64 |
476
- | Sierra Forest (2024) | ↓ Haswell | 32 | `VPSHUFB` LUT + `VPDPBSSD` | 32 |
477
- | Alder Lake (2021) | Haswell | 32 | `VPSHUFB` LUT + `VPDPBUSD` | 32 |
478
- | Skylake (2015) | `VPSHUFB` LUT + `VPMADDWD` | 64 | `VPSHUFB` LUT + `VPMADDUBSW` | 64 |
479
- | Haswell (2013) | `VPSHUFB` LUT + `VPMADDWD` | 32 | `VPSHUFB` LUT + `VPMADDUBSW` | 32 |
480
- | __Arm__ | | | | |
481
- | NEON + FP8DOT (Olympus) | → E5M2 + `FDOT` | 16 | → E4M3 + `FDOT` | 16 |
482
- | NEON + DotProd (Graviton 2+) | `VQTBL2` LUT + `SMLAL` | 16 | `VQTBL2` LUT + `SDOT` | 16 |
483
- | NEON (Graviton 1+) | → F16 + `FCVTL` + FMA | 16 | → F16 + `FCVTL` + FMA | 16 |
484
- | __RISC-V__ | | | | |
485
- | RVV | I16 gather LUT + `VWMACC` | 4–32 | U8 gather LUT + `VWMACC` | 4–32 |
472
+ | Platform | E3M2 Path | Elem/Op | E2M3 Path | Elem/Op |
473
+ | :-------------------- | :------------------------- | ------: | :--------------------------- | ------: |
474
+ | __x86__ | | | | |
475
+ | Sierra Forest (2024) | ↓ Haswell | 32 | `VPSHUFB` LUT + `VPDPBSSD` | 32 |
476
+ | Alder Lake (2021) | ↓ Haswell | 32 | `VPSHUFB` LUT + `VPDPBUSD` | 32 |
477
+ | Ice Lake (2019) | `VPERMW` LUT + `VPMADDWD` | 32 | `VPERMB` LUT + `VPDPBUSD` | 64 |
478
+ | Skylake (2015) | `VPSHUFB` LUT + `VPMADDWD` | 64 | `VPSHUFB` LUT + `VPMADDUBSW` | 64 |
479
+ | Haswell (2013) | `VPSHUFB` LUT + `VPMADDWD` | 32 | `VPSHUFB` LUT + `VPMADDUBSW` | 32 |
480
+ | __Arm__ | | | | |
481
+ | NEON + FP8DOT (2026) | → E5M2 + `FDOT` | 16 | → E4M3 + `FDOT` | 16 |
482
+ | NEON + DotProd (2019) | `VQTBL2` LUT + `SMLAL` | 16 | `VQTBL2` LUT + `SDOT` | 16 |
483
+ | NEON (2018) | → F16 + `FCVTL` + FMA | 16 | → F16 + `FCVTL` + FMA | 16 |
484
+ | __RISC-V__ | | | | |
485
+ | RVV | I16 gather LUT + `VWMACC` | 4–32 | U8 gather LUT + `VWMACC` | 4–32 |
486
486
 
487
487
  > E3M2/E2M3 values map to exact integers via 32-entry LUTs (magnitudes up to 448 for E3M2, 120 for E2M3), enabling integer accumulation with no rounding error.
488
488
  > On NEON + FP8DOT, E3M2 is first promoted to E5M2 and E2M3 to E4M3 before the hardware `FDOT` instruction.
@@ -494,7 +494,7 @@ E5M2's range (±57,344) makes the scaled product exceed Int32 entirely.
494
494
  Without the integer path, E5M2 falls back to Float32 accumulation — where its [2-bit mantissa (only 4 values per binade)](https://developer.nvidia.com/blog/floating-point-8-an-introduction-to-efficient-lower-precision-ai-training/) creates a [catastrophic cancellation risk](https://www.ac.uma.es/arith2024/papers/Fused%20FP8%204-Way%20Dot%20Product%20with%20Scaling%20and%20FP32%20Accumulation.pdf) that E2M3's integer path avoids completely:
495
495
 
496
496
  | | _i_ = 0 | _i_ = 1 | _i_ = 2 | _i_ = 3 | _i_ = 4 | _i_ = 5 | _i_ = 6 |
497
- | ------- | -------: | ------: | -------: | --------: | -------: | -------: | -------: |
497
+ | :------ | -------: | ------: | -------: | --------: | -------: | -------: | -------: |
498
498
  | _aᵢ_ | 0.00122 | 20480 | −0.00122 | 1.5 | −3072 | −640 | 0.00146 |
499
499
  | _bᵢ_ | −40 | 320 | −1280 | −7.63e⁻⁵ | 0.000427 | 10240 | −4.58e⁻⁵ |
500
500
  | _aᵢ·bᵢ_ | −0.04883 | 6553600 | 1.5625 | −0.000114 | −1.3125 | −6553600 | ≈ 0 |
package/c/numkong.c CHANGED
@@ -943,7 +943,7 @@ BOOL WINAPI DllMain(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpReserved) {
943
943
  // carries ZA state. So __arm_tpidr2_save is always a no-op and
944
944
  // __arm_tpidr2_restore has nothing to restore.
945
945
  // Weak linkage lets a real compiler-rt override these if available.
946
- #if NK_TARGET_ARM_ && NK_TARGET_SME
946
+ #if NK_TARGET_ARM64_ && NK_TARGET_SME
947
947
  __attribute__((weak, visibility("default"))) void __arm_tpidr2_save(void) {}
948
948
  __attribute__((weak, visibility("default"))) void __arm_tpidr2_restore(void *blk) { nk_unused_(blk); }
949
949
  #endif
@@ -62,7 +62,7 @@
62
62
  #ifndef NK_ATTENTION_SAPPHIREAMX_H
63
63
  #define NK_ATTENTION_SAPPHIREAMX_H
64
64
 
65
- #if NK_TARGET_X86_
65
+ #if NK_TARGET_X8664_
66
66
  #if NK_TARGET_SAPPHIREAMX
67
67
 
68
68
  #include "numkong/types.h"
@@ -1359,5 +1359,5 @@ NK_PUBLIC void nk_attention_causal_bf16_sapphireamx(nk_bf16_t const *q, void con
1359
1359
  #endif
1360
1360
 
1361
1361
  #endif // NK_TARGET_SAPPHIREAMX
1362
- #endif // NK_TARGET_X86_
1362
+ #endif // NK_TARGET_X8664_
1363
1363
  #endif // NK_ATTENTION_SAPPHIREAMX_H
@@ -91,7 +91,7 @@
91
91
  #ifndef NK_ATTENTION_SME_H
92
92
  #define NK_ATTENTION_SME_H
93
93
 
94
- #if NK_TARGET_ARM_
94
+ #if NK_TARGET_ARM64_
95
95
  #if NK_TARGET_SME
96
96
 
97
97
  #include "numkong/types.h"
@@ -2068,5 +2068,5 @@ NK_PUBLIC void nk_attention_causal_f16_sme(nk_f16_t const *q, void const *kv_pac
2068
2068
  #endif
2069
2069
 
2070
2070
  #endif // NK_TARGET_SME
2071
- #endif // NK_TARGET_ARM_
2071
+ #endif // NK_TARGET_ARM64_
2072
2072
  #endif // NK_ATTENTION_SME_H
@@ -96,7 +96,7 @@
96
96
 
97
97
  #define NK_VERSION_MAJOR 7
98
98
  #define NK_VERSION_MINOR 4
99
- #define NK_VERSION_PATCH 2
99
+ #define NK_VERSION_PATCH 3
100
100
 
101
101
  /**
102
102
  * @brief Removes compile-time dispatching, and replaces it with runtime dispatching.
@@ -132,33 +132,33 @@
132
132
  // With `-std=c11` glibc hides `syscall()` behind `_GNU_SOURCE`, but if any
133
133
  // system header was included before us, `<features.h>` is already locked.
134
134
  // Forward-declare `syscall` directly — it always exists in glibc.
135
- #if defined(NK_DEFINED_LINUX_) && (NK_TARGET_X86_ || NK_TARGET_RISCV_)
135
+ #if defined(NK_DEFINED_LINUX_) && (NK_TARGET_X8664_ || NK_TARGET_RISCV64_)
136
136
  #include <sys/syscall.h> // `SYS_arch_prctl`, `SYS_riscv_hwprobe`
137
137
  #ifdef __cplusplus
138
138
  extern "C" long syscall(long, ...) noexcept;
139
139
  #else
140
140
  extern long syscall(long, ...);
141
141
  #endif
142
- #if NK_TARGET_RISCV_
142
+ #if NK_TARGET_RISCV64_
143
143
  #include <sys/auxv.h> // `getauxval`, `AT_HWCAP`
144
144
  #endif
145
145
  #endif
146
146
 
147
- #if defined(NK_DEFINED_LINUX_) && NK_TARGET_LOONGARCH_
147
+ #if defined(NK_DEFINED_LINUX_) && NK_TARGET_LOONGARCH64_
148
148
  #include <sys/auxv.h> // `getauxval`, `AT_HWCAP`
149
149
  #endif
150
150
 
151
- #if defined(NK_DEFINED_LINUX_) && NK_TARGET_POWER_
151
+ #if defined(NK_DEFINED_LINUX_) && NK_TARGET_POWER64_
152
152
  #include <sys/auxv.h> // `getauxval`, `AT_HWCAP`
153
153
  #endif
154
154
 
155
155
  // On FreeBSD RISC-V, we use elf_aux_info for capability detection
156
- #if defined(NK_DEFINED_FREEBSD_) && NK_TARGET_RISCV_
156
+ #if defined(NK_DEFINED_FREEBSD_) && NK_TARGET_RISCV64_
157
157
  #include <sys/auxv.h> // `elf_aux_info`, `AT_HWCAP`
158
158
  #endif
159
159
 
160
160
  // On Windows ARM, we use IsProcessorFeaturePresent API for capability detection
161
- #if defined(NK_DEFINED_WINDOWS_) && NK_TARGET_ARM_
161
+ #if defined(NK_DEFINED_WINDOWS_) && NK_TARGET_ARM64_
162
162
  #include <processthreadsapi.h> // `IsProcessorFeaturePresent`
163
163
  #endif
164
164
 
@@ -388,7 +388,7 @@ typedef void (*nk_kernel_cast_punned_t)(void const *from, nk_dtype_t from_type,
388
388
 
389
389
  typedef void (*nk_kernel_punned_t)(void *);
390
390
 
391
- #if NK_TARGET_X86_
391
+ #if NK_TARGET_X8664_
392
392
 
393
393
  NK_PUBLIC int nk_configure_thread_x86_(nk_capability_t capabilities) {
394
394
  #if NK_TARGET_SAPPHIREAMX
@@ -409,7 +409,7 @@ NK_PUBLIC int nk_configure_thread_x86_(nk_capability_t capabilities) {
409
409
  return 1;
410
410
  }
411
411
 
412
- NK_PUBLIC nk_capability_t nk_capabilities_x86_(void) {
412
+ NK_PUBLIC nk_capability_t nk_capabilities_x8664_(void) {
413
413
  union four_registers_t {
414
414
  int array[4];
415
415
  struct separate_t {
@@ -496,9 +496,9 @@ NK_PUBLIC nk_capability_t nk_capabilities_x86_(void) {
496
496
  (nk_cap_graniteamx_k * supports_graniteamx) | (nk_cap_serial_k));
497
497
  }
498
498
 
499
- #endif // NK_TARGET_X86_
499
+ #endif // NK_TARGET_X8664_
500
500
 
501
- #if NK_TARGET_ARM_
501
+ #if NK_TARGET_ARM64_
502
502
 
503
503
  #if defined(__clang__)
504
504
  #pragma clang attribute push(__attribute__((target("arch=armv8.5-a+sve"))), apply_to = function)
@@ -508,14 +508,14 @@ NK_PUBLIC nk_capability_t nk_capabilities_x86_(void) {
508
508
  #endif
509
509
 
510
510
  #if NK_HAS_POSIX_EXTENSIONS_
511
- static sigjmp_buf nk_mrs_test_jump_buffer_;
512
- static void nk_mrs_test_sigill_handler_(int sig) {
511
+ static sigjmp_buf nk_mrs_arm64_jump_buffer_;
512
+ static void nk_mrs_arm64_sigill_handler_(int sig) {
513
513
  nk_unused_(sig);
514
- siglongjmp(nk_mrs_test_jump_buffer_, 1);
514
+ siglongjmp(nk_mrs_arm64_jump_buffer_, 1);
515
515
  }
516
516
  #endif
517
517
 
518
- NK_PUBLIC int nk_configure_thread_arm_(nk_capability_t capabilities) {
518
+ NK_PUBLIC int nk_configure_thread_arm64_(nk_capability_t capabilities) {
519
519
  #if defined(_MSC_VER)
520
520
  nk_unused_(capabilities);
521
521
  return 1;
@@ -546,7 +546,7 @@ NK_PUBLIC int nk_configure_thread_arm_(nk_capability_t capabilities) {
546
546
 
547
547
  #elif defined(NK_DEFINED_LINUX_) || defined(NK_DEFINED_FREEBSD_)
548
548
  // Read ID registers via MRS. Only safe if MRS is known to work — indicated by
549
- // capabilities beyond basic NEON (nk_capabilities_arm_ validated MRS via sigaction probe).
549
+ // capabilities beyond basic NEON (nk_capabilities_arm64_ validated MRS via sigaction probe).
550
550
  if (capabilities & ~(nk_cap_neon_k | nk_cap_serial_k)) {
551
551
  // FEAT_EBF16: ID_AA64ISAR1_EL1.BF16 bits [47:44] >= 0b0010
552
552
  register unsigned long isar1_val __asm__("x0");
@@ -570,7 +570,7 @@ NK_PUBLIC int nk_configure_thread_arm_(nk_capability_t capabilities) {
570
570
  #endif // _MSC_VER
571
571
  }
572
572
 
573
- NK_PUBLIC nk_capability_t nk_capabilities_arm_(void) {
573
+ NK_PUBLIC nk_capability_t nk_capabilities_arm64_(void) {
574
574
  #if defined(NK_DEFINED_APPLE_)
575
575
  size_t size = sizeof(unsigned);
576
576
  unsigned supports_neon = 0, supports_fp16 = 0, supports_fhm = 0, supports_bf16 = 0, supports_i8mm = 0;
@@ -602,13 +602,13 @@ NK_PUBLIC nk_capability_t nk_capabilities_arm_(void) {
602
602
 
603
603
  #if NK_HAS_POSIX_EXTENSIONS_
604
604
  struct sigaction action_new, action_old;
605
- action_new.sa_handler = nk_mrs_test_sigill_handler_;
605
+ action_new.sa_handler = nk_mrs_arm64_sigill_handler_;
606
606
  sigemptyset(&action_new.sa_mask);
607
607
  action_new.sa_flags = 0;
608
608
 
609
609
  int mrs_works = 0;
610
610
  if (sigaction(SIGILL, &action_new, &action_old) == 0) {
611
- if (sigsetjmp(nk_mrs_test_jump_buffer_, 1) == 0) {
611
+ if (sigsetjmp(nk_mrs_arm64_jump_buffer_, 1) == 0) {
612
612
  register unsigned long midr_value __asm__("x0");
613
613
  __asm__ __volatile__(".inst 0xD5380000" : "=r"(midr_value)); // MRS x0, MIDR_EL1
614
614
  mrs_works = 1;
@@ -722,11 +722,11 @@ NK_PUBLIC nk_capability_t nk_capabilities_arm_(void) {
722
722
  #pragma GCC pop_options
723
723
  #endif
724
724
 
725
- #endif // NK_TARGET_ARM_
725
+ #endif // NK_TARGET_ARM64_
726
726
 
727
- #if NK_TARGET_RISCV_
727
+ #if NK_TARGET_RISCV64_
728
728
 
729
- NK_PUBLIC nk_capability_t nk_capabilities_riscv_(void) {
729
+ NK_PUBLIC nk_capability_t nk_capabilities_riscv64_(void) {
730
730
  #if defined(NK_DEFINED_LINUX_)
731
731
  unsigned long hwcap = getauxval(AT_HWCAP);
732
732
  nk_capability_t caps = nk_cap_serial_k;
@@ -758,11 +758,11 @@ NK_PUBLIC nk_capability_t nk_capabilities_riscv_(void) {
758
758
  #endif
759
759
  }
760
760
 
761
- #endif // NK_TARGET_RISCV_
761
+ #endif // NK_TARGET_RISCV64_
762
762
 
763
- #if NK_TARGET_LOONGARCH_
763
+ #if NK_TARGET_LOONGARCH64_
764
764
 
765
- NK_PUBLIC nk_capability_t nk_capabilities_loongarch_(void) {
765
+ NK_PUBLIC nk_capability_t nk_capabilities_loongarch64_(void) {
766
766
  #if defined(NK_DEFINED_LINUX_)
767
767
  unsigned long hwcap = getauxval(AT_HWCAP);
768
768
  nk_capability_t caps = nk_cap_serial_k;
@@ -774,11 +774,11 @@ NK_PUBLIC nk_capability_t nk_capabilities_loongarch_(void) {
774
774
  #endif
775
775
  }
776
776
 
777
- #endif // NK_TARGET_LOONGARCH_
777
+ #endif // NK_TARGET_LOONGARCH64_
778
778
 
779
- #if NK_TARGET_POWER_
779
+ #if NK_TARGET_POWER64_
780
780
 
781
- NK_PUBLIC nk_capability_t nk_capabilities_power_(void) {
781
+ NK_PUBLIC nk_capability_t nk_capabilities_power64_(void) {
782
782
  #if defined(NK_DEFINED_LINUX_)
783
783
  unsigned long hwcap = getauxval(AT_HWCAP);
784
784
  unsigned long hwcap2 = getauxval(AT_HWCAP2);
@@ -792,7 +792,7 @@ NK_PUBLIC nk_capability_t nk_capabilities_power_(void) {
792
792
  #endif
793
793
  }
794
794
 
795
- #endif // NK_TARGET_POWER_
795
+ #endif // NK_TARGET_POWER64_
796
796
 
797
797
  #if NK_TARGET_WASM_
798
798
 
@@ -826,27 +826,27 @@ NK_PUBLIC nk_capability_t nk_capabilities_v128relaxed_(void) {
826
826
  #endif // NK_TARGET_WASM_
827
827
 
828
828
  NK_PUBLIC int nk_configure_thread_(nk_capability_t capabilities) {
829
- #if NK_TARGET_X86_
829
+ #if NK_TARGET_X8664_
830
830
  return nk_configure_thread_x86_(capabilities);
831
831
  #endif
832
- #if NK_TARGET_ARM_
833
- return nk_configure_thread_arm_(capabilities);
832
+ #if NK_TARGET_ARM64_
833
+ return nk_configure_thread_arm64_(capabilities);
834
834
  #endif
835
835
  nk_unused_(capabilities);
836
836
  return 1; // success — no platform-specific thread configuration needed
837
837
  }
838
838
 
839
839
  NK_PUBLIC nk_capability_t nk_capabilities_(void) {
840
- #if NK_TARGET_X86_
841
- return nk_capabilities_x86_();
842
- #elif NK_TARGET_ARM_
843
- return nk_capabilities_arm_();
844
- #elif NK_TARGET_RISCV_
845
- return nk_capabilities_riscv_();
846
- #elif NK_TARGET_LOONGARCH_
847
- return nk_capabilities_loongarch_();
848
- #elif NK_TARGET_POWER_
849
- return nk_capabilities_power_();
840
+ #if NK_TARGET_X8664_
841
+ return nk_capabilities_x8664_();
842
+ #elif NK_TARGET_ARM64_
843
+ return nk_capabilities_arm64_();
844
+ #elif NK_TARGET_RISCV64_
845
+ return nk_capabilities_riscv64_();
846
+ #elif NK_TARGET_LOONGARCH64_
847
+ return nk_capabilities_loongarch64_();
848
+ #elif NK_TARGET_POWER64_
849
+ return nk_capabilities_power64_();
850
850
  #elif NK_TARGET_WASM_
851
851
  return nk_capabilities_v128relaxed_();
852
852
  #else
@@ -860,7 +860,7 @@ NK_PUBLIC nk_capability_t nk_capabilities_(void) {
860
860
  */
861
861
  NK_PUBLIC nk_capability_t nk_capabilities_compiled_(void) {
862
862
  nk_capability_t caps = nk_cap_serial_k;
863
- #if NK_TARGET_X86_
863
+ #if NK_TARGET_X8664_
864
864
  caps |= nk_cap_haswell_k * NK_TARGET_HASWELL;
865
865
  caps |= nk_cap_skylake_k * NK_TARGET_SKYLAKE;
866
866
  caps |= nk_cap_icelake_k * NK_TARGET_ICELAKE;
@@ -873,7 +873,7 @@ NK_PUBLIC nk_capability_t nk_capabilities_compiled_(void) {
873
873
  caps |= nk_cap_alder_k * NK_TARGET_ALDER;
874
874
  caps |= nk_cap_sierra_k * NK_TARGET_SIERRA;
875
875
  #endif
876
- #if NK_TARGET_ARM_
876
+ #if NK_TARGET_ARM64_
877
877
  caps |= nk_cap_neon_k * NK_TARGET_NEON;
878
878
  caps |= nk_cap_neonhalf_k * NK_TARGET_NEONHALF;
879
879
  caps |= nk_cap_neonsdot_k * NK_TARGET_NEONSDOT;
@@ -896,16 +896,16 @@ NK_PUBLIC nk_capability_t nk_capabilities_compiled_(void) {
896
896
  caps |= nk_cap_smelut2_k * NK_TARGET_SMELUT2;
897
897
  caps |= nk_cap_smefa64_k * NK_TARGET_SMEFA64;
898
898
  #endif
899
- #if NK_TARGET_RISCV_
899
+ #if NK_TARGET_RISCV64_
900
900
  caps |= nk_cap_rvv_k * NK_TARGET_RVV;
901
901
  caps |= nk_cap_rvvhalf_k * NK_TARGET_RVVHALF;
902
902
  caps |= nk_cap_rvvbf16_k * NK_TARGET_RVVBF16;
903
903
  caps |= nk_cap_rvvbb_k * NK_TARGET_RVVBB;
904
904
  #endif
905
- #if NK_TARGET_LOONGARCH_
905
+ #if NK_TARGET_LOONGARCH64_
906
906
  caps |= nk_cap_loongsonasx_k * NK_TARGET_LOONGSONASX;
907
907
  #endif
908
- #if NK_TARGET_POWER_
908
+ #if NK_TARGET_POWER64_
909
909
  caps |= nk_cap_powervsx_k * NK_TARGET_POWERVSX;
910
910
  #endif
911
911
  #if NK_TARGET_WASM_
@@ -12,7 +12,7 @@
12
12
  #ifndef NK_CAST_DIAMOND_H
13
13
  #define NK_CAST_DIAMOND_H
14
14
 
15
- #if NK_TARGET_X86_
15
+ #if NK_TARGET_X8664_
16
16
  #if NK_TARGET_DIAMOND
17
17
 
18
18
  #include "numkong/types.h"
@@ -60,5 +60,5 @@ NK_INTERNAL void nk_partial_load_e5m2x32_to_f16x32_diamond_(nk_e5m2_t const *src
60
60
  #endif
61
61
 
62
62
  #endif // NK_TARGET_DIAMOND
63
- #endif // NK_TARGET_X86_
63
+ #endif // NK_TARGET_X8664_
64
64
  #endif // NK_CAST_DIAMOND_H
@@ -20,7 +20,7 @@
20
20
  #ifndef NK_CAST_HASWELL_H
21
21
  #define NK_CAST_HASWELL_H
22
22
 
23
- #if NK_TARGET_X86_
23
+ #if NK_TARGET_X8664_
24
24
  #if NK_TARGET_HASWELL
25
25
 
26
26
  #include "numkong/types.h"
@@ -819,5 +819,5 @@ NK_PUBLIC void nk_cast_haswell(void const *from, nk_dtype_t from_type, nk_size_t
819
819
  #endif
820
820
 
821
821
  #endif // NK_TARGET_HASWELL
822
- #endif // NK_TARGET_X86_
822
+ #endif // NK_TARGET_X8664_
823
823
  #endif // NK_CAST_HASWELL_H
@@ -19,7 +19,7 @@
19
19
  #ifndef NK_CAST_ICELAKE_H
20
20
  #define NK_CAST_ICELAKE_H
21
21
 
22
- #if NK_TARGET_X86_
22
+ #if NK_TARGET_X8664_
23
23
  #if NK_TARGET_ICELAKE
24
24
 
25
25
  #include "numkong/types.h"
@@ -471,5 +471,5 @@ NK_PUBLIC void nk_cast_icelake(void const *from, nk_dtype_t from_type, nk_size_t
471
471
  #endif
472
472
 
473
473
  #endif // NK_TARGET_ICELAKE
474
- #endif // NK_TARGET_X86_
474
+ #endif // NK_TARGET_X8664_
475
475
  #endif // NK_CAST_ICELAKE_H
@@ -27,7 +27,7 @@
27
27
  #ifndef NK_CAST_LOONGSONASX_H
28
28
  #define NK_CAST_LOONGSONASX_H
29
29
 
30
- #if NK_TARGET_LOONGARCH_
30
+ #if NK_TARGET_LOONGARCH64_
31
31
  #if NK_TARGET_LOONGSONASX
32
32
 
33
33
  #include "numkong/types.h"
@@ -248,5 +248,5 @@ NK_INTERNAL void nk_euclidean_through_u32_from_dot_loongsonasx_(nk_b128_vec_t do
248
248
  #endif
249
249
 
250
250
  #endif // NK_TARGET_LOONGSONASX
251
- #endif // NK_TARGET_LOONGARCH_
251
+ #endif // NK_TARGET_LOONGARCH64_
252
252
  #endif // NK_CAST_LOONGSONASX_H
@@ -49,7 +49,7 @@
49
49
  #ifndef NK_CAST_NEON_H
50
50
  #define NK_CAST_NEON_H
51
51
 
52
- #if NK_TARGET_ARM_
52
+ #if NK_TARGET_ARM64_
53
53
  #if NK_TARGET_NEON
54
54
 
55
55
  #include "numkong/types.h"
@@ -1155,5 +1155,5 @@ NK_PUBLIC void nk_cast_neon(void const *from, nk_dtype_t from_type, nk_size_t n,
1155
1155
  #endif
1156
1156
 
1157
1157
  #endif // NK_TARGET_NEON
1158
- #endif // NK_TARGET_ARM_
1158
+ #endif // NK_TARGET_ARM64_
1159
1159
  #endif // NK_CAST_NEON_H