numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,89 @@
1
+ # Scalar Math Primitives in NumKong
2
+
3
+ NumKong provides single-element math operations — square root, reciprocal square root, fused multiply-add, and saturating integer arithmetic — with per-ISA implementations.
4
+ These primitives serve as building blocks for vectorized kernels: distance finalizers call `nk_f32_rsqrt` for angular normalization, packing routines call `nk_f32_sqrt` for norm computation.
5
+ Ordering functions (`nk_f16_order`, `nk_bf16_order`, `nk_e4m3_order`) convert floating-point bit patterns to integers that sort in the same order as the original floats.
6
+
7
+ Reciprocal square root:
8
+
9
+ ```math
10
+ \text{rsqrt}(x) = \frac{1}{\sqrt{x}}
11
+ ```
12
+
13
+ Fused multiply-add:
14
+
15
+ ```math
16
+ \text{fma}(a, b, c) = a \cdot b + c
17
+ ```
18
+
19
+ Saturating addition:
20
+
21
+ ```math
22
+ \text{sat\_add}(a, b) = \text{clamp}(a + b, \text{T\_MIN}, \text{T\_MAX})
23
+ ```
24
+
25
+ Reformulating as Python pseudocode:
26
+
27
+ ```python
28
+ import numpy as np
29
+
30
+ def rsqrt(x: float) -> float:
31
+ return 1.0 / np.sqrt(x)
32
+
33
+ def fma(a: float, b: float, c: float) -> float:
34
+ return a * b + c
35
+
36
+ def saturating_add(a: int, b: int, bits: int, signed: bool) -> int:
37
+ lo, hi = (-(1 << (bits-1)), (1 << (bits-1)) - 1) if signed else (0, (1 << bits) - 1)
38
+ return max(lo, min(a + b, hi))
39
+ ```
40
+
41
+ ## Input & Output Types
42
+
43
+ | Input Type | Output Type | Description |
44
+ | ---------- | ----------- | ----------------------------------------------- |
45
+ | `f64` | `f64` | sqrt, rsqrt, fma for 64-bit doubles |
46
+ | `f32` | `f32` | sqrt, rsqrt, fma for 32-bit floats |
47
+ | `f16` | `f16` | sqrt, rsqrt, fma for 16-bit halfs |
48
+ | `i8` | `i8` | Saturating add and multiply |
49
+ | `u8` | `u8` | Saturating add and multiply |
50
+ | `i16` | `i16` | Saturating add and multiply |
51
+ | `u16` | `u16` | Saturating add and multiply |
52
+ | `i32` | `i32` | Saturating add and multiply |
53
+ | `u32` | `u32` | Saturating add and multiply |
54
+ | `i64` | `i64` | Saturating add and multiply |
55
+ | `u64` | `u64` | Saturating add and multiply |
56
+ | `i4x2` | `i4x2` | Saturating add for packed signed nibble pairs |
57
+ | `u4x2` | `u4x2` | Saturating add for packed unsigned nibble pairs |
58
+ | `f16` | `u16` | Ordering: bit pattern to sortable integer |
59
+ | `bf16` | `u16` | Ordering: bit pattern to sortable integer |
60
+ | `e4m3` | `u8` | Ordering: bit pattern to sortable integer |
61
+ | `e5m2` | `u8` | Ordering: bit pattern to sortable integer |
62
+ | `e2m3` | `u8` | Ordering: bit pattern to sortable integer |
63
+ | `e3m2` | `u8` | Ordering: bit pattern to sortable integer |
64
+
65
+ ## Optimizations
66
+
67
+ ### Quake 3 Fast Inverse Square Root
68
+
69
+ `nk_f32_rsqrt_serial` uses the classic bit-manipulation trick: reinterpret Float32 bits as Int32, compute `0x5F375A86 - (bits >> 1)`, reinterpret back to Float32, then refine with 3 Newton-Raphson iterations reaching ~34.9 correct bits.
70
+ Each Newton-Raphson iteration: `y = y * (1.5f - 0.5f * x * y * y)` — 4 multiplies and 1 subtract (3 multiplies when the loop-invariant `0.5f * x` is hoisted), forming a short dependent chain per iteration.
71
+ `nk_f32_rsqrt_haswell` replaces this with the hardware `RSQRTSS` estimate ($1.5 \times 2^{-12}$ maximum relative error, ~5cy latency) plus one Newton-Raphson refinement (~22-24 correct bits).
72
+ `nk_f64_rsqrt_serial` uses the Float64 magic constant `0x5FE6EB50C7B537A9` with 4 iterations for 52-bit mantissa coverage.
73
+
74
+ ### Dekker Error-Free Multiplication for FMA
75
+
76
+ `nk_f32_fma_serial` emulates fused multiply-add on platforms without hardware FMA using Dekker's algorithm: splits each operand into high and low halves via `a_hi = (a * 134217729.0f) - ((a * 134217729.0f) - a)`, then computes the exact product error term.
77
+ The Veltkamp splitting constant $2^{s} + 1$ splits a $p$-bit mantissa into a $(p-s)$-bit high part and an $s$-bit low part that multiply without rounding; $134217729 = 2^{27} + 1$ is the double-precision ($p = 53$) splitter, while a 24-bit Float32 mantissa uses $2^{12} + 1 = 4097$.
78
+ `nk_f32_fma_haswell` uses hardware `VFMADD231SS` — a single instruction with ~5cy latency and 0.5cy reciprocal throughput, exact to the last bit.
79
+
80
+ ### Float-to-Integer Ordering
81
+
82
+ `nk_f16_order_serial`, `nk_bf16_order_serial`, `nk_e4m3_order_serial` convert floating-point bit patterns to unsigned integers that preserve the total order.
83
+ Positive floats are already ordered by their bit patterns; negative floats need bit inversion: `if (bits & sign_bit) bits = ~bits; else bits ^= sign_bit`.
84
+ This enables integer comparison instructions (`VPCMPUD`) for floating-point sorting without branching — used by `nk_reduce_minmax_*` for Float8 and sub-32-bit types that lack native SIMD comparison.
85
+
86
+ ## Performance
87
+
88
+ Scalar primitives operate on single elements and are not independently benchmarked.
89
+ Their performance is captured within the vector kernels that call them.
@@ -0,0 +1,113 @@
1
+ /**
2
+ * @brief SIMD-accelerated Scalar Math Helpers for Haswell.
3
+ * @file include/numkong/scalar/haswell.h
4
+ * @author Ash Vardanian
5
+ * @date March 1, 2026
6
+ *
7
+ * @sa include/numkong/scalar.h
8
+ *
9
+ * @section scalars_haswell_instructions Key AVX2/FMA Scalar Instructions
10
+ *
11
+ * Intrinsic Instruction Latency Throughput Ports
12
+ * _mm_sqrt_ps VSQRTPS (XMM, XMM) 11cy 7cy p0
13
+ * _mm_sqrt_pd VSQRTPD (XMM, XMM) 16cy 12cy p0
14
+ * _mm_fmadd_ss VFMADD (XMM, XMM, XMM) 5cy 0.5/cy p01
15
+ * _mm_fmadd_sd VFMADD (XMM, XMM, XMM) 5cy 0.5/cy p01
16
+ * _mm_cvtps_ph VCVTPS2PH (XMM, XMM, I8) 4cy 1/cy p01+p5
17
+ * _mm_cvtph_ps VCVTPH2PS (XMM, XMM) 5cy 1/cy p01
18
+ */
19
+ #ifndef NK_SCALAR_HASWELL_H
20
+ #define NK_SCALAR_HASWELL_H
21
+
22
+ #if NK_TARGET_X86_
23
+ #if NK_TARGET_HASWELL
24
+
25
+ #include "numkong/types.h"
26
+
27
+ #if defined(__cplusplus)
28
+ extern "C" {
29
+ #endif
30
+
31
+ #if defined(__clang__)
32
+ #pragma clang attribute push(__attribute__((target("avx2,f16c,fma,bmi,bmi2"))), apply_to = function)
33
+ #elif defined(__GNUC__)
34
+ #pragma GCC push_options
35
+ #pragma GCC target("avx2", "f16c", "fma", "bmi", "bmi2")
36
+ #endif
37
+
38
/** @brief Exact Float32 square root of the lane-0 scalar via `VSQRTPS` (see the latency table above). */
NK_PUBLIC nk_f32_t nk_f32_sqrt_haswell(nk_f32_t x) { return _mm_cvtss_f32(_mm_sqrt_ps(_mm_set_ss(x))); }
/** @brief Exact Float64 square root of the lane-0 scalar via `VSQRTPD`. */
NK_PUBLIC nk_f64_t nk_f64_sqrt_haswell(nk_f64_t x) { return _mm_cvtsd_f64(_mm_sqrt_pd(_mm_set_sd(x))); }
40
+ NK_PUBLIC nk_f32_t nk_f32_rsqrt_haswell(nk_f32_t x) {
41
+ __m128 x_f32x4 = _mm_set_ss(x);
42
+ __m128 estimate_f32x4 = _mm_rsqrt_ss(x_f32x4);
43
+ __m128 refinement_f32x4 = _mm_mul_ss(_mm_mul_ss(x_f32x4, estimate_f32x4), estimate_f32x4);
44
+ refinement_f32x4 = _mm_sub_ss(_mm_set_ss(3.0f), refinement_f32x4);
45
+ return _mm_cvtss_f32(_mm_mul_ss(_mm_mul_ss(_mm_set_ss(0.5f), estimate_f32x4), refinement_f32x4));
46
+ }
47
/** @brief Float64 reciprocal square root via full-precision divide and sqrt — no f64 `RSQRT` estimate exists pre-AVX-512. */
NK_PUBLIC nk_f64_t nk_f64_rsqrt_haswell(nk_f64_t x) { return 1.0 / nk_f64_sqrt_haswell(x); }
48
/** @brief Float32 fused multiply-add `a * b + c` with a single rounding, via hardware `VFMADD`. */
NK_PUBLIC nk_f32_t nk_f32_fma_haswell(nk_f32_t a, nk_f32_t b, nk_f32_t c) {
    return _mm_cvtss_f32(_mm_fmadd_ss(_mm_set_ss(a), _mm_set_ss(b), _mm_set_ss(c)));
}
/** @brief Float64 fused multiply-add `a * b + c` with a single rounding, via hardware `VFMADD`. */
NK_PUBLIC nk_f64_t nk_f64_fma_haswell(nk_f64_t a, nk_f64_t b, nk_f64_t c) {
    return _mm_cvtsd_f64(_mm_fmadd_sd(_mm_set_sd(a), _mm_set_sd(b), _mm_set_sd(c)));
}
54
/**
 *  @brief Float16 square root: widen to Float32 with F16C, take `VSQRTPS`, narrow back
 *  with round-to-nearest. The integer round-trip works because `nk_f16_t` is carried
 *  as a 16-bit bit pattern (presumably — confirm against types.h).
 */
NK_PUBLIC nk_f16_t nk_f16_sqrt_haswell(nk_f16_t x) {
    __m128 x_f32x4 = _mm_cvtph_ps(_mm_cvtsi32_si128(x));
    return (nk_f16_t)_mm_cvtsi128_si32(_mm_cvtps_ph(_mm_sqrt_ps(x_f32x4), _MM_FROUND_TO_NEAREST_INT));
}
58
/**
 *  @brief Float16 reciprocal square root: widen to Float32, take the `RSQRTSS`
 *  estimate, refine with one Newton-Raphson step `y' = 0.5 * y * (3 - x * y * y)`,
 *  then narrow back to Float16 with round-to-nearest. One refinement is ample for
 *  the 10-bit Float16 mantissa.
 */
NK_PUBLIC nk_f16_t nk_f16_rsqrt_haswell(nk_f16_t x) {
    __m128 x_f32x4 = _mm_cvtph_ps(_mm_cvtsi32_si128(x));
    __m128 estimate_f32x4 = _mm_rsqrt_ss(x_f32x4);
    // x * y * y — the residual of the current estimate.
    __m128 refinement_f32x4 = _mm_mul_ss(_mm_mul_ss(x_f32x4, estimate_f32x4), estimate_f32x4);
    refinement_f32x4 = _mm_sub_ss(_mm_set_ss(3.0f), refinement_f32x4);
    estimate_f32x4 = _mm_mul_ss(_mm_mul_ss(_mm_set_ss(0.5f), estimate_f32x4), refinement_f32x4);
    return (nk_f16_t)_mm_cvtsi128_si32(_mm_cvtps_ph(estimate_f32x4, _MM_FROUND_TO_NEAREST_INT));
}
66
/**
 *  @brief Float16 fused multiply-add `a * b + c`: widen all three operands to Float32,
 *  use the hardware `VFMADD` (single rounding in f32), then narrow with round-to-nearest.
 */
NK_PUBLIC nk_f16_t nk_f16_fma_haswell(nk_f16_t a, nk_f16_t b, nk_f16_t c) {
    __m128 a_f32x4 = _mm_cvtph_ps(_mm_cvtsi32_si128(a));
    __m128 b_f32x4 = _mm_cvtph_ps(_mm_cvtsi32_si128(b));
    __m128 c_f32x4 = _mm_cvtph_ps(_mm_cvtsi32_si128(c));
    return (nk_f16_t)_mm_cvtsi128_si32(
        _mm_cvtps_ph(_mm_fmadd_ss(a_f32x4, b_f32x4, c_f32x4), _MM_FROUND_TO_NEAREST_INT));
}
73
+ NK_PUBLIC nk_u8_t nk_u8_saturating_add_haswell(nk_u8_t a, nk_u8_t b) {
74
+ return (nk_u8_t)_mm_cvtsi128_si32(_mm_adds_epu8(_mm_cvtsi32_si128(a), _mm_cvtsi32_si128(b)));
75
+ }
76
+ NK_PUBLIC nk_i8_t nk_i8_saturating_add_haswell(nk_i8_t a, nk_i8_t b) {
77
+ return (nk_i8_t)_mm_cvtsi128_si32(_mm_adds_epi8(_mm_cvtsi32_si128(a), _mm_cvtsi32_si128(b)));
78
+ }
79
+ NK_PUBLIC nk_u16_t nk_u16_saturating_add_haswell(nk_u16_t a, nk_u16_t b) {
80
+ return (nk_u16_t)_mm_cvtsi128_si32(_mm_adds_epu16(_mm_cvtsi32_si128(a), _mm_cvtsi32_si128(b)));
81
+ }
82
+ NK_PUBLIC nk_i16_t nk_i16_saturating_add_haswell(nk_i16_t a, nk_i16_t b) {
83
+ return (nk_i16_t)_mm_cvtsi128_si32(_mm_adds_epi16(_mm_cvtsi32_si128(a), _mm_cvtsi32_si128(b)));
84
+ }
85
/**
 *  @brief Saturating unsigned 64-bit multiply: the BMI2 `MULX` yields the full
 *  128-bit product; any non-zero high half means overflow, so clamp to UINT64_MAX.
 */
NK_PUBLIC nk_u64_t nk_u64_saturating_mul_haswell(nk_u64_t a, nk_u64_t b) {
    unsigned long long high;
    unsigned long long low = _mulx_u64(a, b, &high);
    return high ? 18446744073709551615ull : low;
}
90
+ NK_PUBLIC nk_i64_t nk_i64_saturating_mul_haswell(nk_i64_t a, nk_i64_t b) {
91
+ int sign = (a < 0) ^ (b < 0);
92
+ nk_u64_t abs_a = a < 0 ? -(nk_u64_t)a : (nk_u64_t)a;
93
+ nk_u64_t abs_b = b < 0 ? -(nk_u64_t)b : (nk_u64_t)b;
94
+ unsigned long long high;
95
+ unsigned long long low = _mulx_u64(abs_a, abs_b, &high);
96
+ if (high || (sign && low > 9223372036854775808ull) || (!sign && low > 9223372036854775807ull))
97
+ return sign ? (-9223372036854775807ll - 1ll) : 9223372036854775807ll;
98
+ return sign ? -(nk_i64_t)low : (nk_i64_t)low;
99
+ }
100
+
101
+ #if defined(__clang__)
102
+ #pragma clang attribute pop
103
+ #elif defined(__GNUC__)
104
+ #pragma GCC pop_options
105
+ #endif
106
+
107
+ #if defined(__cplusplus)
108
+ } // extern "C"
109
+ #endif
110
+
111
+ #endif // NK_TARGET_HASWELL
112
+ #endif // NK_TARGET_X86_
113
+ #endif // NK_SCALAR_HASWELL_H
@@ -0,0 +1,122 @@
1
+ /**
2
+ * @brief SIMD-accelerated Scalar Math Helpers for NEON.
3
+ * @file include/numkong/scalar/neon.h
4
+ * @author Ash Vardanian
5
+ * @date March 1, 2026
6
+ *
7
+ * @sa include/numkong/scalar.h
8
+ *
9
+ * @section scalars_neon_instructions Key NEON Scalar Instructions
10
+ *
11
+ * Intrinsic Instruction Latency Throughput
12
+ * vsqrt_f32 FSQRT (S) 9-12cy 0.25/cy
13
+ * vsqrt_f64 FSQRT (D) 12-18cy 0.25/cy
14
+ * vfmas_f32 FMADD (S, S, S, S) 4cy 2/cy
15
+ * vfmad_f64 FMADD (D, D, D, D) 4cy 2/cy
16
+ * vqaddb_u8 UQADD (B) 1cy 4/cy
17
+ * vqaddb_s8 SQADD (B) 1cy 4/cy
18
+ */
19
+ #ifndef NK_SCALAR_NEON_H
20
+ #define NK_SCALAR_NEON_H
21
+
22
+ #if NK_TARGET_ARM_
23
+ #if NK_TARGET_NEON
24
+
25
+ #include "numkong/types.h"
26
+
27
+ #if defined(__cplusplus)
28
+ extern "C" {
29
+ #endif
30
+
31
+ #if defined(__clang__)
32
+ #pragma clang attribute push(__attribute__((target("arch=armv8-a+simd"))), apply_to = function)
33
+ #elif defined(__GNUC__)
34
+ #pragma GCC push_options
35
+ #pragma GCC target("arch=armv8-a+simd")
36
+ #endif
37
+
38
+ NK_PUBLIC nk_f32_t nk_f32_sqrt_neon(nk_f32_t x) { return vget_lane_f32(vsqrt_f32(vdup_n_f32(x)), 0); }
39
+ NK_PUBLIC nk_f64_t nk_f64_sqrt_neon(nk_f64_t x) { return vget_lane_f64(vsqrt_f64(vdup_n_f64(x)), 0); }
40
+ NK_PUBLIC nk_f32_t nk_f32_rsqrt_neon(nk_f32_t x) {
41
+ nk_f32_t r = vrsqrtes_f32(x);
42
+ r *= vrsqrtss_f32(x * r, r);
43
+ r *= vrsqrtss_f32(x * r, r);
44
+ return r;
45
+ }
46
+ NK_PUBLIC nk_f64_t nk_f64_rsqrt_neon(nk_f64_t x) {
47
+ nk_f64_t r = vrsqrted_f64(x);
48
+ r *= vrsqrtsd_f64(x * r, r);
49
+ r *= vrsqrtsd_f64(x * r, r);
50
+ r *= vrsqrtsd_f64(x * r, r);
51
+ return r;
52
+ }
53
+ NK_PUBLIC nk_f32_t nk_f32_fma_neon(nk_f32_t a, nk_f32_t b, nk_f32_t c) {
54
+ // MSVC lacks both GCC inline asm and scalar ACLE FMA intrinsics (vfmas_f32/vfmad_f64).
55
+ // GCC/Clang: use inline asm for scalar FMADD.
56
+ // MSVC: use vector FMA + lane extract (compiler may optimize to scalar FMADD).
57
+ #if defined(_MSC_VER)
58
+ return vget_lane_f32(vfma_f32(vdup_n_f32(c), vdup_n_f32(a), vdup_n_f32(b)), 0);
59
+ #else
60
+ nk_f32_t r;
61
+ __asm__("fmadd %s0, %s1, %s2, %s3" : "=w"(r) : "w"(a), "w"(b), "w"(c));
62
+ return r;
63
+ #endif
64
+ }
65
+ NK_PUBLIC nk_f64_t nk_f64_fma_neon(nk_f64_t a, nk_f64_t b, nk_f64_t c) {
66
+ // MSVC lacks both GCC inline asm and scalar ACLE FMA intrinsics (vfmas_f32/vfmad_f64).
67
+ // GCC/Clang: use inline asm for scalar FMADD.
68
+ // MSVC: use vector FMA + lane extract (compiler may optimize to scalar FMADD).
69
+ #if defined(_MSC_VER)
70
+ return vget_lane_f64(vfma_f64(vdup_n_f64(c), vdup_n_f64(a), vdup_n_f64(b)), 0);
71
+ #else
72
+ nk_f64_t r;
73
+ __asm__("fmadd %d0, %d1, %d2, %d3" : "=w"(r) : "w"(a), "w"(b), "w"(c));
74
+ return r;
75
+ #endif
76
+ }
77
+
78
+ NK_PUBLIC nk_u8_t nk_u8_saturating_add_neon(nk_u8_t a, nk_u8_t b) { return vqaddb_u8(a, b); }
79
+ NK_PUBLIC nk_i8_t nk_i8_saturating_add_neon(nk_i8_t a, nk_i8_t b) { return vqaddb_s8(a, b); }
80
+ NK_PUBLIC nk_u16_t nk_u16_saturating_add_neon(nk_u16_t a, nk_u16_t b) { return vqaddh_u16(a, b); }
81
+ NK_PUBLIC nk_i16_t nk_i16_saturating_add_neon(nk_i16_t a, nk_i16_t b) { return vqaddh_s16(a, b); }
82
+ NK_PUBLIC nk_u32_t nk_u32_saturating_add_neon(nk_u32_t a, nk_u32_t b) { return vqadds_u32(a, b); }
83
+ NK_PUBLIC nk_i32_t nk_i32_saturating_add_neon(nk_i32_t a, nk_i32_t b) { return vqadds_s32(a, b); }
84
+ NK_PUBLIC nk_u64_t nk_u64_saturating_add_neon(nk_u64_t a, nk_u64_t b) { return vqaddd_u64(a, b); }
85
+ NK_PUBLIC nk_i64_t nk_i64_saturating_add_neon(nk_i64_t a, nk_i64_t b) { return vqaddd_s64(a, b); }
86
+
87
+ NK_INTERNAL nk_u64_t nk_u64_mulhigh_neon_(nk_u64_t a, nk_u64_t b) {
88
+ #if defined(_MSC_VER)
89
+ return __umulh(a, b);
90
+ #else
91
+ nk_u64_t high;
92
+ __asm__("umulh %0, %1, %2" : "=r"(high) : "r"(a), "r"(b));
93
+ return high;
94
+ #endif
95
+ }
96
+ NK_PUBLIC nk_u64_t nk_u64_saturating_mul_neon(nk_u64_t a, nk_u64_t b) {
97
+ return nk_u64_mulhigh_neon_(a, b) ? 18446744073709551615ull : (a * b);
98
+ }
99
+ NK_PUBLIC nk_i64_t nk_i64_saturating_mul_neon(nk_i64_t a, nk_i64_t b) {
100
+ int sign = (a < 0) ^ (b < 0);
101
+ nk_u64_t abs_a = a < 0 ? -(nk_u64_t)a : (nk_u64_t)a;
102
+ nk_u64_t abs_b = b < 0 ? -(nk_u64_t)b : (nk_u64_t)b;
103
+ nk_u64_t high = nk_u64_mulhigh_neon_(abs_a, abs_b);
104
+ nk_u64_t low = abs_a * abs_b;
105
+ if (high || (sign && low > 9223372036854775808ull) || (!sign && low > 9223372036854775807ull))
106
+ return sign ? (-9223372036854775807ll - 1ll) : 9223372036854775807ll;
107
+ return sign ? -(nk_i64_t)low : (nk_i64_t)low;
108
+ }
109
+
110
+ #if defined(__clang__)
111
+ #pragma clang attribute pop
112
+ #elif defined(__GNUC__)
113
+ #pragma GCC pop_options
114
+ #endif
115
+
116
+ #if defined(__cplusplus)
117
+ } // extern "C"
118
+ #endif
119
+
120
+ #endif // NK_TARGET_NEON
121
+ #endif // NK_TARGET_ARM_
122
+ #endif // NK_SCALAR_NEON_H
@@ -0,0 +1,70 @@
1
+ /**
2
+ * @brief SIMD-accelerated Scalar Math Helpers for NEON FP16 (FEAT_FP16).
3
+ * @file include/numkong/scalar/neonhalf.h
4
+ * @author Ash Vardanian
5
+ * @date March 1, 2026
6
+ *
7
+ * @sa include/numkong/scalar.h
8
+ *
9
+ * ARMv8.2-A FEAT_FP16 provides native scalar f16 sqrt, rsqrt estimate, and fma.
10
+ * `vrsqrte_f16` gives ~4-bit estimate; 2 Newton-Raphson steps refine to ~16 bits,
11
+ * exceeding f16's 10-bit mantissa precision.
12
+ */
13
+ #ifndef NK_SCALAR_NEONHALF_H
14
+ #define NK_SCALAR_NEONHALF_H
15
+
16
+ #if NK_TARGET_ARM_
17
+ #if NK_TARGET_NEONHALF
18
+
19
+ #include "numkong/types.h"
20
+
21
+ #if defined(__cplusplus)
22
+ extern "C" {
23
+ #endif
24
+
25
+ #if defined(__clang__)
26
+ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd+fp16"))), apply_to = function)
27
+ #elif defined(__GNUC__)
28
+ #pragma GCC push_options
29
+ #pragma GCC target("arch=armv8.2-a+simd+fp16")
30
+ #endif
31
+
32
/**
 *  @brief Exact Float16 square root via native FEAT_FP16 `FSQRT`.
 *  Loads the half through a pointer cast to the ARM SIMD f16 type (avoids any
 *  implicit f16 conversion), computes on lane 0, stores lane 0 back.
 */
NK_PUBLIC nk_f16_t nk_f16_sqrt_neonhalf(nk_f16_t x) {
    float16x4_t x_f16x4 = vld1_dup_f16((nk_f16_for_arm_simd_t const *)&x);
    x_f16x4 = vsqrt_f16(x_f16x4);
    nk_f16_t result;
    vst1_lane_f16((nk_f16_for_arm_simd_t *)&result, x_f16x4, 0);
    return result;
}
38
+ }
39
/**
 *  @brief Float16 reciprocal square root: `FRSQRTE` estimate refined by two
 *  Newton-Raphson steps (per the file header, ~16 bits — beyond f16's 10-bit mantissa).
 *  `vrsqrts_f16(x * e, e)` is the `FRSQRTS` step factor `(3 - (x * e) * e) / 2`.
 */
NK_PUBLIC nk_f16_t nk_f16_rsqrt_neonhalf(nk_f16_t x) {
    float16x4_t x_f16x4 = vld1_dup_f16((nk_f16_for_arm_simd_t const *)&x);
    float16x4_t estimate_f16x4 = vrsqrte_f16(x_f16x4);
    estimate_f16x4 = vmul_f16(estimate_f16x4, vrsqrts_f16(vmul_f16(x_f16x4, estimate_f16x4), estimate_f16x4));
    estimate_f16x4 = vmul_f16(estimate_f16x4, vrsqrts_f16(vmul_f16(x_f16x4, estimate_f16x4), estimate_f16x4));
    nk_f16_t result;
    vst1_lane_f16((nk_f16_for_arm_simd_t *)&result, estimate_f16x4, 0);
    return result;
}
47
+ }
48
/**
 *  @brief Float16 fused multiply-add `a * b + c` with a single rounding, via native
 *  FEAT_FP16 `FMLA` — no widening to Float32 needed. Note `vfma_f16(c, a, b)`
 *  takes the addend first.
 */
NK_PUBLIC nk_f16_t nk_f16_fma_neonhalf(nk_f16_t a, nk_f16_t b, nk_f16_t c) {
    float16x4_t a_f16x4 = vld1_dup_f16((nk_f16_for_arm_simd_t const *)&a);
    float16x4_t b_f16x4 = vld1_dup_f16((nk_f16_for_arm_simd_t const *)&b);
    float16x4_t c_f16x4 = vld1_dup_f16((nk_f16_for_arm_simd_t const *)&c);
    c_f16x4 = vfma_f16(c_f16x4, a_f16x4, b_f16x4);
    nk_f16_t result;
    vst1_lane_f16((nk_f16_for_arm_simd_t *)&result, c_f16x4, 0);
    return result;
}
56
+ }
57
+
58
+ #if defined(__clang__)
59
+ #pragma clang attribute pop
60
+ #elif defined(__GNUC__)
61
+ #pragma GCC pop_options
62
+ #endif
63
+
64
+ #if defined(__cplusplus)
65
+ } // extern "C"
66
+ #endif
67
+
68
+ #endif // NK_TARGET_NEONHALF
69
+ #endif // NK_TARGET_ARM_
70
+ #endif // NK_SCALAR_NEONHALF_H
@@ -0,0 +1,211 @@
1
+ /**
2
+ * @brief SIMD-accelerated Scalar Math Helpers for RISC-V.
3
+ * @file include/numkong/scalar/rvv.h
4
+ * @author Ash Vardanian
5
+ * @date March 1, 2026
6
+ *
7
+ * @sa include/numkong/scalar.h
8
+ *
9
+ * RVV scalar helpers use vector instructions with VL=1 for hardware sqrt/rsqrt.
10
+ * `vfrsqrt7` provides 7-bit mantissa precision; Newton-Raphson refines to full precision.
11
+ */
12
+ #ifndef NK_SCALAR_RVV_H
13
+ #define NK_SCALAR_RVV_H
14
+
15
+ #if NK_TARGET_RISCV_
16
+ #if NK_TARGET_RVV
17
+
18
+ #include "numkong/types.h"
19
+
20
+ #if defined(__clang__)
21
+ #pragma clang attribute push(__attribute__((target("arch=+v"))), apply_to = function)
22
+ #elif defined(__GNUC__)
23
+ #pragma GCC push_options
24
+ #pragma GCC target("arch=+v")
25
+ #endif
26
+
27
+ #if defined(__cplusplus)
28
+ extern "C" {
29
+ #endif
30
+
31
+ NK_PUBLIC nk_f32_t nk_f32_rsqrt_rvv(nk_f32_t number) {
32
+ vfloat32m1_t x_f32m1 = __riscv_vfmv_s_f_f32m1(number, 1);
33
+ vfloat32m1_t estimate_f32m1 = __riscv_vfrsqrt7_v_f32m1(x_f32m1, 1);
34
+ vfloat32m1_t half_f32m1 = __riscv_vfmv_s_f_f32m1(0.5f, 1);
35
+ vfloat32m1_t three_half_f32m1 = __riscv_vfmv_s_f_f32m1(1.5f, 1);
36
+ vfloat32m1_t half_x_f32m1 = __riscv_vfmul_vv_f32m1(half_f32m1, x_f32m1, 1);
37
+ // Iteration 1
38
+ vfloat32m1_t estimate_sq_f32m1 = __riscv_vfmul_vv_f32m1(estimate_f32m1, estimate_f32m1, 1);
39
+ vfloat32m1_t correction_f32m1 = __riscv_vfmul_vv_f32m1(half_x_f32m1, estimate_sq_f32m1, 1);
40
+ vfloat32m1_t factor_f32m1 = __riscv_vfsub_vv_f32m1(three_half_f32m1, correction_f32m1, 1);
41
+ estimate_f32m1 = __riscv_vfmul_vv_f32m1(estimate_f32m1, factor_f32m1, 1);
42
+ // Iteration 2
43
+ estimate_sq_f32m1 = __riscv_vfmul_vv_f32m1(estimate_f32m1, estimate_f32m1, 1);
44
+ correction_f32m1 = __riscv_vfmul_vv_f32m1(half_x_f32m1, estimate_sq_f32m1, 1);
45
+ factor_f32m1 = __riscv_vfsub_vv_f32m1(three_half_f32m1, correction_f32m1, 1);
46
+ estimate_f32m1 = __riscv_vfmul_vv_f32m1(estimate_f32m1, factor_f32m1, 1);
47
+ return __riscv_vfmv_f_s_f32m1_f32(estimate_f32m1);
48
+ }
49
+
50
+ NK_PUBLIC nk_f32_t nk_f32_sqrt_rvv(nk_f32_t number) {
51
+ vfloat32m1_t x_f32m1 = __riscv_vfmv_s_f_f32m1(number, 1);
52
+ return __riscv_vfmv_f_s_f32m1_f32(__riscv_vfsqrt_v_f32m1(x_f32m1, 1));
53
+ }
54
+
55
+ NK_PUBLIC nk_f64_t nk_f64_rsqrt_rvv(nk_f64_t number) {
56
+ vfloat64m1_t x_f64m1 = __riscv_vfmv_s_f_f64m1(number, 1);
57
+ vfloat64m1_t estimate_f64m1 = __riscv_vfrsqrt7_v_f64m1(x_f64m1, 1);
58
+ vfloat64m1_t half_f64m1 = __riscv_vfmv_s_f_f64m1(0.5, 1);
59
+ vfloat64m1_t three_half_f64m1 = __riscv_vfmv_s_f_f64m1(1.5, 1);
60
+ vfloat64m1_t half_x_f64m1 = __riscv_vfmul_vv_f64m1(half_f64m1, x_f64m1, 1);
61
+ // Iteration 1
62
+ vfloat64m1_t estimate_sq_f64m1 = __riscv_vfmul_vv_f64m1(estimate_f64m1, estimate_f64m1, 1);
63
+ vfloat64m1_t correction_f64m1 = __riscv_vfmul_vv_f64m1(half_x_f64m1, estimate_sq_f64m1, 1);
64
+ vfloat64m1_t factor_f64m1 = __riscv_vfsub_vv_f64m1(three_half_f64m1, correction_f64m1, 1);
65
+ estimate_f64m1 = __riscv_vfmul_vv_f64m1(estimate_f64m1, factor_f64m1, 1);
66
+ // Iteration 2
67
+ estimate_sq_f64m1 = __riscv_vfmul_vv_f64m1(estimate_f64m1, estimate_f64m1, 1);
68
+ correction_f64m1 = __riscv_vfmul_vv_f64m1(half_x_f64m1, estimate_sq_f64m1, 1);
69
+ factor_f64m1 = __riscv_vfsub_vv_f64m1(three_half_f64m1, correction_f64m1, 1);
70
+ estimate_f64m1 = __riscv_vfmul_vv_f64m1(estimate_f64m1, factor_f64m1, 1);
71
+ // Iteration 3
72
+ estimate_sq_f64m1 = __riscv_vfmul_vv_f64m1(estimate_f64m1, estimate_f64m1, 1);
73
+ correction_f64m1 = __riscv_vfmul_vv_f64m1(half_x_f64m1, estimate_sq_f64m1, 1);
74
+ factor_f64m1 = __riscv_vfsub_vv_f64m1(three_half_f64m1, correction_f64m1, 1);
75
+ estimate_f64m1 = __riscv_vfmul_vv_f64m1(estimate_f64m1, factor_f64m1, 1);
76
+ return __riscv_vfmv_f_s_f64m1_f64(estimate_f64m1);
77
+ }
78
+
79
+ NK_PUBLIC nk_f64_t nk_f64_sqrt_rvv(nk_f64_t number) {
80
+ vfloat64m1_t x_f64m1 = __riscv_vfmv_s_f_f64m1(number, 1);
81
+ return __riscv_vfmv_f_s_f64m1_f64(__riscv_vfsqrt_v_f64m1(x_f64m1, 1));
82
+ }
83
+
84
+ NK_PUBLIC nk_f32_t nk_f32_fma_rvv(nk_f32_t a, nk_f32_t b, nk_f32_t c) {
85
+ vfloat32m1_t a_f32m1 = __riscv_vfmv_s_f_f32m1(a, 1);
86
+ vfloat32m1_t c_f32m1 = __riscv_vfmv_s_f_f32m1(c, 1);
87
+ return __riscv_vfmv_f_s_f32m1_f32(__riscv_vfmacc_vf_f32m1(c_f32m1, b, a_f32m1, 1));
88
+ }
89
+
90
+ NK_PUBLIC nk_f64_t nk_f64_fma_rvv(nk_f64_t a, nk_f64_t b, nk_f64_t c) {
91
+ vfloat64m1_t a_f64m1 = __riscv_vfmv_s_f_f64m1(a, 1);
92
+ vfloat64m1_t c_f64m1 = __riscv_vfmv_s_f_f64m1(c, 1);
93
+ return __riscv_vfmv_f_s_f64m1_f64(__riscv_vfmacc_vf_f64m1(c_f64m1, b, a_f64m1, 1));
94
+ }
95
+
96
+ NK_PUBLIC nk_u8_t nk_u8_saturating_add_rvv(nk_u8_t a, nk_u8_t b) {
97
+ vuint8m1_t a_u8m1 = __riscv_vmv_v_x_u8m1(a, 1);
98
+ vuint8m1_t b_u8m1 = __riscv_vmv_v_x_u8m1(b, 1);
99
+ return __riscv_vmv_x_s_u8m1_u8(__riscv_vsaddu_vv_u8m1(a_u8m1, b_u8m1, 1));
100
+ }
101
+
102
+ NK_PUBLIC nk_i8_t nk_i8_saturating_add_rvv(nk_i8_t a, nk_i8_t b) {
103
+ vint8m1_t a_i8m1 = __riscv_vmv_v_x_i8m1(a, 1);
104
+ vint8m1_t b_i8m1 = __riscv_vmv_v_x_i8m1(b, 1);
105
+ return __riscv_vmv_x_s_i8m1_i8(__riscv_vsadd_vv_i8m1(a_i8m1, b_i8m1, 1));
106
+ }
107
+
108
+ NK_PUBLIC nk_u16_t nk_u16_saturating_add_rvv(nk_u16_t a, nk_u16_t b) {
109
+ vuint16m1_t a_u16m1 = __riscv_vmv_v_x_u16m1(a, 1);
110
+ vuint16m1_t b_u16m1 = __riscv_vmv_v_x_u16m1(b, 1);
111
+ return __riscv_vmv_x_s_u16m1_u16(__riscv_vsaddu_vv_u16m1(a_u16m1, b_u16m1, 1));
112
+ }
113
+
114
+ NK_PUBLIC nk_i16_t nk_i16_saturating_add_rvv(nk_i16_t a, nk_i16_t b) {
115
+ vint16m1_t a_i16m1 = __riscv_vmv_v_x_i16m1(a, 1);
116
+ vint16m1_t b_i16m1 = __riscv_vmv_v_x_i16m1(b, 1);
117
+ return __riscv_vmv_x_s_i16m1_i16(__riscv_vsadd_vv_i16m1(a_i16m1, b_i16m1, 1));
118
+ }
119
+
120
+ NK_PUBLIC nk_u32_t nk_u32_saturating_add_rvv(nk_u32_t a, nk_u32_t b) {
121
+ vuint32m1_t a_u32m1 = __riscv_vmv_v_x_u32m1(a, 1);
122
+ vuint32m1_t b_u32m1 = __riscv_vmv_v_x_u32m1(b, 1);
123
+ return __riscv_vmv_x_s_u32m1_u32(__riscv_vsaddu_vv_u32m1(a_u32m1, b_u32m1, 1));
124
+ }
125
+
126
+ NK_PUBLIC nk_i32_t nk_i32_saturating_add_rvv(nk_i32_t a, nk_i32_t b) {
127
+ vint32m1_t a_i32m1 = __riscv_vmv_v_x_i32m1(a, 1);
128
+ vint32m1_t b_i32m1 = __riscv_vmv_v_x_i32m1(b, 1);
129
+ return __riscv_vmv_x_s_i32m1_i32(__riscv_vsadd_vv_i32m1(a_i32m1, b_i32m1, 1));
130
+ }
131
+
132
+ NK_PUBLIC nk_u64_t nk_u64_saturating_add_rvv(nk_u64_t a, nk_u64_t b) {
133
+ vuint64m1_t a_u64m1 = __riscv_vmv_v_x_u64m1(a, 1);
134
+ vuint64m1_t b_u64m1 = __riscv_vmv_v_x_u64m1(b, 1);
135
+ return __riscv_vmv_x_s_u64m1_u64(__riscv_vsaddu_vv_u64m1(a_u64m1, b_u64m1, 1));
136
+ }
137
+
138
+ NK_PUBLIC nk_i64_t nk_i64_saturating_add_rvv(nk_i64_t a, nk_i64_t b) {
139
+ vint64m1_t a_i64m1 = __riscv_vmv_v_x_i64m1(a, 1);
140
+ vint64m1_t b_i64m1 = __riscv_vmv_v_x_i64m1(b, 1);
141
+ return __riscv_vmv_x_s_i64m1_i64(__riscv_vsadd_vv_i64m1(a_i64m1, b_i64m1, 1));
142
+ }
143
+
144
+ NK_PUBLIC nk_u8_t nk_u8_saturating_mul_rvv(nk_u8_t a, nk_u8_t b) {
145
+ vuint8m1_t a_u8m1 = __riscv_vmv_v_x_u8m1(a, 1);
146
+ vuint8m1_t b_u8m1 = __riscv_vmv_v_x_u8m1(b, 1);
147
+ vuint16m2_t product_u16m2 = __riscv_vwmulu_vv_u16m2(a_u8m1, b_u8m1, 1);
148
+ return __riscv_vmv_x_s_u8m1_u8(__riscv_vnclipu_wx_u8m1(product_u16m2, 0, __RISCV_VXRM_RDN, 1));
149
+ }
150
+ NK_PUBLIC nk_i8_t nk_i8_saturating_mul_rvv(nk_i8_t a, nk_i8_t b) {
151
+ vint8m1_t a_i8m1 = __riscv_vmv_v_x_i8m1(a, 1);
152
+ vint8m1_t b_i8m1 = __riscv_vmv_v_x_i8m1(b, 1);
153
+ vint16m2_t product_i16m2 = __riscv_vwmul_vv_i16m2(a_i8m1, b_i8m1, 1);
154
+ return __riscv_vmv_x_s_i8m1_i8(__riscv_vnclip_wx_i8m1(product_i16m2, 0, __RISCV_VXRM_RDN, 1));
155
+ }
156
+ NK_PUBLIC nk_u16_t nk_u16_saturating_mul_rvv(nk_u16_t a, nk_u16_t b) {
157
+ vuint16m1_t a_u16m1 = __riscv_vmv_v_x_u16m1(a, 1);
158
+ vuint16m1_t b_u16m1 = __riscv_vmv_v_x_u16m1(b, 1);
159
+ vuint32m2_t product_u32m2 = __riscv_vwmulu_vv_u32m2(a_u16m1, b_u16m1, 1);
160
+ return __riscv_vmv_x_s_u16m1_u16(__riscv_vnclipu_wx_u16m1(product_u32m2, 0, __RISCV_VXRM_RDN, 1));
161
+ }
162
+ NK_PUBLIC nk_i16_t nk_i16_saturating_mul_rvv(nk_i16_t a, nk_i16_t b) {
163
+ vint16m1_t a_i16m1 = __riscv_vmv_v_x_i16m1(a, 1);
164
+ vint16m1_t b_i16m1 = __riscv_vmv_v_x_i16m1(b, 1);
165
+ vint32m2_t product_i32m2 = __riscv_vwmul_vv_i32m2(a_i16m1, b_i16m1, 1);
166
+ return __riscv_vmv_x_s_i16m1_i16(__riscv_vnclip_wx_i16m1(product_i32m2, 0, __RISCV_VXRM_RDN, 1));
167
+ }
168
+ NK_PUBLIC nk_u32_t nk_u32_saturating_mul_rvv(nk_u32_t a, nk_u32_t b) {
169
+ vuint32m1_t a_u32m1 = __riscv_vmv_v_x_u32m1(a, 1);
170
+ vuint32m1_t b_u32m1 = __riscv_vmv_v_x_u32m1(b, 1);
171
+ vuint64m2_t product_u64m2 = __riscv_vwmulu_vv_u64m2(a_u32m1, b_u32m1, 1);
172
+ return __riscv_vmv_x_s_u32m1_u32(__riscv_vnclipu_wx_u32m1(product_u64m2, 0, __RISCV_VXRM_RDN, 1));
173
+ }
174
+ NK_PUBLIC nk_i32_t nk_i32_saturating_mul_rvv(nk_i32_t a, nk_i32_t b) {
175
+ vint32m1_t a_i32m1 = __riscv_vmv_v_x_i32m1(a, 1);
176
+ vint32m1_t b_i32m1 = __riscv_vmv_v_x_i32m1(b, 1);
177
+ vint64m2_t product_i64m2 = __riscv_vwmul_vv_i64m2(a_i32m1, b_i32m1, 1);
178
+ return __riscv_vmv_x_s_i32m1_i32(__riscv_vnclip_wx_i32m1(product_i64m2, 0, __RISCV_VXRM_RDN, 1));
179
+ }
180
+ NK_PUBLIC nk_u64_t nk_u64_saturating_mul_rvv(nk_u64_t a, nk_u64_t b) {
181
+ vuint64m1_t a_u64m1 = __riscv_vmv_v_x_u64m1(a, 1);
182
+ vuint64m1_t b_u64m1 = __riscv_vmv_v_x_u64m1(b, 1);
183
+ nk_u64_t high = __riscv_vmv_x_s_u64m1_u64(__riscv_vmulhu_vv_u64m1(a_u64m1, b_u64m1, 1));
184
+ return high ? 18446744073709551615ull : (a * b);
185
+ }
186
+ NK_PUBLIC nk_i64_t nk_i64_saturating_mul_rvv(nk_i64_t a, nk_i64_t b) {
187
+ int sign = (a < 0) ^ (b < 0);
188
+ nk_u64_t abs_a = a < 0 ? -(nk_u64_t)a : (nk_u64_t)a;
189
+ nk_u64_t abs_b = b < 0 ? -(nk_u64_t)b : (nk_u64_t)b;
190
+ vuint64m1_t a_u64m1 = __riscv_vmv_v_x_u64m1(abs_a, 1);
191
+ vuint64m1_t b_u64m1 = __riscv_vmv_v_x_u64m1(abs_b, 1);
192
+ nk_u64_t high = __riscv_vmv_x_s_u64m1_u64(__riscv_vmulhu_vv_u64m1(a_u64m1, b_u64m1, 1));
193
+ nk_u64_t low = abs_a * abs_b;
194
+ if (high || (sign && low > 9223372036854775808ull) || (!sign && low > 9223372036854775807ull))
195
+ return sign ? (-9223372036854775807ll - 1ll) : 9223372036854775807ll;
196
+ return sign ? -(nk_i64_t)low : (nk_i64_t)low;
197
+ }
198
+
199
+ #if defined(__cplusplus)
200
+ } // extern "C"
201
+ #endif
202
+
203
+ #if defined(__clang__)
204
+ #pragma clang attribute pop
205
+ #elif defined(__GNUC__)
206
+ #pragma GCC pop_options
207
+ #endif
208
+
209
+ #endif // NK_TARGET_RVV
210
+ #endif // NK_TARGET_RISCV_
211
+ #endif // NK_SCALAR_RVV_H
@@ -0,0 +1,63 @@
1
+ /**
2
+ * @brief SIMD-accelerated Scalar Math Helpers for Sapphire Rapids.
3
+ * @file include/numkong/scalar/sapphire.h
4
+ * @author Ash Vardanian
5
+ * @date March 1, 2026
6
+ *
7
+ * @sa include/numkong/scalar.h
8
+ *
9
+ * Provides native AVX-512 FP16 scalar ordering via `VCOMISH`.
10
+ */
11
+ #ifndef NK_SCALAR_SAPPHIRE_H
12
+ #define NK_SCALAR_SAPPHIRE_H
13
+
14
+ #if NK_TARGET_X86_
15
+ #if NK_TARGET_SAPPHIRE
16
+
17
+ #include "numkong/types.h"
18
+
19
+ #if defined(__cplusplus)
20
+ extern "C" {
21
+ #endif
22
+
23
+ #if defined(__clang__)
24
+ #pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,avx512bw,avx512fp16,f16c,fma,bmi,bmi2"))), \
25
+ apply_to = function)
26
+ #elif defined(__GNUC__)
27
+ #pragma GCC push_options
28
+ #pragma GCC target("avx2", "avx512f", "avx512vl", "avx512bw", "avx512fp16", "f16c", "fma", "bmi", "bmi2")
29
+ #endif
30
+
31
+ NK_PUBLIC int nk_f16_order_sapphire(nk_f16_t a, nk_f16_t b) {
32
+ __m128h a_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(a));
33
+ __m128h b_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(b));
34
+ return _mm_comigt_sh(a_f16x8, b_f16x8) - _mm_comilt_sh(a_f16x8, b_f16x8);
35
+ }
36
+ NK_PUBLIC nk_f16_t nk_f16_sqrt_sapphire(nk_f16_t x) {
37
+ __m128h x_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(x));
38
+ return (nk_f16_t)_mm_cvtsi128_si32(_mm_castph_si128(_mm_sqrt_sh(x_f16x8, x_f16x8)));
39
+ }
40
+ NK_PUBLIC nk_f16_t nk_f16_rsqrt_sapphire(nk_f16_t x) {
41
+ __m128h x_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(x));
42
+ return (nk_f16_t)_mm_cvtsi128_si32(_mm_castph_si128(_mm_rsqrt_sh(x_f16x8, x_f16x8)));
43
+ }
44
+ NK_PUBLIC nk_f16_t nk_f16_fma_sapphire(nk_f16_t a, nk_f16_t b, nk_f16_t c) {
45
+ __m128h a_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(a));
46
+ __m128h b_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(b));
47
+ __m128h c_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(c));
48
+ return (nk_f16_t)_mm_cvtsi128_si32(_mm_castph_si128(_mm_fmadd_sh(a_f16x8, b_f16x8, c_f16x8)));
49
+ }
50
+
51
+ #if defined(__clang__)
52
+ #pragma clang attribute pop
53
+ #elif defined(__GNUC__)
54
+ #pragma GCC pop_options
55
+ #endif
56
+
57
+ #if defined(__cplusplus)
58
+ } // extern "C"
59
+ #endif
60
+
61
+ #endif // NK_TARGET_SAPPHIRE
62
+ #endif // NK_TARGET_X86_
63
+ #endif // NK_SCALAR_SAPPHIRE_H