numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,683 @@
1
+ /**
2
+ * @brief SIMD-accelerated Scalar Math Helpers.
3
+ * @file include/numkong/scalar.h
4
+ * @author Ash Vardanian
5
+ * @date March 1, 2026
6
+ *
7
+ * Provides dispatchable scalar helpers: sqrt, rsqrt, fma, saturating arithmetic,
8
+ * and ordering. Each ISA file is header-only with
9
+ * `NK_PUBLIC static inline` implementations; compile-time dispatch selects the
10
+ * best available backend when `NK_DYNAMIC_DISPATCH` is off.
11
+ *
12
+ * For hardware architectures:
13
+ *
14
+ * - Serial: software-emulated (Quake 3 rsqrt, bit-manipulation casts)
15
+ * - Arm: NEON (sqrt, rsqrt, fma, saturating_add, 64-bit saturating_mul), NEONHALF (f16 sqrt, rsqrt, fma)
16
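+ * - x86: Haswell (f16/f32/f64 sqrt, rsqrt, fma; 8/16-bit saturating_add; 64-bit saturating_mul), Sapphire (f16 sqrt, rsqrt, fma, order)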
17
+ * - RISC-V: RVV (sqrt, rsqrt via vfrsqrt7 + Newton-Raphson, fma, saturating_add, saturating_mul)
18
+ * - WASM: V128Relaxed (sqrt, rsqrt, fma)
19
+ */
20
+ #ifndef NK_SCALAR_H
21
+ #define NK_SCALAR_H
22
+
23
+ #include "numkong/types.h"
24
+
25
+ #if defined(__cplusplus)
26
+ extern "C" {
27
+ #endif
28
+
29
+ /**
30
+ * @brief Scalar square root: `√x`.
31
+ *
32
+ * @param[in] x The input value.
33
+ * @return The square root of @p x.
34
+ */
35
+ NK_DYNAMIC nk_f32_t nk_f32_sqrt(nk_f32_t x);
36
+ /** @copydoc nk_f32_sqrt */
37
+ NK_DYNAMIC nk_f64_t nk_f64_sqrt(nk_f64_t x);
38
+
39
+ /**
40
+ * @brief Scalar reciprocal square root: `1/√x`.
41
+ * @sa `1 / std::sqrt(x)` in C++, @sa `x.sqrt().recip()` in Rust
42
+ *
43
+ * @param[in] x The input value.
44
+ * @return The reciprocal square root of @p x.
45
+ */
46
+ NK_DYNAMIC nk_f32_t nk_f32_rsqrt(nk_f32_t x);
47
+ /** @copydoc nk_f32_rsqrt */
48
+ NK_DYNAMIC nk_f64_t nk_f64_rsqrt(nk_f64_t x);
49
+
50
+ /**
51
+ * @brief Scalar fused multiply-add: `a × b + c`.
52
+ * @sa std::fma, @sa Rust f32::mul_add
53
+ *
54
+ * @param[in] a Multiplicand.
55
+ * @param[in] b Multiplier.
56
+ * @param[in] c Addend.
57
+ * @return `a * b + c` computed without intermediate rounding.
58
+ */
59
+ NK_DYNAMIC nk_f32_t nk_f32_fma(nk_f32_t a, nk_f32_t b, nk_f32_t c);
60
+ /** @copydoc nk_f32_fma */
61
+ NK_DYNAMIC nk_f64_t nk_f64_fma(nk_f64_t a, nk_f64_t b, nk_f64_t c);
62
+
63
+ /** @copydoc nk_f32_sqrt */
64
+ NK_DYNAMIC nk_f16_t nk_f16_sqrt(nk_f16_t x);
65
+ /** @copydoc nk_f32_rsqrt */
66
+ NK_DYNAMIC nk_f16_t nk_f16_rsqrt(nk_f16_t x);
67
+ /** @copydoc nk_f32_fma */
68
+ NK_DYNAMIC nk_f16_t nk_f16_fma(nk_f16_t a, nk_f16_t b, nk_f16_t c);
69
+
70
+ /**
71
+ * @brief Saturating addition clamped to the representable range of the type.
72
+ *
73
+ * @param[in] a First operand.
74
+ * @param[in] b Second operand.
75
+ * @return `clamp(a + b, MIN, MAX)`.
76
+ */
77
+ NK_DYNAMIC nk_u8_t nk_u8_saturating_add(nk_u8_t a, nk_u8_t b);
78
+ /** @copydoc nk_u8_saturating_add */
79
+ NK_DYNAMIC nk_i8_t nk_i8_saturating_add(nk_i8_t a, nk_i8_t b);
80
+ /** @copydoc nk_u8_saturating_add */
81
+ NK_DYNAMIC nk_u16_t nk_u16_saturating_add(nk_u16_t a, nk_u16_t b);
82
+ /** @copydoc nk_u8_saturating_add */
83
+ NK_DYNAMIC nk_i16_t nk_i16_saturating_add(nk_i16_t a, nk_i16_t b);
84
+ /** @copydoc nk_u8_saturating_add */
85
+ NK_DYNAMIC nk_u32_t nk_u32_saturating_add(nk_u32_t a, nk_u32_t b);
86
+ /** @copydoc nk_u8_saturating_add */
87
+ NK_DYNAMIC nk_i32_t nk_i32_saturating_add(nk_i32_t a, nk_i32_t b);
88
+ /** @copydoc nk_u8_saturating_add */
89
+ NK_DYNAMIC nk_u64_t nk_u64_saturating_add(nk_u64_t a, nk_u64_t b);
90
+ /** @copydoc nk_u8_saturating_add */
91
+ NK_DYNAMIC nk_i64_t nk_i64_saturating_add(nk_i64_t a, nk_i64_t b);
92
+ /** @copydoc nk_u8_saturating_add */
93
+ NK_DYNAMIC nk_i4x2_t nk_i4x2_saturating_add(nk_i4x2_t a, nk_i4x2_t b);
94
+ /** @copydoc nk_u8_saturating_add */
95
+ NK_DYNAMIC nk_u4x2_t nk_u4x2_saturating_add(nk_u4x2_t a, nk_u4x2_t b);
96
+
97
+ /**
98
+ * @brief Saturating multiplication clamped to the representable range of the type.
99
+ *
100
+ * @param[in] a First operand.
101
+ * @param[in] b Second operand.
102
+ * @return `clamp(a * b, MIN, MAX)`.
103
+ */
104
+ NK_DYNAMIC nk_u8_t nk_u8_saturating_mul(nk_u8_t a, nk_u8_t b);
105
+ /** @copydoc nk_u8_saturating_mul */
106
+ NK_DYNAMIC nk_i8_t nk_i8_saturating_mul(nk_i8_t a, nk_i8_t b);
107
+ /** @copydoc nk_u8_saturating_mul */
108
+ NK_DYNAMIC nk_u16_t nk_u16_saturating_mul(nk_u16_t a, nk_u16_t b);
109
+ /** @copydoc nk_u8_saturating_mul */
110
+ NK_DYNAMIC nk_i16_t nk_i16_saturating_mul(nk_i16_t a, nk_i16_t b);
111
+ /** @copydoc nk_u8_saturating_mul */
112
+ NK_DYNAMIC nk_u32_t nk_u32_saturating_mul(nk_u32_t a, nk_u32_t b);
113
+ /** @copydoc nk_u8_saturating_mul */
114
+ NK_DYNAMIC nk_i32_t nk_i32_saturating_mul(nk_i32_t a, nk_i32_t b);
115
+ /** @copydoc nk_u8_saturating_mul */
116
+ NK_DYNAMIC nk_u64_t nk_u64_saturating_mul(nk_u64_t a, nk_u64_t b);
117
+ /** @copydoc nk_u8_saturating_mul */
118
+ NK_DYNAMIC nk_i64_t nk_i64_saturating_mul(nk_i64_t a, nk_i64_t b);
119
+ /** @copydoc nk_u8_saturating_mul */
120
+ NK_DYNAMIC nk_i4x2_t nk_i4x2_saturating_mul(nk_i4x2_t a, nk_i4x2_t b);
121
+ /** @copydoc nk_u8_saturating_mul */
122
+ NK_DYNAMIC nk_u4x2_t nk_u4x2_saturating_mul(nk_u4x2_t a, nk_u4x2_t b);
123
+
124
+ /**
125
+ * @brief Branchless sign-magnitude ordering for non-native floating-point scalars.
126
+ * @sa std::strong_order, Rust total_cmp
127
+ *
128
+ * Uses `mask = -sign; ordered = value ^ mask` — the constant offset cancels in subtraction.
129
+ * Returns negative if a < b, 0 if equal, positive if a > b.
130
+ *
131
+ * @param[in] a First operand.
132
+ * @param[in] b Second operand.
133
+ * @return Negative if `a < b`, zero if `a == b`, positive if `a > b`.
134
+ *
135
+ * @note NaN values are ordered at the extremes per IEEE 754 totalOrder
136
+ * (negative NaN < all non-NaN values, infinities included < positive NaN). Callers requiring NaN-exclusion
137
+ * semantics must filter NaN before calling.
138
+ */
139
+ NK_DYNAMIC int nk_f16_order(nk_f16_t a, nk_f16_t b);
140
+ /** @copydoc nk_f16_order */
141
+ NK_DYNAMIC int nk_bf16_order(nk_bf16_t a, nk_bf16_t b);
142
+ /** @copydoc nk_f16_order */
143
+ NK_DYNAMIC int nk_e4m3_order(nk_e4m3_t a, nk_e4m3_t b);
144
+ /** @copydoc nk_f16_order */
145
+ NK_DYNAMIC int nk_e5m2_order(nk_e5m2_t a, nk_e5m2_t b);
146
+ /** @copydoc nk_f16_order */
147
+ NK_DYNAMIC int nk_e2m3_order(nk_e2m3_t a, nk_e2m3_t b);
148
+ /** @copydoc nk_f16_order */
149
+ NK_DYNAMIC int nk_e3m2_order(nk_e3m2_t a, nk_e3m2_t b);
150
+
151
+ /** @copydoc nk_f32_sqrt */
152
+ NK_PUBLIC nk_f32_t nk_f32_sqrt_serial(nk_f32_t x);
153
+ /** @copydoc nk_f64_sqrt */
154
+ NK_PUBLIC nk_f64_t nk_f64_sqrt_serial(nk_f64_t x);
155
+ /** @copydoc nk_f32_rsqrt */
156
+ NK_PUBLIC nk_f32_t nk_f32_rsqrt_serial(nk_f32_t x);
157
+ /** @copydoc nk_f64_rsqrt */
158
+ NK_PUBLIC nk_f64_t nk_f64_rsqrt_serial(nk_f64_t x);
159
+ /** @copydoc nk_f32_fma */
160
+ NK_PUBLIC nk_f32_t nk_f32_fma_serial(nk_f32_t a, nk_f32_t b, nk_f32_t c);
161
+ /** @copydoc nk_f64_fma */
162
+ NK_PUBLIC nk_f64_t nk_f64_fma_serial(nk_f64_t a, nk_f64_t b, nk_f64_t c);
163
+
164
+ /** @copydoc nk_f16_sqrt */
165
+ NK_PUBLIC nk_f16_t nk_f16_sqrt_serial(nk_f16_t x);
166
+ /** @copydoc nk_f16_rsqrt */
167
+ NK_PUBLIC nk_f16_t nk_f16_rsqrt_serial(nk_f16_t x);
168
+ /** @copydoc nk_f16_fma */
169
+ NK_PUBLIC nk_f16_t nk_f16_fma_serial(nk_f16_t a, nk_f16_t b, nk_f16_t c);
170
+
171
+ /** @copydoc nk_u8_saturating_add */
172
+ NK_PUBLIC nk_u8_t nk_u8_saturating_add_serial(nk_u8_t a, nk_u8_t b);
173
+ /** @copydoc nk_u8_saturating_add */
174
+ NK_PUBLIC nk_i8_t nk_i8_saturating_add_serial(nk_i8_t a, nk_i8_t b);
175
+ /** @copydoc nk_u8_saturating_add */
176
+ NK_PUBLIC nk_u16_t nk_u16_saturating_add_serial(nk_u16_t a, nk_u16_t b);
177
+ /** @copydoc nk_u8_saturating_add */
178
+ NK_PUBLIC nk_i16_t nk_i16_saturating_add_serial(nk_i16_t a, nk_i16_t b);
179
+ /** @copydoc nk_u8_saturating_add */
180
+ NK_PUBLIC nk_u32_t nk_u32_saturating_add_serial(nk_u32_t a, nk_u32_t b);
181
+ /** @copydoc nk_u8_saturating_add */
182
+ NK_PUBLIC nk_i32_t nk_i32_saturating_add_serial(nk_i32_t a, nk_i32_t b);
183
+ /** @copydoc nk_u8_saturating_add */
184
+ NK_PUBLIC nk_u64_t nk_u64_saturating_add_serial(nk_u64_t a, nk_u64_t b);
185
+ /** @copydoc nk_u8_saturating_add */
186
+ NK_PUBLIC nk_i64_t nk_i64_saturating_add_serial(nk_i64_t a, nk_i64_t b);
187
+ /** @copydoc nk_u8_saturating_add */
188
+ NK_PUBLIC nk_i4x2_t nk_i4x2_saturating_add_serial(nk_i4x2_t a, nk_i4x2_t b);
189
+ /** @copydoc nk_u8_saturating_add */
190
+ NK_PUBLIC nk_u4x2_t nk_u4x2_saturating_add_serial(nk_u4x2_t a, nk_u4x2_t b);
191
+
192
+ /** @copydoc nk_u8_saturating_mul */
193
+ NK_PUBLIC nk_u8_t nk_u8_saturating_mul_serial(nk_u8_t a, nk_u8_t b);
194
+ /** @copydoc nk_u8_saturating_mul */
195
+ NK_PUBLIC nk_i8_t nk_i8_saturating_mul_serial(nk_i8_t a, nk_i8_t b);
196
+ /** @copydoc nk_u8_saturating_mul */
197
+ NK_PUBLIC nk_u16_t nk_u16_saturating_mul_serial(nk_u16_t a, nk_u16_t b);
198
+ /** @copydoc nk_u8_saturating_mul */
199
+ NK_PUBLIC nk_i16_t nk_i16_saturating_mul_serial(nk_i16_t a, nk_i16_t b);
200
+ /** @copydoc nk_u8_saturating_mul */
201
+ NK_PUBLIC nk_u32_t nk_u32_saturating_mul_serial(nk_u32_t a, nk_u32_t b);
202
+ /** @copydoc nk_u8_saturating_mul */
203
+ NK_PUBLIC nk_i32_t nk_i32_saturating_mul_serial(nk_i32_t a, nk_i32_t b);
204
+ /** @copydoc nk_u8_saturating_mul */
205
+ NK_PUBLIC nk_u64_t nk_u64_saturating_mul_serial(nk_u64_t a, nk_u64_t b);
206
+ /** @copydoc nk_u8_saturating_mul */
207
+ NK_PUBLIC nk_i64_t nk_i64_saturating_mul_serial(nk_i64_t a, nk_i64_t b);
208
+ /** @copydoc nk_u8_saturating_mul */
209
+ NK_PUBLIC nk_i4x2_t nk_i4x2_saturating_mul_serial(nk_i4x2_t a, nk_i4x2_t b);
210
+ /** @copydoc nk_u8_saturating_mul */
211
+ NK_PUBLIC nk_u4x2_t nk_u4x2_saturating_mul_serial(nk_u4x2_t a, nk_u4x2_t b);
212
+
213
+ /** @copydoc nk_f16_order */
214
+ NK_PUBLIC int nk_f16_order_serial(nk_f16_t a, nk_f16_t b);
215
+ /** @copydoc nk_f16_order */
216
+ NK_PUBLIC int nk_bf16_order_serial(nk_bf16_t a, nk_bf16_t b);
217
+ /** @copydoc nk_f16_order */
218
+ NK_PUBLIC int nk_e4m3_order_serial(nk_e4m3_t a, nk_e4m3_t b);
219
+ /** @copydoc nk_f16_order */
220
+ NK_PUBLIC int nk_e5m2_order_serial(nk_e5m2_t a, nk_e5m2_t b);
221
+ /** @copydoc nk_f16_order */
222
+ NK_PUBLIC int nk_e2m3_order_serial(nk_e2m3_t a, nk_e2m3_t b);
223
+ /** @copydoc nk_f16_order */
224
+ NK_PUBLIC int nk_e3m2_order_serial(nk_e3m2_t a, nk_e3m2_t b);
225
+
226
+ #if NK_TARGET_NEON
227
+ /** @copydoc nk_f32_sqrt */
228
+ NK_PUBLIC nk_f32_t nk_f32_sqrt_neon(nk_f32_t x);
229
+ /** @copydoc nk_f64_sqrt */
230
+ NK_PUBLIC nk_f64_t nk_f64_sqrt_neon(nk_f64_t x);
231
+ /** @copydoc nk_f32_rsqrt */
232
+ NK_PUBLIC nk_f32_t nk_f32_rsqrt_neon(nk_f32_t x);
233
+ /** @copydoc nk_f64_rsqrt */
234
+ NK_PUBLIC nk_f64_t nk_f64_rsqrt_neon(nk_f64_t x);
235
+ /** @copydoc nk_f32_fma */
236
+ NK_PUBLIC nk_f32_t nk_f32_fma_neon(nk_f32_t a, nk_f32_t b, nk_f32_t c);
237
+ /** @copydoc nk_f64_fma */
238
+ NK_PUBLIC nk_f64_t nk_f64_fma_neon(nk_f64_t a, nk_f64_t b, nk_f64_t c);
239
+ /** @copydoc nk_u8_saturating_add */
240
+ NK_PUBLIC nk_u8_t nk_u8_saturating_add_neon(nk_u8_t a, nk_u8_t b);
241
+ /** @copydoc nk_u8_saturating_add */
242
+ NK_PUBLIC nk_i8_t nk_i8_saturating_add_neon(nk_i8_t a, nk_i8_t b);
243
+ /** @copydoc nk_u8_saturating_add */
244
+ NK_PUBLIC nk_u16_t nk_u16_saturating_add_neon(nk_u16_t a, nk_u16_t b);
245
+ /** @copydoc nk_u8_saturating_add */
246
+ NK_PUBLIC nk_i16_t nk_i16_saturating_add_neon(nk_i16_t a, nk_i16_t b);
247
+ /** @copydoc nk_u8_saturating_add */
248
+ NK_PUBLIC nk_u32_t nk_u32_saturating_add_neon(nk_u32_t a, nk_u32_t b);
249
+ /** @copydoc nk_u8_saturating_add */
250
+ NK_PUBLIC nk_i32_t nk_i32_saturating_add_neon(nk_i32_t a, nk_i32_t b);
251
+ /** @copydoc nk_u8_saturating_add */
252
+ NK_PUBLIC nk_u64_t nk_u64_saturating_add_neon(nk_u64_t a, nk_u64_t b);
253
+ /** @copydoc nk_u8_saturating_add */
254
+ NK_PUBLIC nk_i64_t nk_i64_saturating_add_neon(nk_i64_t a, nk_i64_t b);
255
+ /** @copydoc nk_u8_saturating_mul */
256
+ NK_PUBLIC nk_u64_t nk_u64_saturating_mul_neon(nk_u64_t a, nk_u64_t b);
257
+ /** @copydoc nk_u8_saturating_mul */
258
+ NK_PUBLIC nk_i64_t nk_i64_saturating_mul_neon(nk_i64_t a, nk_i64_t b);
259
+ #endif // NK_TARGET_NEON
260
+
261
+ #if NK_TARGET_NEONHALF
262
+ /** @copydoc nk_f16_sqrt */
263
+ NK_PUBLIC nk_f16_t nk_f16_sqrt_neonhalf(nk_f16_t x);
264
+ /** @copydoc nk_f16_rsqrt */
265
+ NK_PUBLIC nk_f16_t nk_f16_rsqrt_neonhalf(nk_f16_t x);
266
+ /** @copydoc nk_f16_fma */
267
+ NK_PUBLIC nk_f16_t nk_f16_fma_neonhalf(nk_f16_t a, nk_f16_t b, nk_f16_t c);
268
+ #endif // NK_TARGET_NEONHALF
269
+
270
+ #if NK_TARGET_HASWELL
271
+ /** @copydoc nk_f32_sqrt */
272
+ NK_PUBLIC nk_f32_t nk_f32_sqrt_haswell(nk_f32_t x);
273
+ /** @copydoc nk_f64_sqrt */
274
+ NK_PUBLIC nk_f64_t nk_f64_sqrt_haswell(nk_f64_t x);
275
+ /** @copydoc nk_f32_rsqrt */
276
+ NK_PUBLIC nk_f32_t nk_f32_rsqrt_haswell(nk_f32_t x);
277
+ /** @copydoc nk_f64_rsqrt */
278
+ NK_PUBLIC nk_f64_t nk_f64_rsqrt_haswell(nk_f64_t x);
279
+ /** @copydoc nk_f32_fma */
280
+ NK_PUBLIC nk_f32_t nk_f32_fma_haswell(nk_f32_t a, nk_f32_t b, nk_f32_t c);
281
+ /** @copydoc nk_f64_fma */
282
+ NK_PUBLIC nk_f64_t nk_f64_fma_haswell(nk_f64_t a, nk_f64_t b, nk_f64_t c);
283
+ /** @copydoc nk_u8_saturating_add */
284
+ NK_PUBLIC nk_u8_t nk_u8_saturating_add_haswell(nk_u8_t a, nk_u8_t b);
285
+ /** @copydoc nk_u8_saturating_add */
286
+ NK_PUBLIC nk_i8_t nk_i8_saturating_add_haswell(nk_i8_t a, nk_i8_t b);
287
+ /** @copydoc nk_u8_saturating_add */
288
+ NK_PUBLIC nk_u16_t nk_u16_saturating_add_haswell(nk_u16_t a, nk_u16_t b);
289
+ /** @copydoc nk_u8_saturating_add */
290
+ NK_PUBLIC nk_i16_t nk_i16_saturating_add_haswell(nk_i16_t a, nk_i16_t b);
291
+ /** @copydoc nk_u8_saturating_mul */
292
+ NK_PUBLIC nk_u64_t nk_u64_saturating_mul_haswell(nk_u64_t a, nk_u64_t b);
293
+ /** @copydoc nk_u8_saturating_mul */
294
+ NK_PUBLIC nk_i64_t nk_i64_saturating_mul_haswell(nk_i64_t a, nk_i64_t b);
295
+ /** @copydoc nk_f16_sqrt */
296
+ NK_PUBLIC nk_f16_t nk_f16_sqrt_haswell(nk_f16_t x);
297
+ /** @copydoc nk_f16_rsqrt */
298
+ NK_PUBLIC nk_f16_t nk_f16_rsqrt_haswell(nk_f16_t x);
299
+ /** @copydoc nk_f16_fma */
300
+ NK_PUBLIC nk_f16_t nk_f16_fma_haswell(nk_f16_t a, nk_f16_t b, nk_f16_t c);
301
+ #endif // NK_TARGET_HASWELL
302
+
303
+ #if NK_TARGET_SAPPHIRE
304
+ /** @copydoc nk_f16_order */
305
+ NK_PUBLIC int nk_f16_order_sapphire(nk_f16_t a, nk_f16_t b);
306
+ /** @copydoc nk_f16_sqrt */
307
+ NK_PUBLIC nk_f16_t nk_f16_sqrt_sapphire(nk_f16_t x);
308
+ /** @copydoc nk_f16_rsqrt */
309
+ NK_PUBLIC nk_f16_t nk_f16_rsqrt_sapphire(nk_f16_t x);
310
+ /** @copydoc nk_f16_fma */
311
+ NK_PUBLIC nk_f16_t nk_f16_fma_sapphire(nk_f16_t a, nk_f16_t b, nk_f16_t c);
312
+ #endif // NK_TARGET_SAPPHIRE
313
+
314
#if NK_TARGET_RVV
/*  Prototypes of the `_rvv` scalar kernels; visible only when NK_TARGET_RVV is non-zero.
 *  Grouped by element type for the floating-point helpers and by operation for the integers.
 *  NOTE(review): every saturating add/mul variant copies its documentation from the `u8`
 *  entry point, unlike the per-type tags used for sqrt/rsqrt/fma - confirm this is intended.
 */

/* --- f32 --- */
/** @copydoc nk_f32_sqrt */
NK_PUBLIC nk_f32_t nk_f32_sqrt_rvv(nk_f32_t x);
/** @copydoc nk_f32_rsqrt */
NK_PUBLIC nk_f32_t nk_f32_rsqrt_rvv(nk_f32_t x);
/** @copydoc nk_f32_fma */
NK_PUBLIC nk_f32_t nk_f32_fma_rvv(nk_f32_t a, nk_f32_t b, nk_f32_t c);

/* --- f64 --- */
/** @copydoc nk_f64_sqrt */
NK_PUBLIC nk_f64_t nk_f64_sqrt_rvv(nk_f64_t x);
/** @copydoc nk_f64_rsqrt */
NK_PUBLIC nk_f64_t nk_f64_rsqrt_rvv(nk_f64_t x);
/** @copydoc nk_f64_fma */
NK_PUBLIC nk_f64_t nk_f64_fma_rvv(nk_f64_t a, nk_f64_t b, nk_f64_t c);

/* --- saturating addition, 8 to 64 bits --- */
/** @copydoc nk_u8_saturating_add */
NK_PUBLIC nk_u8_t nk_u8_saturating_add_rvv(nk_u8_t a, nk_u8_t b);
/** @copydoc nk_u8_saturating_add */
NK_PUBLIC nk_i8_t nk_i8_saturating_add_rvv(nk_i8_t a, nk_i8_t b);
/** @copydoc nk_u8_saturating_add */
NK_PUBLIC nk_u16_t nk_u16_saturating_add_rvv(nk_u16_t a, nk_u16_t b);
/** @copydoc nk_u8_saturating_add */
NK_PUBLIC nk_i16_t nk_i16_saturating_add_rvv(nk_i16_t a, nk_i16_t b);
/** @copydoc nk_u8_saturating_add */
NK_PUBLIC nk_u32_t nk_u32_saturating_add_rvv(nk_u32_t a, nk_u32_t b);
/** @copydoc nk_u8_saturating_add */
NK_PUBLIC nk_i32_t nk_i32_saturating_add_rvv(nk_i32_t a, nk_i32_t b);
/** @copydoc nk_u8_saturating_add */
NK_PUBLIC nk_u64_t nk_u64_saturating_add_rvv(nk_u64_t a, nk_u64_t b);
/** @copydoc nk_u8_saturating_add */
NK_PUBLIC nk_i64_t nk_i64_saturating_add_rvv(nk_i64_t a, nk_i64_t b);

/* --- saturating multiplication, 8 to 64 bits --- */
/** @copydoc nk_u8_saturating_mul */
NK_PUBLIC nk_u8_t nk_u8_saturating_mul_rvv(nk_u8_t a, nk_u8_t b);
/** @copydoc nk_u8_saturating_mul */
NK_PUBLIC nk_i8_t nk_i8_saturating_mul_rvv(nk_i8_t a, nk_i8_t b);
/** @copydoc nk_u8_saturating_mul */
NK_PUBLIC nk_u16_t nk_u16_saturating_mul_rvv(nk_u16_t a, nk_u16_t b);
/** @copydoc nk_u8_saturating_mul */
NK_PUBLIC nk_i16_t nk_i16_saturating_mul_rvv(nk_i16_t a, nk_i16_t b);
/** @copydoc nk_u8_saturating_mul */
NK_PUBLIC nk_u32_t nk_u32_saturating_mul_rvv(nk_u32_t a, nk_u32_t b);
/** @copydoc nk_u8_saturating_mul */
NK_PUBLIC nk_i32_t nk_i32_saturating_mul_rvv(nk_i32_t a, nk_i32_t b);
/** @copydoc nk_u8_saturating_mul */
NK_PUBLIC nk_u64_t nk_u64_saturating_mul_rvv(nk_u64_t a, nk_u64_t b);
/** @copydoc nk_u8_saturating_mul */
NK_PUBLIC nk_i64_t nk_i64_saturating_mul_rvv(nk_i64_t a, nk_i64_t b);
#endif // NK_TARGET_RVV
360
+
361
#if NK_TARGET_V128RELAXED
/*  Prototypes of the `_v128relaxed` scalar kernels; visible only when
 *  NK_TARGET_V128RELAXED is non-zero. Only f32/f64 sqrt, rsqrt and fma are declared here.
 */

/* --- f32 --- */
/** @copydoc nk_f32_sqrt */
NK_PUBLIC nk_f32_t nk_f32_sqrt_v128relaxed(nk_f32_t x);
/** @copydoc nk_f32_rsqrt */
NK_PUBLIC nk_f32_t nk_f32_rsqrt_v128relaxed(nk_f32_t x);
/** @copydoc nk_f32_fma */
NK_PUBLIC nk_f32_t nk_f32_fma_v128relaxed(nk_f32_t a, nk_f32_t b, nk_f32_t c);

/* --- f64 --- */
/** @copydoc nk_f64_sqrt */
NK_PUBLIC nk_f64_t nk_f64_sqrt_v128relaxed(nk_f64_t x);
/** @copydoc nk_f64_rsqrt */
NK_PUBLIC nk_f64_t nk_f64_rsqrt_v128relaxed(nk_f64_t x);
/** @copydoc nk_f64_fma */
NK_PUBLIC nk_f64_t nk_f64_fma_v128relaxed(nk_f64_t a, nk_f64_t b, nk_f64_t c);
#endif // NK_TARGET_V128RELAXED
375
+
376
+ #if defined(__cplusplus)
377
+ } // extern "C"
378
+ #endif
379
+
380
+ #include "numkong/scalar/serial.h" // `nk_f32_rsqrt_serial`
381
+ #include "numkong/scalar/neon.h" // `nk_f32_sqrt_neon`
382
+ #include "numkong/scalar/neonhalf.h" // `nk_f16_sqrt_neonhalf`
383
+ #include "numkong/scalar/haswell.h" // `nk_f32_sqrt_haswell`
384
+ #include "numkong/scalar/sapphire.h" // `nk_f16_order_sapphire`
385
+ #include "numkong/scalar/rvv.h" // `nk_f32_rsqrt_rvv`
386
+ #include "numkong/scalar/v128relaxed.h" // `nk_f32_sqrt_v128relaxed`
387
+
388
+ #if defined(__cplusplus)
389
+ extern "C" {
390
+ #endif
391
+
392
+ #if !NK_DYNAMIC_DISPATCH
393
+
394
+ NK_PUBLIC nk_f32_t nk_f32_sqrt(nk_f32_t x) {
395
+ #if NK_TARGET_HASWELL
396
+ return nk_f32_sqrt_haswell(x);
397
+ #elif NK_TARGET_NEON
398
+ return nk_f32_sqrt_neon(x);
399
+ #elif NK_TARGET_RVV
400
+ return nk_f32_sqrt_rvv(x);
401
+ #elif NK_TARGET_V128RELAXED
402
+ return nk_f32_sqrt_v128relaxed(x);
403
+ #else
404
+ return nk_f32_sqrt_serial(x);
405
+ #endif
406
+ }
407
+
408
+ NK_PUBLIC nk_f64_t nk_f64_sqrt(nk_f64_t x) {
409
+ #if NK_TARGET_HASWELL
410
+ return nk_f64_sqrt_haswell(x);
411
+ #elif NK_TARGET_NEON
412
+ return nk_f64_sqrt_neon(x);
413
+ #elif NK_TARGET_RVV
414
+ return nk_f64_sqrt_rvv(x);
415
+ #elif NK_TARGET_V128RELAXED
416
+ return nk_f64_sqrt_v128relaxed(x);
417
+ #else
418
+ return nk_f64_sqrt_serial(x);
419
+ #endif
420
+ }
421
+
422
+ NK_PUBLIC nk_f32_t nk_f32_rsqrt(nk_f32_t x) {
423
+ #if NK_TARGET_HASWELL
424
+ return nk_f32_rsqrt_haswell(x);
425
+ #elif NK_TARGET_NEON
426
+ return nk_f32_rsqrt_neon(x);
427
+ #elif NK_TARGET_RVV
428
+ return nk_f32_rsqrt_rvv(x);
429
+ #elif NK_TARGET_V128RELAXED
430
+ return nk_f32_rsqrt_v128relaxed(x);
431
+ #else
432
+ return nk_f32_rsqrt_serial(x);
433
+ #endif
434
+ }
435
+
436
+ NK_PUBLIC nk_f64_t nk_f64_rsqrt(nk_f64_t x) {
437
+ #if NK_TARGET_HASWELL
438
+ return nk_f64_rsqrt_haswell(x);
439
+ #elif NK_TARGET_NEON
440
+ return nk_f64_rsqrt_neon(x);
441
+ #elif NK_TARGET_RVV
442
+ return nk_f64_rsqrt_rvv(x);
443
+ #elif NK_TARGET_V128RELAXED
444
+ return nk_f64_rsqrt_v128relaxed(x);
445
+ #else
446
+ return nk_f64_rsqrt_serial(x);
447
+ #endif
448
+ }
449
+
450
+ NK_PUBLIC nk_f32_t nk_f32_fma(nk_f32_t a, nk_f32_t b, nk_f32_t c) {
451
+ #if NK_TARGET_HASWELL
452
+ return nk_f32_fma_haswell(a, b, c);
453
+ #elif NK_TARGET_NEON
454
+ return nk_f32_fma_neon(a, b, c);
455
+ #elif NK_TARGET_RVV
456
+ return nk_f32_fma_rvv(a, b, c);
457
+ #elif NK_TARGET_V128RELAXED
458
+ return nk_f32_fma_v128relaxed(a, b, c);
459
+ #else
460
+ return nk_f32_fma_serial(a, b, c);
461
+ #endif
462
+ }
463
+
464
+ NK_PUBLIC nk_f64_t nk_f64_fma(nk_f64_t a, nk_f64_t b, nk_f64_t c) {
465
+ #if NK_TARGET_HASWELL
466
+ return nk_f64_fma_haswell(a, b, c);
467
+ #elif NK_TARGET_NEON
468
+ return nk_f64_fma_neon(a, b, c);
469
+ #elif NK_TARGET_RVV
470
+ return nk_f64_fma_rvv(a, b, c);
471
+ #elif NK_TARGET_V128RELAXED
472
+ return nk_f64_fma_v128relaxed(a, b, c);
473
+ #else
474
+ return nk_f64_fma_serial(a, b, c);
475
+ #endif
476
+ }
477
+
478
+ NK_PUBLIC nk_f16_t nk_f16_sqrt(nk_f16_t x) {
479
+ #if NK_TARGET_SAPPHIRE
480
+ return nk_f16_sqrt_sapphire(x);
481
+ #elif NK_TARGET_NEONHALF
482
+ return nk_f16_sqrt_neonhalf(x);
483
+ #elif NK_TARGET_HASWELL
484
+ return nk_f16_sqrt_haswell(x);
485
+ #else
486
+ return nk_f16_sqrt_serial(x);
487
+ #endif
488
+ }
489
+
490
+ NK_PUBLIC nk_f16_t nk_f16_rsqrt(nk_f16_t x) {
491
+ #if NK_TARGET_SAPPHIRE
492
+ return nk_f16_rsqrt_sapphire(x);
493
+ #elif NK_TARGET_NEONHALF
494
+ return nk_f16_rsqrt_neonhalf(x);
495
+ #elif NK_TARGET_HASWELL
496
+ return nk_f16_rsqrt_haswell(x);
497
+ #else
498
+ return nk_f16_rsqrt_serial(x);
499
+ #endif
500
+ }
501
+
502
+ NK_PUBLIC nk_f16_t nk_f16_fma(nk_f16_t a, nk_f16_t b, nk_f16_t c) {
503
+ #if NK_TARGET_SAPPHIRE
504
+ return nk_f16_fma_sapphire(a, b, c);
505
+ #elif NK_TARGET_NEONHALF
506
+ return nk_f16_fma_neonhalf(a, b, c);
507
+ #elif NK_TARGET_HASWELL
508
+ return nk_f16_fma_haswell(a, b, c);
509
+ #else
510
+ return nk_f16_fma_serial(a, b, c);
511
+ #endif
512
+ }
513
+
514
+ NK_PUBLIC nk_u8_t nk_u8_saturating_add(nk_u8_t a, nk_u8_t b) {
515
+ #if NK_TARGET_HASWELL
516
+ return nk_u8_saturating_add_haswell(a, b);
517
+ #elif NK_TARGET_NEON
518
+ return nk_u8_saturating_add_neon(a, b);
519
+ #elif NK_TARGET_RVV
520
+ return nk_u8_saturating_add_rvv(a, b);
521
+ #else
522
+ return nk_u8_saturating_add_serial(a, b);
523
+ #endif
524
+ }
525
+ NK_PUBLIC nk_i8_t nk_i8_saturating_add(nk_i8_t a, nk_i8_t b) {
526
+ #if NK_TARGET_HASWELL
527
+ return nk_i8_saturating_add_haswell(a, b);
528
+ #elif NK_TARGET_NEON
529
+ return nk_i8_saturating_add_neon(a, b);
530
+ #elif NK_TARGET_RVV
531
+ return nk_i8_saturating_add_rvv(a, b);
532
+ #else
533
+ return nk_i8_saturating_add_serial(a, b);
534
+ #endif
535
+ }
536
+ NK_PUBLIC nk_u16_t nk_u16_saturating_add(nk_u16_t a, nk_u16_t b) {
537
+ #if NK_TARGET_HASWELL
538
+ return nk_u16_saturating_add_haswell(a, b);
539
+ #elif NK_TARGET_NEON
540
+ return nk_u16_saturating_add_neon(a, b);
541
+ #elif NK_TARGET_RVV
542
+ return nk_u16_saturating_add_rvv(a, b);
543
+ #else
544
+ return nk_u16_saturating_add_serial(a, b);
545
+ #endif
546
+ }
547
+ NK_PUBLIC nk_i16_t nk_i16_saturating_add(nk_i16_t a, nk_i16_t b) {
548
+ #if NK_TARGET_HASWELL
549
+ return nk_i16_saturating_add_haswell(a, b);
550
+ #elif NK_TARGET_NEON
551
+ return nk_i16_saturating_add_neon(a, b);
552
+ #elif NK_TARGET_RVV
553
+ return nk_i16_saturating_add_rvv(a, b);
554
+ #else
555
+ return nk_i16_saturating_add_serial(a, b);
556
+ #endif
557
+ }
558
+ NK_PUBLIC nk_u32_t nk_u32_saturating_add(nk_u32_t a, nk_u32_t b) {
559
+ #if NK_TARGET_NEON
560
+ return nk_u32_saturating_add_neon(a, b);
561
+ #elif NK_TARGET_RVV
562
+ return nk_u32_saturating_add_rvv(a, b);
563
+ #else
564
+ return nk_u32_saturating_add_serial(a, b);
565
+ #endif
566
+ }
567
+ NK_PUBLIC nk_i32_t nk_i32_saturating_add(nk_i32_t a, nk_i32_t b) {
568
+ #if NK_TARGET_NEON
569
+ return nk_i32_saturating_add_neon(a, b);
570
+ #elif NK_TARGET_RVV
571
+ return nk_i32_saturating_add_rvv(a, b);
572
+ #else
573
+ return nk_i32_saturating_add_serial(a, b);
574
+ #endif
575
+ }
576
+ NK_PUBLIC nk_u64_t nk_u64_saturating_add(nk_u64_t a, nk_u64_t b) {
577
+ #if NK_TARGET_NEON
578
+ return nk_u64_saturating_add_neon(a, b);
579
+ #elif NK_TARGET_RVV
580
+ return nk_u64_saturating_add_rvv(a, b);
581
+ #else
582
+ return nk_u64_saturating_add_serial(a, b);
583
+ #endif
584
+ }
585
+ NK_PUBLIC nk_i64_t nk_i64_saturating_add(nk_i64_t a, nk_i64_t b) {
586
+ #if NK_TARGET_NEON
587
+ return nk_i64_saturating_add_neon(a, b);
588
+ #elif NK_TARGET_RVV
589
+ return nk_i64_saturating_add_rvv(a, b);
590
+ #else
591
+ return nk_i64_saturating_add_serial(a, b);
592
+ #endif
593
+ }
594
+ NK_PUBLIC nk_i4x2_t nk_i4x2_saturating_add(nk_i4x2_t a, nk_i4x2_t b) { return nk_i4x2_saturating_add_serial(a, b); }
595
+ NK_PUBLIC nk_u4x2_t nk_u4x2_saturating_add(nk_u4x2_t a, nk_u4x2_t b) { return nk_u4x2_saturating_add_serial(a, b); }
596
+
597
+ NK_PUBLIC nk_u8_t nk_u8_saturating_mul(nk_u8_t a, nk_u8_t b) {
598
+ #if NK_TARGET_RVV
599
+ return nk_u8_saturating_mul_rvv(a, b);
600
+ #else
601
+ return nk_u8_saturating_mul_serial(a, b);
602
+ #endif
603
+ }
604
+ NK_PUBLIC nk_i8_t nk_i8_saturating_mul(nk_i8_t a, nk_i8_t b) {
605
+ #if NK_TARGET_RVV
606
+ return nk_i8_saturating_mul_rvv(a, b);
607
+ #else
608
+ return nk_i8_saturating_mul_serial(a, b);
609
+ #endif
610
+ }
611
+ NK_PUBLIC nk_u16_t nk_u16_saturating_mul(nk_u16_t a, nk_u16_t b) {
612
+ #if NK_TARGET_RVV
613
+ return nk_u16_saturating_mul_rvv(a, b);
614
+ #else
615
+ return nk_u16_saturating_mul_serial(a, b);
616
+ #endif
617
+ }
618
+ NK_PUBLIC nk_i16_t nk_i16_saturating_mul(nk_i16_t a, nk_i16_t b) {
619
+ #if NK_TARGET_RVV
620
+ return nk_i16_saturating_mul_rvv(a, b);
621
+ #else
622
+ return nk_i16_saturating_mul_serial(a, b);
623
+ #endif
624
+ }
625
+ NK_PUBLIC nk_u32_t nk_u32_saturating_mul(nk_u32_t a, nk_u32_t b) {
626
+ #if NK_TARGET_RVV
627
+ return nk_u32_saturating_mul_rvv(a, b);
628
+ #else
629
+ return nk_u32_saturating_mul_serial(a, b);
630
+ #endif
631
+ }
632
+ NK_PUBLIC nk_i32_t nk_i32_saturating_mul(nk_i32_t a, nk_i32_t b) {
633
+ #if NK_TARGET_RVV
634
+ return nk_i32_saturating_mul_rvv(a, b);
635
+ #else
636
+ return nk_i32_saturating_mul_serial(a, b);
637
+ #endif
638
+ }
639
+ NK_PUBLIC nk_u64_t nk_u64_saturating_mul(nk_u64_t a, nk_u64_t b) {
640
+ #if NK_TARGET_HASWELL
641
+ return nk_u64_saturating_mul_haswell(a, b);
642
+ #elif NK_TARGET_NEON
643
+ return nk_u64_saturating_mul_neon(a, b);
644
+ #elif NK_TARGET_RVV
645
+ return nk_u64_saturating_mul_rvv(a, b);
646
+ #else
647
+ return nk_u64_saturating_mul_serial(a, b);
648
+ #endif
649
+ }
650
+ NK_PUBLIC nk_i64_t nk_i64_saturating_mul(nk_i64_t a, nk_i64_t b) {
651
+ #if NK_TARGET_HASWELL
652
+ return nk_i64_saturating_mul_haswell(a, b);
653
+ #elif NK_TARGET_NEON
654
+ return nk_i64_saturating_mul_neon(a, b);
655
+ #elif NK_TARGET_RVV
656
+ return nk_i64_saturating_mul_rvv(a, b);
657
+ #else
658
+ return nk_i64_saturating_mul_serial(a, b);
659
+ #endif
660
+ }
661
+ NK_PUBLIC nk_i4x2_t nk_i4x2_saturating_mul(nk_i4x2_t a, nk_i4x2_t b) { return nk_i4x2_saturating_mul_serial(a, b); }
662
+ NK_PUBLIC nk_u4x2_t nk_u4x2_saturating_mul(nk_u4x2_t a, nk_u4x2_t b) { return nk_u4x2_saturating_mul_serial(a, b); }
663
+
664
+ NK_PUBLIC int nk_f16_order(nk_f16_t a, nk_f16_t b) {
665
+ #if NK_TARGET_SAPPHIRE
666
+ return nk_f16_order_sapphire(a, b);
667
+ #else
668
+ return nk_f16_order_serial(a, b);
669
+ #endif
670
+ }
671
+ NK_PUBLIC int nk_bf16_order(nk_bf16_t a, nk_bf16_t b) { return nk_bf16_order_serial(a, b); }
672
+ NK_PUBLIC int nk_e4m3_order(nk_e4m3_t a, nk_e4m3_t b) { return nk_e4m3_order_serial(a, b); }
673
+ NK_PUBLIC int nk_e5m2_order(nk_e5m2_t a, nk_e5m2_t b) { return nk_e5m2_order_serial(a, b); }
674
+ NK_PUBLIC int nk_e2m3_order(nk_e2m3_t a, nk_e2m3_t b) { return nk_e2m3_order_serial(a, b); }
675
+ NK_PUBLIC int nk_e3m2_order(nk_e3m2_t a, nk_e3m2_t b) { return nk_e3m2_order_serial(a, b); }
676
+
677
+ #endif // !NK_DYNAMIC_DISPATCH
678
+
679
+ #if defined(__cplusplus)
680
+ } // extern "C"
681
+ #endif
682
+
683
+ #endif // NK_SCALAR_H