numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,332 @@
1
+ /**
2
+ * @brief Software-emulated Scalar Math Helpers for SIMD-free CPUs.
3
+ * @file include/numkong/scalar/serial.h
4
+ * @author Ash Vardanian
5
+ * @date March 1, 2026
6
+ *
7
+ * @sa include/numkong/scalar.h
8
+ *
9
+ * Uses the Quake 3 fast inverse square root trick with Newton-Raphson refinement.
10
+ * Three iterations for f32 (~34.9 correct bits), four for f64 (~69.3 correct bits).
11
+ */
12
+ #ifndef NK_SCALAR_SERIAL_H
13
+ #define NK_SCALAR_SERIAL_H
14
+
15
+ #include "numkong/types.h"
16
+ #include "numkong/cast/serial.h"
17
+
18
+ #if defined(__cplusplus)
19
+ extern "C" {
20
+ #endif
21
+
22
+ NK_PUBLIC nk_f32_t nk_f32_rsqrt_serial(nk_f32_t number) {
23
+ nk_fui32_t conv;
24
+ conv.f = number;
25
+ conv.u = 0x5F375A86 - (conv.u >> 1);
26
+ nk_f32_t y = conv.f;
27
+ y = y * (1.5f - 0.5f * number * y * y);
28
+ y = y * (1.5f - 0.5f * number * y * y);
29
+ y = y * (1.5f - 0.5f * number * y * y);
30
+ return y;
31
+ }
32
+
33
+ NK_PUBLIC nk_f32_t nk_f32_sqrt_serial(nk_f32_t number) { return number > 0 ? number * nk_f32_rsqrt_serial(number) : 0; }
34
+
35
+ NK_PUBLIC nk_f64_t nk_f64_rsqrt_serial(nk_f64_t number) {
36
+ nk_fui64_t conv;
37
+ conv.f = number;
38
+ conv.u = 0x5FE6EB50C7B537A9ULL - (conv.u >> 1);
39
+ nk_f64_t y = conv.f;
40
+ y = y * (1.5 - 0.5 * number * y * y);
41
+ y = y * (1.5 - 0.5 * number * y * y);
42
+ y = y * (1.5 - 0.5 * number * y * y);
43
+ y = y * (1.5 - 0.5 * number * y * y);
44
+ return y;
45
+ }
46
+
47
+ NK_PUBLIC nk_f64_t nk_f64_sqrt_serial(nk_f64_t number) { return number > 0 ? number * nk_f64_rsqrt_serial(number) : 0; }
48
+
49
+ NK_PUBLIC nk_f16_t nk_f16_sqrt_serial(nk_f16_t x) {
50
+ nk_f32_t x_f32;
51
+ nk_f16_to_f32_serial(&x, &x_f32);
52
+ x_f32 = nk_f32_sqrt_serial(x_f32);
53
+ nk_f16_t result;
54
+ nk_f32_to_f16_serial(&x_f32, &result);
55
+ return result;
56
+ }
57
+
58
+ NK_PUBLIC nk_f16_t nk_f16_rsqrt_serial(nk_f16_t x) {
59
+ nk_f32_t x_f32;
60
+ nk_f16_to_f32_serial(&x, &x_f32);
61
+ x_f32 = nk_f32_rsqrt_serial(x_f32);
62
+ nk_f16_t result;
63
+ nk_f32_to_f16_serial(&x_f32, &result);
64
+ return result;
65
+ }
66
+
67
+ /**
68
+ * @brief Software FMA (Fused Multiply-Add) emulation for f64.
69
+ * Computes (multiplicand * multiplier + addend) with improved precision
70
+ * using Dekker's error-free multiplication and Knuth's TwoSum.
71
+ * @sa std::fma, @sa Rust f64::mul_add
72
+ */
73
+ NK_PUBLIC nk_f64_t nk_f64_fma_serial(nk_f64_t multiplicand, nk_f64_t multiplier, nk_f64_t addend) {
74
+ nk_f64_t product = multiplicand * multiplier;
75
+ // Dekker splitting: break each operand into non-overlapping high and low halves
76
+ nk_f64_t const dekker_split = 134217729.0; // 2^27 + 1 for double precision
77
+ nk_f64_t multiplicand_hi = dekker_split * multiplicand;
78
+ nk_f64_t multiplicand_lo = multiplicand - (multiplicand_hi - (multiplicand_hi - multiplicand));
79
+ multiplicand_hi = multiplicand_hi - (multiplicand_hi - multiplicand);
80
+ nk_f64_t multiplier_hi = dekker_split * multiplier;
81
+ nk_f64_t multiplier_lo = multiplier - (multiplier_hi - (multiplier_hi - multiplier));
82
+ multiplier_hi = multiplier_hi - (multiplier_hi - multiplier);
83
+ // Exact multiplication error from the four cross-products
84
+ nk_f64_t product_error = ((multiplicand_hi * multiplier_hi - product) + multiplicand_hi * multiplier_lo +
85
+ multiplicand_lo * multiplier_hi) +
86
+ multiplicand_lo * multiplier_lo;
87
+ // Knuth TwoSum: add the addend with error tracking
88
+ nk_f64_t result = product + addend;
89
+ nk_f64_t addend_recovered = result - product;
90
+ nk_f64_t product_recovered = result - addend_recovered;
91
+ nk_f64_t addition_error = (product - product_recovered) + (addend - addend_recovered);
92
+ return result + (product_error + addition_error);
93
+ }
94
+
95
+ /**
96
+ * @brief Software FMA (Fused Multiply-Add) emulation for f32.
97
+ * Computes (multiplicand * multiplier + addend) with improved precision
98
+ * using Dekker's error-free multiplication and Knuth's TwoSum.
99
+ * @sa std::fma, @sa Rust f32::mul_add
100
+ */
101
+ NK_PUBLIC nk_f32_t nk_f32_fma_serial(nk_f32_t multiplicand, nk_f32_t multiplier, nk_f32_t addend) {
102
+ nk_f32_t product = multiplicand * multiplier;
103
+ // Dekker splitting: break each operand into non-overlapping high and low halves
104
+ nk_f32_t const dekker_split = 4097.0f; // 2^12 + 1 for single precision
105
+ nk_f32_t multiplicand_hi = dekker_split * multiplicand;
106
+ nk_f32_t multiplicand_lo = multiplicand - (multiplicand_hi - (multiplicand_hi - multiplicand));
107
+ multiplicand_hi = multiplicand_hi - (multiplicand_hi - multiplicand);
108
+ nk_f32_t multiplier_hi = dekker_split * multiplier;
109
+ nk_f32_t multiplier_lo = multiplier - (multiplier_hi - (multiplier_hi - multiplier));
110
+ multiplier_hi = multiplier_hi - (multiplier_hi - multiplier);
111
+ // Exact multiplication error from the four cross-products
112
+ nk_f32_t product_error = ((multiplicand_hi * multiplier_hi - product) + multiplicand_hi * multiplier_lo +
113
+ multiplicand_lo * multiplier_hi) +
114
+ multiplicand_lo * multiplier_lo;
115
+ // Knuth TwoSum: add the addend with error tracking
116
+ nk_f32_t result = product + addend;
117
+ nk_f32_t addend_recovered = result - product;
118
+ nk_f32_t product_recovered = result - addend_recovered;
119
+ nk_f32_t addition_error = (product - product_recovered) + (addend - addend_recovered);
120
+ return result + (product_error + addition_error);
121
+ }
122
+
123
+ /**
124
+ * @brief Scalar Dot2 accumulator: sum += a * b with error compensation.
125
+ * Uses TwoProd (via FMA) and TwoSum error-free transformations.
126
+ * @see Ogita, T., Rump, S.M., Oishi, S. (2005). "Accurate Sum and Dot Product"
127
+ */
128
+ NK_INTERNAL void nk_f64_dot2_(nk_f64_t *sum, nk_f64_t *compensation, nk_f64_t a, nk_f64_t b) {
129
+ nk_f64_t product = a * b;
130
+ nk_f64_t product_error = nk_f64_fma_serial(a, b, -product);
131
+ nk_f64_t running_sum = *sum + product;
132
+ nk_f64_t recovered_addend = running_sum - *sum;
133
+ nk_f64_t sum_error = (*sum - (running_sum - recovered_addend)) + (product - recovered_addend);
134
+ *sum = running_sum;
135
+ *compensation += sum_error + product_error;
136
+ }
137
+
138
+ NK_PUBLIC nk_f16_t nk_f16_fma_serial(nk_f16_t a, nk_f16_t b, nk_f16_t c) {
139
+ nk_f32_t a_f32, b_f32, c_f32;
140
+ nk_f16_to_f32_serial(&a, &a_f32);
141
+ nk_f16_to_f32_serial(&b, &b_f32);
142
+ nk_f16_to_f32_serial(&c, &c_f32);
143
+ nk_f32_t result_f32 = nk_f32_fma_serial(a_f32, b_f32, c_f32);
144
+ nk_f16_t result;
145
+ nk_f32_to_f16_serial(&result_f32, &result);
146
+ return result;
147
+ }
148
+
149
+ NK_PUBLIC nk_u8_t nk_u8_saturating_add_serial(nk_u8_t a, nk_u8_t b) {
150
+ nk_u16_t result = (nk_u16_t)a + (nk_u16_t)b;
151
+ return (result > 255u) ? (nk_u8_t)255u : (nk_u8_t)result;
152
+ }
153
+ NK_PUBLIC nk_u16_t nk_u16_saturating_add_serial(nk_u16_t a, nk_u16_t b) {
154
+ nk_u32_t result = (nk_u32_t)a + (nk_u32_t)b;
155
+ return (result > 65535u) ? (nk_u16_t)65535u : (nk_u16_t)result;
156
+ }
157
+ NK_PUBLIC nk_u32_t nk_u32_saturating_add_serial(nk_u32_t a, nk_u32_t b) {
158
+ nk_u64_t result = (nk_u64_t)a + (nk_u64_t)b;
159
+ return (result > 4294967295u) ? (nk_u32_t)4294967295u : (nk_u32_t)result;
160
+ }
161
+ NK_PUBLIC nk_u64_t nk_u64_saturating_add_serial(nk_u64_t a, nk_u64_t b) {
162
+ return (a + b < a) ? 18446744073709551615ull : (a + b);
163
+ }
164
+ NK_PUBLIC nk_i8_t nk_i8_saturating_add_serial(nk_i8_t a, nk_i8_t b) {
165
+ nk_i16_t result = (nk_i16_t)a + (nk_i16_t)b;
166
+ return (result > 127) ? 127 : (result < -128 ? -128 : result);
167
+ }
168
+ NK_PUBLIC nk_i16_t nk_i16_saturating_add_serial(nk_i16_t a, nk_i16_t b) {
169
+ nk_i32_t result = (nk_i32_t)a + (nk_i32_t)b;
170
+ return (result > 32767) ? 32767 : (result < -32768 ? -32768 : result);
171
+ }
172
+ NK_PUBLIC nk_i32_t nk_i32_saturating_add_serial(nk_i32_t a, nk_i32_t b) {
173
+ nk_i64_t result = (nk_i64_t)a + (nk_i64_t)b;
174
+ return (result > 2147483647ll) ? 2147483647ll : (result < -2147483648ll ? -2147483648ll : (nk_i32_t)result);
175
+ }
176
+ NK_PUBLIC nk_i64_t nk_i64_saturating_add_serial(nk_i64_t a, nk_i64_t b) {
177
+ //? We can't just write `-9223372036854775808ll`, even though it's the smallest signed 64-bit value.
178
+ //? The compiler will complain about the number being too large for the type, as it will process the
179
+ //? constant and the sign separately. So we use the same hint that compilers use to define the `INT64_MIN`.
180
+ if ((b > 0) && (a > (9223372036854775807ll) - b)) return 9223372036854775807ll;
181
+ if ((b < 0) && (a < (-9223372036854775807ll - 1ll) - b)) return -9223372036854775807ll - 1ll;
182
+ return a + b;
183
+ }
184
+
185
+ NK_PUBLIC nk_u8_t nk_u8_saturating_mul_serial(nk_u8_t a, nk_u8_t b) {
186
+ nk_u16_t result = (nk_u16_t)a * (nk_u16_t)b;
187
+ return (result > 255) ? 255 : (nk_u8_t)result;
188
+ }
189
+
190
+ NK_PUBLIC nk_u16_t nk_u16_saturating_mul_serial(nk_u16_t a, nk_u16_t b) {
191
+ nk_u32_t result = (nk_u32_t)a * (nk_u32_t)b;
192
+ return (result > 65535) ? 65535 : (nk_u16_t)result;
193
+ }
194
+
195
+ NK_PUBLIC nk_u32_t nk_u32_saturating_mul_serial(nk_u32_t a, nk_u32_t b) {
196
+ nk_u64_t result = (nk_u64_t)a * (nk_u64_t)b;
197
+ return (result > 4294967295u) ? 4294967295u : (nk_u32_t)result;
198
+ }
199
+
200
+ NK_PUBLIC nk_u64_t nk_u64_saturating_mul_serial(nk_u64_t a, nk_u64_t b) {
201
+ // Split the inputs into high and low 32-bit parts
202
+ nk_u64_t a_high = a >> 32;
203
+ nk_u64_t a_low = a & 0xFFFFFFFF;
204
+ nk_u64_t b_high = b >> 32;
205
+ nk_u64_t b_low = b & 0xFFFFFFFF;
206
+
207
+ // Compute partial products
208
+ nk_u64_t upper_product = a_high * b_high;
209
+ nk_u64_t cross_ab = a_high * b_low;
210
+ nk_u64_t cross_ba = a_low * b_high;
211
+ nk_u64_t lower_product = a_low * b_low;
212
+
213
+ // Check if the high part of the result overflows
214
+ nk_u64_t cross_sum = cross_ab + cross_ba;
215
+ if (upper_product || (cross_ab >> 32) || (cross_ba >> 32) || (cross_sum < cross_ab) || (cross_sum >> 32))
216
+ return 18446744073709551615ull;
217
+ nk_u64_t result = (cross_sum << 32) + lower_product;
218
+ if (result < lower_product) return 18446744073709551615ull;
219
+ return result;
220
+ }
221
+
222
+ NK_PUBLIC nk_i8_t nk_i8_saturating_mul_serial(nk_i8_t a, nk_i8_t b) {
223
+ nk_i16_t result = (nk_i16_t)a * (nk_i16_t)b;
224
+ return (result > 127) ? 127 : (result < -128 ? -128 : (nk_i8_t)result);
225
+ }
226
+
227
+ NK_PUBLIC nk_i16_t nk_i16_saturating_mul_serial(nk_i16_t a, nk_i16_t b) {
228
+ nk_i32_t result = (nk_i32_t)a * (nk_i32_t)b;
229
+ return (result > 32767) ? 32767 : (result < -32768 ? -32768 : (nk_i16_t)result);
230
+ }
231
+
232
+ NK_PUBLIC nk_i32_t nk_i32_saturating_mul_serial(nk_i32_t a, nk_i32_t b) {
233
+ nk_i64_t result = (nk_i64_t)a * (nk_i64_t)b;
234
+ return (result > 2147483647ll) ? 2147483647ll : (result < -2147483648ll ? -2147483648ll : (nk_i32_t)result);
235
+ }
236
+
237
+ NK_PUBLIC nk_i64_t nk_i64_saturating_mul_serial(nk_i64_t a, nk_i64_t b) {
238
+ int sign = ((a < 0) ^ (b < 0)) ? -1 : 1; // Track the sign of the result
239
+
240
+ // Take absolute values for easy multiplication and overflow detection
241
+ nk_u64_t abs_a = (a < 0) ? -(nk_u64_t)a : (nk_u64_t)a;
242
+ nk_u64_t abs_b = (b < 0) ? -(nk_u64_t)b : (nk_u64_t)b;
243
+
244
+ // Split the absolute values into high and low 32-bit parts
245
+ nk_u64_t a_high = abs_a >> 32;
246
+ nk_u64_t a_low = abs_a & 0xFFFFFFFF;
247
+ nk_u64_t b_high = abs_b >> 32;
248
+ nk_u64_t b_low = abs_b & 0xFFFFFFFF;
249
+
250
+ // Compute partial products
251
+ nk_u64_t upper_product = a_high * b_high;
252
+ nk_u64_t cross_ab = a_high * b_low;
253
+ nk_u64_t cross_ba = a_low * b_high;
254
+ nk_u64_t lower_product = a_low * b_low;
255
+
256
+ // Check for overflow and saturate based on sign
257
+ nk_u64_t cross_sum = cross_ab + cross_ba;
258
+ if (upper_product || (cross_ab >> 32) || (cross_ba >> 32) || (cross_sum < cross_ab) || (cross_sum >> 32))
259
+ return (sign > 0) ? 9223372036854775807ll : (-9223372036854775807ll - 1ll);
260
+ // Combine parts if no overflow, then apply the sign
261
+ nk_u64_t result = (cross_sum << 32) + lower_product;
262
+ return (sign < 0) ? -((nk_i64_t)result) : (nk_i64_t)result;
263
+ }
264
+
265
+ NK_PUBLIC nk_i4x2_t nk_i4x2_saturating_add_serial(nk_i4x2_t a, nk_i4x2_t b) {
266
+ nk_i8_t low = nk_i4x2_low_(a) + nk_i4x2_low_(b);
267
+ nk_i8_t high = nk_i4x2_high_(a) + nk_i4x2_high_(b);
268
+ low = (low > 7) ? 7 : (low < -8 ? -8 : low);
269
+ high = (high > 7) ? 7 : (high < -8 ? -8 : high);
270
+ return (nk_i4x2_t)((low & 0x0F) | ((high & 0x0F) << 4));
271
+ }
272
+ NK_PUBLIC nk_u4x2_t nk_u4x2_saturating_add_serial(nk_u4x2_t a, nk_u4x2_t b) {
273
+ nk_u8_t low = nk_u4x2_low_(a) + nk_u4x2_low_(b);
274
+ nk_u8_t high = nk_u4x2_high_(a) + nk_u4x2_high_(b);
275
+ low = (low > 15) ? 15 : low;
276
+ high = (high > 15) ? 15 : high;
277
+ return (nk_u4x2_t)((low & 0x0F) | ((high & 0x0F) << 4));
278
+ }
279
+ NK_PUBLIC nk_i4x2_t nk_i4x2_saturating_mul_serial(nk_i4x2_t a, nk_i4x2_t b) {
280
+ nk_i8_t low = nk_i4x2_low_(a) * nk_i4x2_low_(b);
281
+ nk_i8_t high = nk_i4x2_high_(a) * nk_i4x2_high_(b);
282
+ low = (low > 7) ? 7 : (low < -8 ? -8 : low);
283
+ high = (high > 7) ? 7 : (high < -8 ? -8 : high);
284
+ return (nk_i4x2_t)((low & 0x0F) | ((high & 0x0F) << 4));
285
+ }
286
+ NK_PUBLIC nk_u4x2_t nk_u4x2_saturating_mul_serial(nk_u4x2_t a, nk_u4x2_t b) {
287
+ nk_u8_t low = nk_u4x2_low_(a) * nk_u4x2_low_(b);
288
+ nk_u8_t high = nk_u4x2_high_(a) * nk_u4x2_high_(b);
289
+ low = (low > 15) ? 15 : low;
290
+ high = (high > 15) ? 15 : high;
291
+ return (nk_u4x2_t)((low & 0x0F) | ((high & 0x0F) << 4));
292
+ }
293
+
294
+ NK_PUBLIC int nk_e4m3_order_serial(nk_e4m3_t a, nk_e4m3_t b) {
295
+ int sign_a = a >> 7, sign_b = b >> 7;
296
+ return (a ^ -sign_a) - (b ^ -sign_b);
297
+ }
298
+ NK_PUBLIC int nk_e5m2_order_serial(nk_e5m2_t a, nk_e5m2_t b) {
299
+ int sign_a = a >> 7, sign_b = b >> 7;
300
+ return (a ^ -sign_a) - (b ^ -sign_b);
301
+ }
302
+
303
+ NK_PUBLIC int nk_e2m3_order_serial(nk_e2m3_t a, nk_e2m3_t b) {
304
+ int value_a = a & 0x3F, value_b = b & 0x3F;
305
+ int sign_a = value_a >> 5, sign_b = value_b >> 5;
306
+ return (value_a ^ -sign_a) - (value_b ^ -sign_b);
307
+ }
308
+ NK_PUBLIC int nk_e3m2_order_serial(nk_e3m2_t a, nk_e3m2_t b) {
309
+ int value_a = a & 0x3F, value_b = b & 0x3F;
310
+ int sign_a = value_a >> 5, sign_b = value_b >> 5;
311
+ return (value_a ^ -sign_a) - (value_b ^ -sign_b);
312
+ }
313
+
314
+ NK_PUBLIC int nk_bf16_order_serial(nk_bf16_t a, nk_bf16_t b) {
315
+ nk_fui16_t a_fui, b_fui;
316
+ a_fui.bf = a, b_fui.bf = b;
317
+ int sign_a = a_fui.u >> 15, sign_b = b_fui.u >> 15;
318
+ return ((int)a_fui.u ^ -sign_a) - ((int)b_fui.u ^ -sign_b);
319
+ }
320
+
321
+ NK_PUBLIC int nk_f16_order_serial(nk_f16_t a, nk_f16_t b) {
322
+ nk_fui16_t a_fui, b_fui;
323
+ a_fui.f = a, b_fui.f = b;
324
+ int sign_a = a_fui.u >> 15, sign_b = b_fui.u >> 15;
325
+ return ((int)a_fui.u ^ -sign_a) - ((int)b_fui.u ^ -sign_b);
326
+ }
327
+
328
+ #if defined(__cplusplus)
329
+ } // extern "C"
330
+ #endif
331
+
332
+ #endif // NK_SCALAR_SERIAL_H
@@ -0,0 +1,56 @@
1
+ /**
2
+ * @brief SIMD-accelerated Scalar Math Helpers for WASM.
3
+ * @file include/numkong/scalar/v128relaxed.h
4
+ * @author Ash Vardanian
5
+ * @date March 1, 2026
6
+ *
7
+ * @sa include/numkong/scalar.h
8
+ */
9
+ #ifndef NK_SCALAR_V128RELAXED_H
10
+ #define NK_SCALAR_V128RELAXED_H
11
+
12
+ #if NK_TARGET_V128RELAXED
13
+
14
+ #include "numkong/types.h"
15
+
16
+ #if defined(__cplusplus)
17
+ extern "C" {
18
+ #endif
19
+
20
+ #if defined(__clang__)
21
+ #pragma clang attribute push(__attribute__((target("relaxed-simd"))), apply_to = function)
22
+ #endif
23
+
24
+ NK_PUBLIC nk_f32_t nk_f32_sqrt_v128relaxed(nk_f32_t x) {
25
+ return wasm_f32x4_extract_lane(wasm_f32x4_sqrt(wasm_f32x4_splat(x)), 0);
26
+ }
27
+ NK_PUBLIC nk_f64_t nk_f64_sqrt_v128relaxed(nk_f64_t x) {
28
+ return wasm_f64x2_extract_lane(wasm_f64x2_sqrt(wasm_f64x2_splat(x)), 0);
29
+ }
30
+ NK_PUBLIC nk_f32_t nk_f32_rsqrt_v128relaxed(nk_f32_t x) {
31
+ v128_t sqrt_f32x4 = wasm_f32x4_sqrt(wasm_f32x4_splat(x));
32
+ return wasm_f32x4_extract_lane(wasm_f32x4_div(wasm_f32x4_splat(1.0f), sqrt_f32x4), 0);
33
+ }
34
+ NK_PUBLIC nk_f64_t nk_f64_rsqrt_v128relaxed(nk_f64_t x) {
35
+ v128_t sqrt_f64x2 = wasm_f64x2_sqrt(wasm_f64x2_splat(x));
36
+ return wasm_f64x2_extract_lane(wasm_f64x2_div(wasm_f64x2_splat(1.0), sqrt_f64x2), 0);
37
+ }
38
+ NK_PUBLIC nk_f32_t nk_f32_fma_v128relaxed(nk_f32_t a, nk_f32_t b, nk_f32_t c) {
39
+ v128_t result_f32x4 = wasm_f32x4_relaxed_madd(wasm_f32x4_splat(a), wasm_f32x4_splat(b), wasm_f32x4_splat(c));
40
+ return wasm_f32x4_extract_lane(result_f32x4, 0);
41
+ }
42
+ NK_PUBLIC nk_f64_t nk_f64_fma_v128relaxed(nk_f64_t a, nk_f64_t b, nk_f64_t c) {
43
+ v128_t result_f64x2 = wasm_f64x2_relaxed_madd(wasm_f64x2_splat(a), wasm_f64x2_splat(b), wasm_f64x2_splat(c));
44
+ return wasm_f64x2_extract_lane(result_f64x2, 0);
45
+ }
46
+
47
+ #if defined(__clang__)
48
+ #pragma clang attribute pop
49
+ #endif
50
+
51
+ #if defined(__cplusplus)
52
+ } // extern "C"
53
+ #endif
54
+
55
+ #endif // NK_TARGET_V128RELAXED
56
+ #endif // NK_SCALAR_V128RELAXED_H