numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,184 @@
1
+ # Trigonometric Functions in NumKong
2
+
3
+ NumKong implements element-wise trigonometric functions — sine, cosine, and arc tangent — with ~3 ulp error bounds for f32 and faithful rounding for f64.
4
+ Each function operates on dense vectors, reading input angles (radians) and writing output values of the same length.
5
+ The implementations derive from SLEEF (SIMD Library for Evaluating Elementary Functions), adapted for NumKong's ISA dispatch and type system.
6
+
7
+ Sine:
8
+
9
+ ```math
10
+ \text{sin}: \mathbb{R} \to [-1, 1]
11
+ ```
12
+
13
+ Cosine:
14
+
15
+ ```math
16
+ \text{cos}: \mathbb{R} \to [-1, 1]
17
+ ```
18
+
19
+ Arc tangent:
20
+
21
+ ```math
22
+ \text{atan}: \mathbb{R} \to \left(-\frac{\pi}{2}, \frac{\pi}{2}\right)
23
+ ```
24
+
25
+ Reformulating as Python pseudocode:
26
+
27
+ ```python
28
+ import numpy as np
29
+
30
+ def sin(a: np.ndarray) -> np.ndarray:
31
+ return np.sin(a)
32
+
33
+ def cos(a: np.ndarray) -> np.ndarray:
34
+ return np.cos(a)
35
+
36
+ def atan(a: np.ndarray) -> np.ndarray:
37
+ return np.arctan(a)
38
+ ```
39
+
40
+ ## Input & Output Types
41
+
42
+ | Input Type | Output Type | Description |
43
+ | ---------- | ----------- | ------------------------------------------------ |
44
+ | `f64` | `f64` | 64-bit IEEE 754 double precision |
45
+ | `f32` | `f32` | 32-bit IEEE 754 single precision |
46
+ | `f16` | `f16` | 16-bit half precision, widened to f32 internally |
47
+
48
+ ## Optimizations
49
+
50
+ ### Cody-Waite Range Reduction
51
+
52
+ All trigonometric kernels reduce the input angle to $[-\pi/4, \pi/4]$ before polynomial evaluation using Cody-Waite argument reduction.
53
+ The constant $\pi$ is split into high and low parts ($\pi_{\text{hi}} + \pi_{\text{lo}}$) to maintain precision during the subtraction $x - n\pi$: `reduced = (x - n * pi_hi) - n * pi_lo`.
54
+ Single-part subtraction would lose ~3 bits of precision for large multiples of $\pi$; the two-part split preserves the full mantissa.
55
+ The quadrant index $n = \text{round}(x / \pi)$ selects which trigonometric identity to apply (sin-cos swap, sign flip) via a 2-bit branch.
56
+
57
+ ### Minimax Polynomial Approximation
58
+
59
+ `nk_each_sin_f32_serial` and `nk_each_cos_f32_serial` evaluate degree-9 minimax polynomials via Horner's method after range reduction.
60
+ The polynomial coefficients are precomputed to minimize maximum error over $[-\pi/4, \pi/4]$ — Chebyshev-optimal, not Taylor truncation.
61
+ Horner evaluation: `p = c9*x^2 + c7; p = p*x^2 + c5; p = p*x^2 + c3; p = p*x^2 + c1; p = p*x` — 4 FMA operations plus 1 multiply for the final odd-power term.
62
+ `nk_each_sin_f64_serial` uses degree-19 polynomials for 52-bit mantissa coverage.
63
+
64
+ ### Vectorized Polynomial Evaluation
65
+
66
+ `nk_each_sin_f32_haswell` and `nk_each_cos_f32_skylake` evaluate the same polynomial on 8 (AVX2) and 16 (AVX-512) elements simultaneously, respectively.
67
+ Range reduction, quadrant selection, and polynomial evaluation all operate on packed vectors — even the final sign correction stays vectorized, applied via `VBLENDVPS` with the quadrant mask.
68
+ `nk_each_sin_f32_neon` processes 4 elements per iteration using `vfmaq_f32` for the Horner chain.
69
+ WASM v128relaxed (`nk_each_sin_f32_v128relaxed`) uses `f32x4.relaxed_madd` for the FMA steps, achieving ~2x throughput over strict `f32x4.mul` + `f32x4.add` sequences.
70
+
71
+ ## Performance
72
+
73
+ The following performance tables are produced by manually re-running the included internal tools `nk_test` and `nk_bench` to measure both accuracy and throughput at different input shapes.
74
+ The input size is controlled by the `NK_DENSE_DIMENSIONS` environment variable, which was set to 256, 1024, and 4096 elements.
75
+ The throughput is measured in GB/s as the number of input bytes per second.
76
+ Accuracy is reported as mean ULP (units in last place) unless noted otherwise — the average number of representable floating-point values between the result and the exact answer.
77
+ Each kernel runs for at least 20 seconds per configuration.
78
+ Benchmark threads are pinned to specific cores; on machines with heterogeneous core types (e.g., Apple P/E cores), only the fastest cores are used.
79
+ Workloads that significantly degrade CPU frequencies (Intel AMX, Apple SME) run in separate passes to avoid affecting throughput measurements of other kernels.
80
+
81
+ ### Intel Sapphire Rapids
82
+
83
+ #### Native
84
+
85
+ | Kernel | 256 | 1024 | 4096 |
86
+ | :------------------------- | -----------------------: | -----------------------: | -----------------------: |
87
+ | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
88
+ | `nk_each_sin_f64_serial` | 0.994 gb/s, 0 ulp | 0.783 gb/s, 0 ulp | 0.827 gb/s, 0 ulp |
89
+ | `nk_each_cos_f64_serial` | 0.906 gb/s, 0 ulp | 0.784 gb/s, 0 ulp | 0.824 gb/s, 0 ulp |
90
+ | `nk_each_atan_f64_serial` | 0.307 gb/s, 0 ulp | 0.291 gb/s, 0 ulp | 0.291 gb/s, 0 ulp |
91
+ | `nk_each_sin_f64_haswell` | 4.59 gb/s, 0 ulp | 4.19 gb/s, 0 ulp | 4.04 gb/s, 0 ulp |
92
+ | `nk_each_cos_f64_haswell` | 4.25 gb/s, 0 ulp | 4.14 gb/s, 0 ulp | 3.92 gb/s, 0 ulp |
93
+ | `nk_each_atan_f64_haswell` | 3.83 gb/s, 0 ulp | 3.21 gb/s, 0 ulp | 3.49 gb/s, 0 ulp |
94
+ | `nk_each_sin_f64_skylake` | 7.65 gb/s, 0 ulp | 6.55 gb/s, 0 ulp | 4.70 gb/s, 0 ulp |
95
+ | `nk_each_cos_f64_skylake` | 7.88 gb/s, 0 ulp | 5.76 gb/s, 0 ulp | 5.01 gb/s, 0 ulp |
96
+ | `nk_each_atan_f64_skylake` | 5.08 gb/s, 0 ulp | 4.72 gb/s, 0 ulp | 4.58 gb/s, 0 ulp |
97
+ | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
98
+ | `nk_each_sin_f32_serial` | 6.29 gb/s, 5 ulp | 6.07 gb/s, 5 ulp | 5.41 gb/s, 5 ulp |
99
+ | `nk_each_cos_f32_serial` | 7.03 gb/s, 15 ulp | 6.24 gb/s, 15 ulp | 5.16 gb/s, 15 ulp |
100
+ | `nk_each_atan_f32_serial` | 0.642 gb/s, 0.4 ulp | 0.541 gb/s, 0.4 ulp | 0.567 gb/s, 0.4 ulp |
101
+ | `nk_each_sin_f32_haswell` | 10.0 gb/s, 5 ulp | 7.36 gb/s, 5 ulp | 5.63 gb/s, 5 ulp |
102
+ | `nk_each_cos_f32_haswell` | 7.82 gb/s, 15 ulp | 7.11 gb/s, 15 ulp | 5.09 gb/s, 15 ulp |
103
+ | `nk_each_atan_f32_haswell` | 7.63 gb/s, 0.4 ulp | 5.94 gb/s, 0.4 ulp | 5.38 gb/s, 0.4 ulp |
104
+ | `nk_each_sin_f32_skylake` | 11.9 gb/s, 5 ulp | 9.14 gb/s, 5 ulp | 5.43 gb/s, 5 ulp |
105
+ | `nk_each_cos_f32_skylake` | 10.4 gb/s, 15 ulp | 8.26 gb/s, 15 ulp | 5.40 gb/s, 15 ulp |
106
+ | `nk_each_atan_f32_skylake` | 9.07 gb/s, 0.4 ulp | 7.80 gb/s, 0.4 ulp | 5.75 gb/s, 0.4 ulp |
107
+ | __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
108
+ | `nk_each_sin_f16_serial` | 0.112 gb/s, 0.9 ulp | 0.102 gb/s, 1.1 ulp | 0.110 gb/s, 0.9 ulp |
109
+ | `nk_each_cos_f16_serial` | 0.105 gb/s, 12 ulp | 0.0962 gb/s, 12 ulp | 0.0976 gb/s, 12 ulp |
110
+ | `nk_each_atan_f16_serial` | 0.0208 gb/s, 6.4 ulp | 0.0201 gb/s, 6.7 ulp | 0.0204 gb/s, 6.6 ulp |
111
+ | `nk_each_sin_f16_skylake` | 6.05 gb/s, 8.41K ulp | 5.81 gb/s, 8.43K ulp | 5.24 gb/s, 8.41K ulp |
112
+ | `nk_each_cos_f16_skylake` | 6.05 gb/s, 8.34K ulp | 5.20 gb/s, 8.34K ulp | 5.09 gb/s, 8.35K ulp |
113
+ | `nk_each_atan_f16_skylake` | 4.86 gb/s, 16.5K ulp | 5.25 gb/s, 16.6K ulp | 4.76 gb/s, 16.5K ulp |
114
+
115
+ #### WASM
116
+
117
+ Measured with Wasmtime v42 (Cranelift backend).
118
+
119
+ | Kernel | 256 | 1024 | 4096 |
120
+ | :----------------------------- | -----------------------: | -----------------------: | -----------------------: |
121
+ | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
122
+ | `nk_each_sin_f64_serial` | 0.34 gb/s, 0.2 ulp | 0.38 gb/s, 0.2 ulp | 0.08 gb/s, 0.2 ulp |
123
+ | `nk_each_cos_f64_serial` | 0.36 gb/s, 0.3 ulp | 0.39 gb/s, 0.3 ulp | 0.08 gb/s, 0.3 ulp |
124
+ | `nk_each_atan_f64_serial` | 0.11 gb/s, 0.3 ulp | 0.12 gb/s, 0.3 ulp | 0.11 gb/s, 0.3 ulp |
125
+ | `nk_each_sin_f64_v128relaxed` | 0.59 gb/s, 0.2 ulp | 0.26 gb/s, 0.2 ulp | 0.05 gb/s, 0.2 ulp |
126
+ | `nk_each_cos_f64_v128relaxed` | 0.29 gb/s, 0.3 ulp | 0.50 gb/s, 0.3 ulp | 0.03 gb/s, 0.3 ulp |
127
+ | `nk_each_atan_f64_v128relaxed` | 0.11 gb/s, 0.3 ulp | 0.48 gb/s, 0.3 ulp | 0.21 gb/s, 0.3 ulp |
128
+ | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
129
+ | `nk_each_sin_f32_serial` | 0.17 gb/s, 4.9 ulp | 0.51 gb/s, 4.9 ulp | 0.07 gb/s, 4.9 ulp |
130
+ | `nk_each_cos_f32_serial` | 0.05 gb/s, 14.4 ulp | 0.41 gb/s, 14.4 ulp | 0.10 gb/s, 14.4 ulp |
131
+ | `nk_each_atan_f32_serial` | 0.08 gb/s, 0.4 ulp | 0.08 gb/s, 0.4 ulp | 0.09 gb/s, 0.4 ulp |
132
+ | `nk_each_sin_f32_v128relaxed` | 0.13 gb/s, 20.7 ulp | 0.01 gb/s, 20.7 ulp | 0.10 gb/s, 20.7 ulp |
133
+ | `nk_each_cos_f32_v128relaxed` | 0.15 gb/s, 21.9 ulp | 0.32 gb/s, 21.9 ulp | 0.05 gb/s, 21.9 ulp |
134
+ | `nk_each_atan_f32_v128relaxed` | 0.45 gb/s, 0.4 ulp | 0.39 gb/s, 0.4 ulp | 0.15 gb/s, 0.4 ulp |
135
+ | __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
136
+ | `nk_each_sin_f16_serial` | 0.07 gb/s, 1.1 ulp | 0.07 gb/s, 1.1 ulp | 0.07 gb/s, 1.1 ulp |
137
+ | `nk_each_cos_f16_serial` | 0.07 gb/s, 11.8 ulp | 0.07 gb/s, 11.8 ulp | 0.07 gb/s, 11.8 ulp |
138
+ | `nk_each_atan_f16_serial` | 0.03 gb/s, 6.5 ulp | 0.03 gb/s, 6.5 ulp | 0.03 gb/s, 6.5 ulp |
139
+
140
+ ### Apple M4
141
+
142
+ #### Native
143
+
144
+ | Kernel | 256 | 1024 | 4096 |
145
+ | :------------------------ | -----------------------: | -----------------------: | -----------------------: |
146
+ | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
147
+ | `nk_each_sin_f64_serial` | 0.627 gb/s, 0.2 ulp | 0.634 gb/s, 0.2 ulp | 0.639 gb/s, 0.2 ulp |
148
+ | `nk_each_cos_f64_serial` | 0.621 gb/s, 0.3 ulp | 0.632 gb/s, 0.3 ulp | 0.619 gb/s, 0.3 ulp |
149
+ | `nk_each_atan_f64_serial` | 0.153 gb/s, 0.3 ulp | 0.154 gb/s, 0.3 ulp | 0.153 gb/s, 0.3 ulp |
150
+ | `nk_each_sin_f64_neon` | 5.94 gb/s, 0.2 ulp | 5.75 gb/s, 0.2 ulp | 5.82 gb/s, 0.2 ulp |
151
+ | `nk_each_cos_f64_neon` | 5.15 gb/s, 0.3 ulp | 5.36 gb/s, 0.3 ulp | 5.37 gb/s, 0.3 ulp |
152
+ | `nk_each_atan_f64_neon` | 3.53 gb/s, 0.3 ulp | 3.50 gb/s, 0.3 ulp | 3.50 gb/s, 0.3 ulp |
153
+ | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
154
+ | `nk_each_sin_f32_serial` | 7.94 gb/s, 4.9 ulp | 7.94 gb/s, 4.9 ulp | 7.23 gb/s, 4.9 ulp |
155
+ | `nk_each_cos_f32_serial` | 7.26 gb/s, 14 ulp | 6.41 gb/s, 14 ulp | 6.52 gb/s, 14 ulp |
156
+ | `nk_each_atan_f32_serial` | 0.128 gb/s, 0.4 ulp | 0.129 gb/s, 0.4 ulp | 0.126 gb/s, 0.4 ulp |
157
+ | `nk_each_sin_f32_neon` | 9.75 gb/s, 4.9 ulp | 9.44 gb/s, 4.9 ulp | 8.13 gb/s, 4.9 ulp |
158
+ | `nk_each_cos_f32_neon` | 8.68 gb/s, 18 ulp | 7.77 gb/s, 18 ulp | 7.84 gb/s, 18 ulp |
159
+ | `nk_each_atan_f32_neon` | 5.57 gb/s, 0.4 ulp | 5.00 gb/s, 0.4 ulp | 5.10 gb/s, 0.4 ulp |
160
+ | __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
161
+ | `nk_each_sin_f16_serial` | 3.66 gb/s, 1.3 ulp | 3.71 gb/s, 1.3 ulp | 3.38 gb/s, 1.3 ulp |
162
+ | `nk_each_cos_f16_serial` | 3.28 gb/s, 12 ulp | 3.29 gb/s, 12 ulp | 3.15 gb/s, 12 ulp |
163
+ | `nk_each_atan_f16_serial` | 0.0639 gb/s, 6.5 ulp | 0.0626 gb/s, 6.5 ulp | 0.0627 gb/s, 6.5 ulp |
164
+
165
+ #### WASM
166
+
167
+ Measured with Wasmtime v42 (Cranelift backend).
168
+
169
+ | Kernel | 256 | 1024 | 4096 |
170
+ | :----------------------------- | -----------------------: | -----------------------: | -----------------------: |
171
+ | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
172
+ | `nk_each_sin_f64_serial` | 0.619 gb/s, 0.2 ulp | 0.517 gb/s, 0.2 ulp | 0.705 gb/s, 0.2 ulp |
173
+ | `nk_each_sin_f64_v128relaxed` | 9.10 gb/s, 0.2 ulp | 3.58 gb/s, 0.2 ulp | 8.93 gb/s, 0.2 ulp |
174
+ | `nk_each_cos_f64_serial` | 0.595 gb/s, 0.3 ulp | 0.501 gb/s, 0.3 ulp | 0.681 gb/s, 0.3 ulp |
175
+ | `nk_each_cos_f64_v128relaxed` | 9.35 gb/s, 0.3 ulp | 3.99 gb/s, 0.3 ulp | 9.16 gb/s, 0.3 ulp |
176
+ | `nk_each_atan_f64_serial` | 0.188 gb/s, 0.3 ulp | 0.157 gb/s, 0.3 ulp | 0.214 gb/s, 0.3 ulp |
177
+ | `nk_each_atan_f64_v128relaxed` | 6.22 gb/s, 0.3 ulp | 2.44 gb/s, 0.3 ulp | 6.06 gb/s, 0.3 ulp |
178
+ | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
179
+ | `nk_each_sin_f32_serial` | 8.08 gb/s, 4.9 ulp | 4.26 gb/s, 4.9 ulp | 9.32 gb/s, 4.9 ulp |
180
+ | `nk_each_sin_f32_v128relaxed` | 19.1 gb/s, 20 ulp | 7.19 gb/s, 20 ulp | 17.1 gb/s, 20 ulp |
181
+ | `nk_each_cos_f32_serial` | 7.45 gb/s, 14 ulp | 3.88 gb/s, 14 ulp | 8.52 gb/s, 14 ulp |
182
+ | `nk_each_cos_f32_v128relaxed` | 17.8 gb/s, 21 ulp | 7.06 gb/s, 21 ulp | 16.2 gb/s, 21 ulp |
183
+ | `nk_each_atan_f32_serial` | 0.151 gb/s, 0.4 ulp | 0.0950 gb/s, 0.4 ulp | 0.175 gb/s, 0.4 ulp |
184
+ | `nk_each_atan_f32_v128relaxed` | 11.3 gb/s, 0.4 ulp | 4.22 gb/s, 0.4 ulp | 10.9 gb/s, 0.4 ulp |