numkong 7.0.0 → 7.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315) hide show
  1. package/README.md +239 -122
  2. package/binding.gyp +25 -491
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -52,499 +52,500 @@ extern "C" {
52
52
 
53
53
  NK_INTERNAL v128_t nk_f32x4_sin_v128relaxed_(v128_t const angles_radians) {
54
54
  // Constants for argument reduction
55
- v128_t const pi = wasm_f32x4_splat(3.14159265358979323846f);
56
- v128_t const pi_reciprocal = wasm_f32x4_splat(0.31830988618379067154f);
57
- v128_t const coeff_5 = wasm_f32x4_splat(-0.0001881748176f);
58
- v128_t const coeff_3 = wasm_f32x4_splat(+0.008323502727f);
59
- v128_t const coeff_1 = wasm_f32x4_splat(-0.1666651368f);
60
-
61
- // Compute (multiples_of_pi) = round(angle / pi) using nearest rounding
62
- v128_t quotients = wasm_f32x4_mul(angles_radians, pi_reciprocal);
63
- v128_t rounded_quotients = wasm_f32x4_nearest(quotients);
55
+ v128_t const pi_f32x4 = wasm_f32x4_splat(3.14159265358979323846f);
56
+ v128_t const pi_reciprocal_f32x4 = wasm_f32x4_splat(0.31830988618379067154f);
57
+ v128_t const coeff_5_f32x4 = wasm_f32x4_splat(-0.0001881748176f);
58
+ v128_t const coeff_3_f32x4 = wasm_f32x4_splat(+0.008323502727f);
59
+ v128_t const coeff_1_f32x4 = wasm_f32x4_splat(-0.1666651368f);
60
+
61
+ // Compute (multiples_of_pi_f32x4) = round(angle / pi_f32x4) using nearest rounding
62
+ v128_t quotients_f32x4 = wasm_f32x4_mul(angles_radians, pi_reciprocal_f32x4);
63
+ v128_t rounded_quotients_f32x4 = wasm_f32x4_nearest(quotients_f32x4);
64
64
  // relaxed_trunc: 1 instruction (cvttps2dq) vs 7 (with NaN/overflow fixup) on x86.
65
- // Safe because rounded_quotients are small integers from nearest(), never NaN or out of i32 range.
66
- v128_t multiples_of_pi = wasm_i32x4_relaxed_trunc_f32x4(rounded_quotients);
65
+ // Safe because rounded_quotients_f32x4 are small integers from nearest(), never NaN or out of i32 range.
66
+ v128_t multiples_of_pi_f32x4 = wasm_i32x4_relaxed_trunc_f32x4(rounded_quotients_f32x4);
67
67
 
68
- // Reduce the angle: angle - rounded_quotients * pi
68
+ // Reduce the angle: angle - rounded_quotients_f32x4 * pi_f32x4
69
69
  // vfmsq_f32(acc, a, b) = acc - a*b -> wasm_f32x4_relaxed_nmadd(a, b, acc)
70
- v128_t const angles = wasm_f32x4_relaxed_nmadd(rounded_quotients, pi, angles_radians);
71
- v128_t const angles_squared = wasm_f32x4_mul(angles, angles);
72
- v128_t const angles_cubed = wasm_f32x4_mul(angles, angles_squared);
70
+ v128_t const angles_f32x4 = wasm_f32x4_relaxed_nmadd(rounded_quotients_f32x4, pi_f32x4, angles_radians);
71
+ v128_t const angles_squared_f32x4 = wasm_f32x4_mul(angles_f32x4, angles_f32x4);
72
+ v128_t const angles_cubed_f32x4 = wasm_f32x4_mul(angles_f32x4, angles_squared_f32x4);
73
73
 
74
74
  // Compute the polynomial approximation
75
75
  // vfmaq_f32(acc, a, b) = acc + a*b -> wasm_f32x4_relaxed_madd(a, b, acc)
76
- v128_t polynomials = coeff_5;
77
- polynomials = wasm_f32x4_relaxed_madd(polynomials, angles_squared, coeff_3);
78
- polynomials = wasm_f32x4_relaxed_madd(polynomials, angles_squared, coeff_1);
79
- v128_t results = wasm_f32x4_relaxed_madd(angles_cubed, polynomials, angles);
80
-
81
- // If multiples_of_pi is odd, flip the sign
82
- v128_t parity = wasm_v128_and(multiples_of_pi, wasm_i32x4_splat(1));
83
- v128_t odd_mask = wasm_i32x4_eq(parity, wasm_i32x4_splat(1));
84
- v128_t negated = wasm_f32x4_neg(results);
76
+ v128_t polynomials_f32x4 = coeff_5_f32x4;
77
+ polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, angles_squared_f32x4, coeff_3_f32x4);
78
+ polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, angles_squared_f32x4, coeff_1_f32x4);
79
+ v128_t results_f32x4 = wasm_f32x4_relaxed_madd(angles_cubed_f32x4, polynomials_f32x4, angles_f32x4);
80
+
81
+ // If multiples_of_pi_f32x4 is odd, flip the sign
82
+ v128_t parity_i32x4 = wasm_v128_and(multiples_of_pi_f32x4, wasm_i32x4_splat(1));
83
+ v128_t odd_mask_i32x4 = wasm_i32x4_eq(parity_i32x4, wasm_i32x4_splat(1));
84
+ v128_t negated_f32x4 = wasm_f32x4_neg(results_f32x4);
85
85
  // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
86
86
  // Safe because mask is from comparison (all-ones or all-zeros per lane).
87
- results = wasm_i32x4_relaxed_laneselect(negated, results, odd_mask);
88
- return results;
87
+ results_f32x4 = wasm_i32x4_relaxed_laneselect(negated_f32x4, results_f32x4, odd_mask_i32x4);
88
+ return results_f32x4;
89
89
  }
90
90
 
91
91
  NK_INTERNAL v128_t nk_f32x4_cos_v128relaxed_(v128_t const angles_radians) {
92
92
  // Constants for argument reduction
93
- v128_t const pi = wasm_f32x4_splat(3.14159265358979323846f);
94
- v128_t const pi_half = wasm_f32x4_splat(1.57079632679489661923f);
95
- v128_t const pi_reciprocal = wasm_f32x4_splat(0.31830988618379067154f);
96
- v128_t const coeff_5 = wasm_f32x4_splat(-0.0001881748176f);
97
- v128_t const coeff_3 = wasm_f32x4_splat(+0.008323502727f);
98
- v128_t const coeff_1 = wasm_f32x4_splat(-0.1666651368f);
99
-
100
- // Compute round((angle / pi) - 0.5)
101
- v128_t const neg_half = wasm_f32x4_splat(-0.5f);
102
- v128_t quotients = wasm_f32x4_relaxed_madd(angles_radians, pi_reciprocal, neg_half);
103
- v128_t rounded_quotients = wasm_f32x4_nearest(quotients);
93
+ v128_t const pi_f32x4 = wasm_f32x4_splat(3.14159265358979323846f);
94
+ v128_t const pi_half_f32x4 = wasm_f32x4_splat(1.57079632679489661923f);
95
+ v128_t const pi_reciprocal_f32x4 = wasm_f32x4_splat(0.31830988618379067154f);
96
+ v128_t const coeff_5_f32x4 = wasm_f32x4_splat(-0.0001881748176f);
97
+ v128_t const coeff_3_f32x4 = wasm_f32x4_splat(+0.008323502727f);
98
+ v128_t const coeff_1_f32x4 = wasm_f32x4_splat(-0.1666651368f);
99
+
100
+ // Compute round((angle / pi_f32x4) - 0.5)
101
+ v128_t const neg_half_f32x4 = wasm_f32x4_splat(-0.5f);
102
+ v128_t quotients_f32x4 = wasm_f32x4_relaxed_madd(angles_radians, pi_reciprocal_f32x4, neg_half_f32x4);
103
+ v128_t rounded_quotients_f32x4 = wasm_f32x4_nearest(quotients_f32x4);
104
104
  // relaxed_trunc: 1 instruction (cvttps2dq) vs 7 (with NaN/overflow fixup) on x86.
105
- // Safe because rounded_quotients are small integers from nearest(), never NaN or out of i32 range.
106
- v128_t multiples_of_pi = wasm_i32x4_relaxed_trunc_f32x4(rounded_quotients);
105
+ // Safe because rounded_quotients_f32x4 are small integers from nearest(), never NaN or out of i32 range.
106
+ v128_t multiples_of_pi_f32x4 = wasm_i32x4_relaxed_trunc_f32x4(rounded_quotients_f32x4);
107
107
 
108
- // Reduce the angle: (angle - pi/2) - rounded_quotients * pi
109
- v128_t shifted = wasm_f32x4_sub(angles_radians, pi_half);
110
- v128_t const angles = wasm_f32x4_relaxed_nmadd(rounded_quotients, pi, shifted);
111
- v128_t const angles_squared = wasm_f32x4_mul(angles, angles);
112
- v128_t const angles_cubed = wasm_f32x4_mul(angles, angles_squared);
108
+ // Reduce the angle: (angle - pi_f32x4/2) - rounded_quotients_f32x4 * pi_f32x4
109
+ v128_t shifted_f32x4 = wasm_f32x4_sub(angles_radians, pi_half_f32x4);
110
+ v128_t const angles_f32x4 = wasm_f32x4_relaxed_nmadd(rounded_quotients_f32x4, pi_f32x4, shifted_f32x4);
111
+ v128_t const angles_squared_f32x4 = wasm_f32x4_mul(angles_f32x4, angles_f32x4);
112
+ v128_t const angles_cubed_f32x4 = wasm_f32x4_mul(angles_f32x4, angles_squared_f32x4);
113
113
 
114
114
  // Compute the polynomial approximation
115
- v128_t polynomials = coeff_5;
116
- polynomials = wasm_f32x4_relaxed_madd(polynomials, angles_squared, coeff_3);
117
- polynomials = wasm_f32x4_relaxed_madd(polynomials, angles_squared, coeff_1);
118
- v128_t results = wasm_f32x4_relaxed_madd(angles_cubed, polynomials, angles);
119
-
120
- // If multiples_of_pi is even, flip the sign
121
- v128_t parity = wasm_v128_and(multiples_of_pi, wasm_i32x4_splat(1));
122
- v128_t even_mask = wasm_i32x4_eq(parity, wasm_i32x4_splat(0));
123
- v128_t negated = wasm_f32x4_neg(results);
115
+ v128_t polynomials_f32x4 = coeff_5_f32x4;
116
+ polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, angles_squared_f32x4, coeff_3_f32x4);
117
+ polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, angles_squared_f32x4, coeff_1_f32x4);
118
+ v128_t results_f32x4 = wasm_f32x4_relaxed_madd(angles_cubed_f32x4, polynomials_f32x4, angles_f32x4);
119
+
120
+ // If multiples_of_pi_f32x4 is even, flip the sign
121
+ v128_t parity_i32x4 = wasm_v128_and(multiples_of_pi_f32x4, wasm_i32x4_splat(1));
122
+ v128_t even_mask_i32x4 = wasm_i32x4_eq(parity_i32x4, wasm_i32x4_splat(0));
123
+ v128_t negated_f32x4 = wasm_f32x4_neg(results_f32x4);
124
124
  // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
125
125
  // Safe because mask is from comparison (all-ones or all-zeros per lane).
126
- results = wasm_i32x4_relaxed_laneselect(negated, results, even_mask);
127
- return results;
126
+ results_f32x4 = wasm_i32x4_relaxed_laneselect(negated_f32x4, results_f32x4, even_mask_i32x4);
127
+ return results_f32x4;
128
128
  }
129
129
 
130
130
  NK_INTERNAL v128_t nk_f32x4_atan_v128relaxed_(v128_t const inputs) {
131
131
  // Polynomial coefficients for atan approximation (8 terms)
132
- v128_t const coeff_8 = wasm_f32x4_splat(-0.333331018686294555664062f);
133
- v128_t const coeff_7 = wasm_f32x4_splat(+0.199926957488059997558594f);
134
- v128_t const coeff_6 = wasm_f32x4_splat(-0.142027363181114196777344f);
135
- v128_t const coeff_5 = wasm_f32x4_splat(+0.106347933411598205566406f);
136
- v128_t const coeff_4 = wasm_f32x4_splat(-0.0748900920152664184570312f);
137
- v128_t const coeff_3 = wasm_f32x4_splat(+0.0425049886107444763183594f);
138
- v128_t const coeff_2 = wasm_f32x4_splat(-0.0159569028764963150024414f);
139
- v128_t const coeff_1 = wasm_f32x4_splat(+0.00282363896258175373077393f);
140
- v128_t const half_pi = wasm_f32x4_splat(1.5707963267948966f);
141
-
142
- // Detect negative values and take absolute value
143
- v128_t const zeros = wasm_f32x4_splat(0);
144
- v128_t negative_mask = wasm_f32x4_lt(inputs, zeros);
145
- v128_t values = wasm_f32x4_abs(inputs);
146
-
147
- // Check if values > 1 (need reciprocal)
148
- v128_t reciprocal_mask = wasm_f32x4_gt(values, wasm_f32x4_splat(1.0f));
132
+ v128_t const coeff_8_f32x4 = wasm_f32x4_splat(-0.333331018686294555664062f);
133
+ v128_t const coeff_7_f32x4 = wasm_f32x4_splat(+0.199926957488059997558594f);
134
+ v128_t const coeff_6_f32x4 = wasm_f32x4_splat(-0.142027363181114196777344f);
135
+ v128_t const coeff_5_f32x4 = wasm_f32x4_splat(+0.106347933411598205566406f);
136
+ v128_t const coeff_4_f32x4 = wasm_f32x4_splat(-0.0748900920152664184570312f);
137
+ v128_t const coeff_3_f32x4 = wasm_f32x4_splat(+0.0425049886107444763183594f);
138
+ v128_t const coeff_2_f32x4 = wasm_f32x4_splat(-0.0159569028764963150024414f);
139
+ v128_t const coeff_1_f32x4 = wasm_f32x4_splat(+0.00282363896258175373077393f);
140
+ v128_t const half_pi_f32x4 = wasm_f32x4_splat(1.5707963267948966f);
141
+
142
+ // Detect negative values and take absolute value
143
+ v128_t const zeros_f32x4 = wasm_f32x4_splat(0);
144
+ v128_t negative_mask_f32x4 = wasm_f32x4_lt(inputs, zeros_f32x4);
145
+ v128_t values_f32x4 = wasm_f32x4_abs(inputs);
146
+
147
+ // Check if values_f32x4 > 1 (need reciprocal)
148
+ v128_t reciprocal_mask_f32x4 = wasm_f32x4_gt(values_f32x4, wasm_f32x4_splat(1.0f));
149
149
 
150
150
  // No fast reciprocal in WASM — use division
151
- v128_t recip = wasm_f32x4_div(wasm_f32x4_splat(1.0f), values);
151
+ v128_t recip_f32x4 = wasm_f32x4_div(wasm_f32x4_splat(1.0f), values_f32x4);
152
152
  // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
153
- // Safe because mask is from comparison (all-ones or all-zeros per lane).
154
- values = wasm_i32x4_relaxed_laneselect(recip, values, reciprocal_mask);
153
+ // Safe because mask is from comparison (all-ones or all-zeros per lane).
154
+ values_f32x4 = wasm_i32x4_relaxed_laneselect(recip_f32x4, values_f32x4, reciprocal_mask_f32x4);
155
155
 
156
156
  // Compute powers
157
- v128_t const values_squared = wasm_f32x4_mul(values, values);
158
- v128_t const values_cubed = wasm_f32x4_mul(values, values_squared);
157
+ v128_t const values_squared_f32x4 = wasm_f32x4_mul(values_f32x4, values_f32x4);
158
+ v128_t const values_cubed_f32x4 = wasm_f32x4_mul(values_f32x4, values_squared_f32x4);
159
159
 
160
160
  // Polynomial evaluation using Horner's method
161
- v128_t polynomials = coeff_1;
162
- polynomials = wasm_f32x4_relaxed_madd(polynomials, values_squared, coeff_2);
163
- polynomials = wasm_f32x4_relaxed_madd(polynomials, values_squared, coeff_3);
164
- polynomials = wasm_f32x4_relaxed_madd(polynomials, values_squared, coeff_4);
165
- polynomials = wasm_f32x4_relaxed_madd(polynomials, values_squared, coeff_5);
166
- polynomials = wasm_f32x4_relaxed_madd(polynomials, values_squared, coeff_6);
167
- polynomials = wasm_f32x4_relaxed_madd(polynomials, values_squared, coeff_7);
168
- polynomials = wasm_f32x4_relaxed_madd(polynomials, values_squared, coeff_8);
169
-
170
- // Compute result: atan(x) ~ x + x^3 * P(x^2)
171
- v128_t result = wasm_f32x4_relaxed_madd(values_cubed, polynomials, values);
172
-
173
- // Adjust for reciprocal: result = pi/2 - result
174
- v128_t adjusted = wasm_f32x4_sub(half_pi, result);
161
+ v128_t polynomials_f32x4 = coeff_1_f32x4;
162
+ polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, values_squared_f32x4, coeff_2_f32x4);
163
+ polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, values_squared_f32x4, coeff_3_f32x4);
164
+ polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, values_squared_f32x4, coeff_4_f32x4);
165
+ polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, values_squared_f32x4, coeff_5_f32x4);
166
+ polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, values_squared_f32x4, coeff_6_f32x4);
167
+ polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, values_squared_f32x4, coeff_7_f32x4);
168
+ polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, values_squared_f32x4, coeff_8_f32x4);
169
+
170
+ // Compute result_f32x4: atan(x) ~ x + x^3 * P(x^2)
171
+ v128_t result_f32x4 = wasm_f32x4_relaxed_madd(values_cubed_f32x4, polynomials_f32x4, values_f32x4);
172
+
173
+ // Adjust for reciprocal: result_f32x4 = pi/2 - result_f32x4
174
+ v128_t adjusted_f32x4 = wasm_f32x4_sub(half_pi_f32x4, result_f32x4);
175
175
  // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
176
- // Safe because mask is from comparison (all-ones or all-zeros per lane).
177
- result = wasm_i32x4_relaxed_laneselect(adjusted, result, reciprocal_mask);
176
+ // Safe because mask is from comparison (all-ones or all-zeros per lane).
177
+ result_f32x4 = wasm_i32x4_relaxed_laneselect(adjusted_f32x4, result_f32x4, reciprocal_mask_f32x4);
178
178
 
179
- // Adjust for negative: result = -result
180
- v128_t negated = wasm_f32x4_neg(result);
179
+ // Adjust for negative: result_f32x4 = -result_f32x4
180
+ v128_t negated_f32x4 = wasm_f32x4_neg(result_f32x4);
181
181
  // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
182
- // Safe because mask is from comparison (all-ones or all-zeros per lane).
183
- result = wasm_i32x4_relaxed_laneselect(negated, result, negative_mask);
184
- return result;
182
+ // Safe because mask is from comparison (all-ones or all-zeros per lane).
183
+ result_f32x4 = wasm_i32x4_relaxed_laneselect(negated_f32x4, result_f32x4, negative_mask_f32x4);
184
+ return result_f32x4;
185
185
  }
186
186
 
187
187
  NK_INTERNAL v128_t nk_f32x4_atan2_v128relaxed_(v128_t const ys_inputs, v128_t const xs_inputs) {
188
188
  // Polynomial coefficients (same as atan)
189
- v128_t const coeff_8 = wasm_f32x4_splat(-0.333331018686294555664062f);
190
- v128_t const coeff_7 = wasm_f32x4_splat(+0.199926957488059997558594f);
191
- v128_t const coeff_6 = wasm_f32x4_splat(-0.142027363181114196777344f);
192
- v128_t const coeff_5 = wasm_f32x4_splat(+0.106347933411598205566406f);
193
- v128_t const coeff_4 = wasm_f32x4_splat(-0.0748900920152664184570312f);
194
- v128_t const coeff_3 = wasm_f32x4_splat(+0.0425049886107444763183594f);
195
- v128_t const coeff_2 = wasm_f32x4_splat(-0.0159569028764963150024414f);
196
- v128_t const coeff_1 = wasm_f32x4_splat(+0.00282363896258175373077393f);
197
- v128_t const pi = wasm_f32x4_splat(3.14159265358979323846f);
198
- v128_t const half_pi = wasm_f32x4_splat(1.5707963267948966f);
199
- v128_t const zeros = wasm_f32x4_splat(0);
189
+ v128_t const coeff_8_f32x4 = wasm_f32x4_splat(-0.333331018686294555664062f);
190
+ v128_t const coeff_7_f32x4 = wasm_f32x4_splat(+0.199926957488059997558594f);
191
+ v128_t const coeff_6_f32x4 = wasm_f32x4_splat(-0.142027363181114196777344f);
192
+ v128_t const coeff_5_f32x4 = wasm_f32x4_splat(+0.106347933411598205566406f);
193
+ v128_t const coeff_4_f32x4 = wasm_f32x4_splat(-0.0748900920152664184570312f);
194
+ v128_t const coeff_3_f32x4 = wasm_f32x4_splat(+0.0425049886107444763183594f);
195
+ v128_t const coeff_2_f32x4 = wasm_f32x4_splat(-0.0159569028764963150024414f);
196
+ v128_t const coeff_1_f32x4 = wasm_f32x4_splat(+0.00282363896258175373077393f);
197
+ v128_t const pi_f32x4 = wasm_f32x4_splat(3.14159265358979323846f);
198
+ v128_t const half_pi_f32x4 = wasm_f32x4_splat(1.5707963267948966f);
199
+ v128_t const zeros_f32x4 = wasm_f32x4_splat(0);
200
200
 
201
201
  // Quadrant adjustments - take absolute values
202
- v128_t xs_negative_mask = wasm_f32x4_lt(xs_inputs, zeros);
203
- v128_t xs = wasm_f32x4_abs(xs_inputs);
204
- v128_t ys = wasm_f32x4_abs(ys_inputs);
202
+ v128_t xs_negative_mask_f32x4 = wasm_f32x4_lt(xs_inputs, zeros_f32x4);
203
+ v128_t xs_f32x4 = wasm_f32x4_abs(xs_inputs);
204
+ v128_t ys_f32x4 = wasm_f32x4_abs(ys_inputs);
205
205
 
206
206
  // Ensure proper fraction where numerator < denominator
207
- v128_t swap_mask = wasm_f32x4_gt(ys, xs);
208
- v128_t temps = xs;
207
+ v128_t swap_mask_f32x4 = wasm_f32x4_gt(ys_f32x4, xs_f32x4);
208
+ v128_t temps_f32x4 = xs_f32x4;
209
209
  // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
210
- // Safe because mask is from comparison (all-ones or all-zeros per lane).
211
- xs = wasm_i32x4_relaxed_laneselect(ys, xs, swap_mask);
212
- ys = wasm_i32x4_relaxed_laneselect(wasm_f32x4_neg(temps), ys, swap_mask);
210
+ // Safe because mask is from comparison (all-ones or all-zeros per lane).
211
+ xs_f32x4 = wasm_i32x4_relaxed_laneselect(ys_f32x4, xs_f32x4, swap_mask_f32x4);
212
+ ys_f32x4 = wasm_i32x4_relaxed_laneselect(wasm_f32x4_neg(temps_f32x4), ys_f32x4, swap_mask_f32x4);
213
213
 
214
- // Division for ratio: ratio = ys / xs
215
- v128_t const ratio = wasm_f32x4_div(ys, xs);
216
- v128_t const ratio_squared = wasm_f32x4_mul(ratio, ratio);
217
- v128_t const ratio_cubed = wasm_f32x4_mul(ratio, ratio_squared);
214
+ // Division for ratio_f32x4: ratio_f32x4 = ys_f32x4 / xs_f32x4
215
+ v128_t const ratio_f32x4 = wasm_f32x4_div(ys_f32x4, xs_f32x4);
216
+ v128_t const ratio_squared_f32x4 = wasm_f32x4_mul(ratio_f32x4, ratio_f32x4);
217
+ v128_t const ratio_cubed_f32x4 = wasm_f32x4_mul(ratio_f32x4, ratio_squared_f32x4);
218
218
 
219
219
  // Polynomial evaluation using Horner's method
220
- v128_t polynomials = coeff_1;
221
- polynomials = wasm_f32x4_relaxed_madd(polynomials, ratio_squared, coeff_2);
222
- polynomials = wasm_f32x4_relaxed_madd(polynomials, ratio_squared, coeff_3);
223
- polynomials = wasm_f32x4_relaxed_madd(polynomials, ratio_squared, coeff_4);
224
- polynomials = wasm_f32x4_relaxed_madd(polynomials, ratio_squared, coeff_5);
225
- polynomials = wasm_f32x4_relaxed_madd(polynomials, ratio_squared, coeff_6);
226
- polynomials = wasm_f32x4_relaxed_madd(polynomials, ratio_squared, coeff_7);
227
- polynomials = wasm_f32x4_relaxed_madd(polynomials, ratio_squared, coeff_8);
220
+ v128_t polynomials_f32x4 = coeff_1_f32x4;
221
+ polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, ratio_squared_f32x4, coeff_2_f32x4);
222
+ polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, ratio_squared_f32x4, coeff_3_f32x4);
223
+ polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, ratio_squared_f32x4, coeff_4_f32x4);
224
+ polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, ratio_squared_f32x4, coeff_5_f32x4);
225
+ polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, ratio_squared_f32x4, coeff_6_f32x4);
226
+ polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, ratio_squared_f32x4, coeff_7_f32x4);
227
+ polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, ratio_squared_f32x4, coeff_8_f32x4);
228
228
 
229
229
  // Compute the result
230
- v128_t results = wasm_f32x4_relaxed_madd(ratio_cubed, polynomials, ratio);
230
+ v128_t results_f32x4 = wasm_f32x4_relaxed_madd(ratio_cubed_f32x4, polynomials_f32x4, ratio_f32x4);
231
231
 
232
- // Compute quadrant value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
232
+ // Compute quadrant_f32x4 value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
233
233
  // -2 for x<0 && !swap, -1 for x<0 && swap
234
- v128_t quadrant = wasm_f32x4_splat(0.0f);
235
- v128_t neg_two = wasm_f32x4_splat(-2.0f);
234
+ v128_t quadrant_f32x4 = wasm_f32x4_splat(0.0f);
235
+ v128_t neg_two_f32x4 = wasm_f32x4_splat(-2.0f);
236
236
  // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
237
- // Safe because mask is from comparison (all-ones or all-zeros per lane).
238
- quadrant = wasm_i32x4_relaxed_laneselect(neg_two, quadrant, xs_negative_mask);
239
- v128_t quadrant_incremented = wasm_f32x4_add(quadrant, wasm_f32x4_splat(1.0f));
240
- quadrant = wasm_i32x4_relaxed_laneselect(quadrant_incremented, quadrant, swap_mask);
237
+ // Safe because mask is from comparison (all-ones or all-zeros per lane).
238
+ quadrant_f32x4 = wasm_i32x4_relaxed_laneselect(neg_two_f32x4, quadrant_f32x4, xs_negative_mask_f32x4);
239
+ v128_t quadrant_incremented_f32x4 = wasm_f32x4_add(quadrant_f32x4, wasm_f32x4_splat(1.0f));
240
+ quadrant_f32x4 = wasm_i32x4_relaxed_laneselect(quadrant_incremented_f32x4, quadrant_f32x4, swap_mask_f32x4);
241
241
 
242
- // Adjust for quadrant: result += quadrant * pi/2
243
- results = wasm_f32x4_relaxed_madd(quadrant, half_pi, results);
242
+ // Adjust for quadrant: results_f32x4 += quadrant_f32x4 * pi/2
243
+ results_f32x4 = wasm_f32x4_relaxed_madd(quadrant_f32x4, half_pi_f32x4, results_f32x4);
244
244
 
245
245
  // Transfer sign from x and y by XOR with sign bits
246
- v128_t sign_mask = wasm_f32x4_splat(-0.0f);
247
- v128_t xs_sign = wasm_v128_and(xs_inputs, sign_mask);
248
- v128_t ys_sign = wasm_v128_and(ys_inputs, sign_mask);
249
- results = wasm_v128_xor(results, xs_sign);
250
- results = wasm_v128_xor(results, ys_sign);
246
+ v128_t sign_mask_f32x4 = wasm_f32x4_splat(-0.0f);
247
+ v128_t xs_sign_f32x4 = wasm_v128_and(xs_inputs, sign_mask_f32x4);
248
+ v128_t ys_sign_f32x4 = wasm_v128_and(ys_inputs, sign_mask_f32x4);
249
+ results_f32x4 = wasm_v128_xor(results_f32x4, xs_sign_f32x4);
250
+ results_f32x4 = wasm_v128_xor(results_f32x4, ys_sign_f32x4);
251
251
 
252
- return results;
252
+ return results_f32x4;
253
253
  }
254
254
 
255
255
  NK_INTERNAL v128_t nk_f64x2_sin_v128relaxed_(v128_t const angles_radians) {
256
256
  // Constants for argument reduction
257
- v128_t const pi_high = wasm_f64x2_splat(3.141592653589793116);
258
- v128_t const pi_low = wasm_f64x2_splat(1.2246467991473532072e-16);
259
- v128_t const pi_reciprocal = wasm_f64x2_splat(0.31830988618379067154);
257
+ v128_t const pi_high_f64x2 = wasm_f64x2_splat(3.141592653589793116);
258
+ v128_t const pi_low_f64x2 = wasm_f64x2_splat(1.2246467991473532072e-16);
259
+ v128_t const pi_reciprocal_f64x2 = wasm_f64x2_splat(0.31830988618379067154);
260
260
 
261
261
  // Polynomial coefficients for sine approximation
262
- v128_t const coeff_0 = wasm_f64x2_splat(+0.00833333333333332974823815);
263
- v128_t const coeff_1 = wasm_f64x2_splat(-0.000198412698412696162806809);
264
- v128_t const coeff_2 = wasm_f64x2_splat(+2.75573192239198747630416e-06);
265
- v128_t const coeff_3 = wasm_f64x2_splat(-2.50521083763502045810755e-08);
266
- v128_t const coeff_4 = wasm_f64x2_splat(+1.60590430605664501629054e-10);
267
- v128_t const coeff_5 = wasm_f64x2_splat(-7.64712219118158833288484e-13);
268
- v128_t const coeff_6 = wasm_f64x2_splat(+2.81009972710863200091251e-15);
269
- v128_t const coeff_7 = wasm_f64x2_splat(-7.97255955009037868891952e-18);
270
- v128_t const coeff_8 = wasm_f64x2_splat(-0.166666666666666657414808);
262
+ v128_t const coeff_0_f64x2 = wasm_f64x2_splat(+0.00833333333333332974823815);
263
+ v128_t const coeff_1_f64x2 = wasm_f64x2_splat(-0.000198412698412696162806809);
264
+ v128_t const coeff_2_f64x2 = wasm_f64x2_splat(+2.75573192239198747630416e-06);
265
+ v128_t const coeff_3_f64x2 = wasm_f64x2_splat(-2.50521083763502045810755e-08);
266
+ v128_t const coeff_4_f64x2 = wasm_f64x2_splat(+1.60590430605664501629054e-10);
267
+ v128_t const coeff_5_f64x2 = wasm_f64x2_splat(-7.64712219118158833288484e-13);
268
+ v128_t const coeff_6_f64x2 = wasm_f64x2_splat(+2.81009972710863200091251e-15);
269
+ v128_t const coeff_7_f64x2 = wasm_f64x2_splat(-7.97255955009037868891952e-18);
270
+ v128_t const coeff_8_f64x2 = wasm_f64x2_splat(-0.166666666666666657414808);
271
271
 
272
272
  // Compute round(angle / pi)
273
- v128_t const quotients = wasm_f64x2_mul(angles_radians, pi_reciprocal);
274
- v128_t rounded_quotients = wasm_f64x2_nearest(quotients);
273
+ v128_t const quotients_f64x2 = wasm_f64x2_mul(angles_radians, pi_reciprocal_f64x2);
274
+ v128_t rounded_quotients_f64x2 = wasm_f64x2_nearest(quotients_f64x2);
275
275
  // relaxed_trunc: 1 instruction (cvttpd2dq) vs 7 (with NaN/overflow fixup) on x86.
276
- // Safe because rounded_quotients are small integers from nearest(), never NaN or out of i32 range.
277
- v128_t multiples_i32 = wasm_i32x4_relaxed_trunc_f64x2_zero(rounded_quotients);
276
+ // Safe because rounded_quotients_f64x2 are small integers from nearest(), never NaN or out of i32 range.
277
+ v128_t multiples_i32_f64x2 = wasm_i32x4_relaxed_trunc_f64x2_zero(rounded_quotients_f64x2);
278
278
 
279
- // Two-step Cody-Waite reduction: angle - rounded * pi_high - rounded * pi_low
280
- v128_t angles = angles_radians;
281
- angles = wasm_f64x2_relaxed_nmadd(rounded_quotients, pi_high, angles);
282
- angles = wasm_f64x2_relaxed_nmadd(rounded_quotients, pi_low, angles);
279
+ // Two-step Cody-Waite reduction: angle - rounded * pi_high_f64x2 - rounded * pi_low_f64x2
280
+ v128_t angles_f64x2 = angles_radians;
281
+ angles_f64x2 = wasm_f64x2_relaxed_nmadd(rounded_quotients_f64x2, pi_high_f64x2, angles_f64x2);
282
+ angles_f64x2 = wasm_f64x2_relaxed_nmadd(rounded_quotients_f64x2, pi_low_f64x2, angles_f64x2);
283
283
 
284
284
  // Check parity in i32, then widen to i64 mask for laneselect
285
- v128_t parity_i32 = wasm_v128_and(multiples_i32, wasm_i32x4_splat(1));
286
- v128_t odd_i32 = wasm_i32x4_eq(parity_i32, wasm_i32x4_splat(1));
285
+ v128_t parity_i32_i32x4 = wasm_v128_and(multiples_i32_f64x2, wasm_i32x4_splat(1));
286
+ v128_t odd_i32_i32x4 = wasm_i32x4_eq(parity_i32_i32x4, wasm_i32x4_splat(1));
287
287
  // Widen: lane0 of i32 -> lanes 0-1 of i64, lane1 -> lanes 2-3
288
288
  // Shuffle i32 lanes [0,0,1,1] to broadcast each i32 parity into both halves of each i64
289
- v128_t odd_mask = wasm_i32x4_shuffle(odd_i32, odd_i32, 0, 0, 1, 1);
290
- v128_t negated_angles = wasm_f64x2_neg(angles);
289
+ v128_t odd_mask_i32x4 = wasm_i32x4_shuffle(odd_i32_i32x4, odd_i32_i32x4, 0, 0, 1, 1);
290
+ v128_t negated_angles_f64x2 = wasm_f64x2_neg(angles_f64x2);
291
291
  // relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
292
292
  // Safe because mask is lane-granular at i64 width (all-ones or all-zeros per 64-bit lane).
293
- angles = wasm_i64x2_relaxed_laneselect(negated_angles, angles, odd_mask);
293
+ angles_f64x2 = wasm_i64x2_relaxed_laneselect(negated_angles_f64x2, angles_f64x2, odd_mask_i32x4);
294
294
 
295
- v128_t const angles_squared = wasm_f64x2_mul(angles, angles);
296
- v128_t const angles_cubed = wasm_f64x2_mul(angles, angles_squared);
297
- v128_t const angles_quadratic = wasm_f64x2_mul(angles_squared, angles_squared);
298
- v128_t const angles_octic = wasm_f64x2_mul(angles_quadratic, angles_quadratic);
295
+ v128_t const angles_squared_f64x2 = wasm_f64x2_mul(angles_f64x2, angles_f64x2);
296
+ v128_t const angles_cubed_f64x2 = wasm_f64x2_mul(angles_f64x2, angles_squared_f64x2);
297
+ v128_t const angles_quadratic_f64x2 = wasm_f64x2_mul(angles_squared_f64x2, angles_squared_f64x2);
298
+ v128_t const angles_octic_f64x2 = wasm_f64x2_mul(angles_quadratic_f64x2, angles_quadratic_f64x2);
299
299
 
300
300
  // Compute polynomial terms using Estrin's scheme for better ILP
301
- v128_t const poly_67 = wasm_f64x2_relaxed_madd(angles_squared, coeff_7, coeff_6);
302
- v128_t const poly_45 = wasm_f64x2_relaxed_madd(angles_squared, coeff_5, coeff_4);
303
- v128_t const poly_4567 = wasm_f64x2_relaxed_madd(angles_quadratic, poly_67, poly_45);
301
+ v128_t const poly_67_f64x2 = wasm_f64x2_relaxed_madd(angles_squared_f64x2, coeff_7_f64x2, coeff_6_f64x2);
302
+ v128_t const poly_45_f64x2 = wasm_f64x2_relaxed_madd(angles_squared_f64x2, coeff_5_f64x2, coeff_4_f64x2);
303
+ v128_t const poly_4567_f64x2 = wasm_f64x2_relaxed_madd(angles_quadratic_f64x2, poly_67_f64x2, poly_45_f64x2);
304
304
 
305
- v128_t const poly_23 = wasm_f64x2_relaxed_madd(angles_squared, coeff_3, coeff_2);
306
- v128_t const poly_01 = wasm_f64x2_relaxed_madd(angles_squared, coeff_1, coeff_0);
307
- v128_t const poly_0123 = wasm_f64x2_relaxed_madd(angles_quadratic, poly_23, poly_01);
305
+ v128_t const poly_23_f64x2 = wasm_f64x2_relaxed_madd(angles_squared_f64x2, coeff_3_f64x2, coeff_2_f64x2);
306
+ v128_t const poly_01_f64x2 = wasm_f64x2_relaxed_madd(angles_squared_f64x2, coeff_1_f64x2, coeff_0_f64x2);
307
+ v128_t const poly_0123_f64x2 = wasm_f64x2_relaxed_madd(angles_quadratic_f64x2, poly_23_f64x2, poly_01_f64x2);
308
308
 
309
309
  // Combine polynomial terms
310
- v128_t results = wasm_f64x2_relaxed_madd(angles_octic, poly_4567, poly_0123);
311
- results = wasm_f64x2_relaxed_madd(results, angles_squared, coeff_8);
312
- results = wasm_f64x2_relaxed_madd(results, angles_cubed, angles);
310
+ v128_t results_f64x2 = wasm_f64x2_relaxed_madd(angles_octic_f64x2, poly_4567_f64x2, poly_0123_f64x2);
311
+ results_f64x2 = wasm_f64x2_relaxed_madd(results_f64x2, angles_squared_f64x2, coeff_8_f64x2);
312
+ results_f64x2 = wasm_f64x2_relaxed_madd(results_f64x2, angles_cubed_f64x2, angles_f64x2);
313
313
 
314
314
  // Handle zero input (preserve sign of zero)
315
- v128_t const non_zero_mask = wasm_f64x2_eq(angles_radians, wasm_f64x2_splat(0));
315
+ v128_t const non_zero_mask_f64x2 = wasm_f64x2_eq(angles_radians, wasm_f64x2_splat(0));
316
316
  // relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
317
317
  // Safe because mask is from comparison (all-ones or all-zeros per lane).
318
- results = wasm_i64x2_relaxed_laneselect(angles_radians, results, non_zero_mask);
319
- return results;
318
+ results_f64x2 = wasm_i64x2_relaxed_laneselect(angles_radians, results_f64x2, non_zero_mask_f64x2);
319
+ return results_f64x2;
320
320
  }
321
321
 
322
322
  NK_INTERNAL v128_t nk_f64x2_cos_v128relaxed_(v128_t const angles_radians) {
323
323
  // Constants for argument reduction
324
- v128_t const pi_high_half = wasm_f64x2_splat(3.141592653589793116 * 0.5);
325
- v128_t const pi_low_half = wasm_f64x2_splat(1.2246467991473532072e-16 * 0.5);
326
- v128_t const pi_reciprocal = wasm_f64x2_splat(0.31830988618379067154);
324
+ v128_t const pi_high_half_f64x2 = wasm_f64x2_splat(3.141592653589793116 * 0.5);
325
+ v128_t const pi_low_half_f64x2 = wasm_f64x2_splat(1.2246467991473532072e-16 * 0.5);
326
+ v128_t const pi_reciprocal_f64x2 = wasm_f64x2_splat(0.31830988618379067154);
327
327
 
328
328
  // Polynomial coefficients for cosine approximation
329
- v128_t const coeff_0 = wasm_f64x2_splat(+0.00833333333333332974823815);
330
- v128_t const coeff_1 = wasm_f64x2_splat(-0.000198412698412696162806809);
331
- v128_t const coeff_2 = wasm_f64x2_splat(+2.75573192239198747630416e-06);
332
- v128_t const coeff_3 = wasm_f64x2_splat(-2.50521083763502045810755e-08);
333
- v128_t const coeff_4 = wasm_f64x2_splat(+1.60590430605664501629054e-10);
334
- v128_t const coeff_5 = wasm_f64x2_splat(-7.64712219118158833288484e-13);
335
- v128_t const coeff_6 = wasm_f64x2_splat(+2.81009972710863200091251e-15);
336
- v128_t const coeff_7 = wasm_f64x2_splat(-7.97255955009037868891952e-18);
337
- v128_t const coeff_8 = wasm_f64x2_splat(-0.166666666666666657414808);
329
+ v128_t const coeff_0_f64x2 = wasm_f64x2_splat(+0.00833333333333332974823815);
330
+ v128_t const coeff_1_f64x2 = wasm_f64x2_splat(-0.000198412698412696162806809);
331
+ v128_t const coeff_2_f64x2 = wasm_f64x2_splat(+2.75573192239198747630416e-06);
332
+ v128_t const coeff_3_f64x2 = wasm_f64x2_splat(-2.50521083763502045810755e-08);
333
+ v128_t const coeff_4_f64x2 = wasm_f64x2_splat(+1.60590430605664501629054e-10);
334
+ v128_t const coeff_5_f64x2 = wasm_f64x2_splat(-7.64712219118158833288484e-13);
335
+ v128_t const coeff_6_f64x2 = wasm_f64x2_splat(+2.81009972710863200091251e-15);
336
+ v128_t const coeff_7_f64x2 = wasm_f64x2_splat(-7.97255955009037868891952e-18);
337
+ v128_t const coeff_8_f64x2 = wasm_f64x2_splat(-0.166666666666666657414808);
338
338
 
339
339
  // Compute 2 * round(angle / pi - 0.5) + 1
340
- v128_t const neg_half = wasm_f64x2_splat(-0.5);
341
- v128_t const quotients = wasm_f64x2_relaxed_madd(angles_radians, pi_reciprocal, neg_half);
342
- v128_t const rounded = wasm_f64x2_nearest(quotients);
343
- v128_t const rounded_quotients = wasm_f64x2_relaxed_madd(wasm_f64x2_splat(2.0), rounded, wasm_f64x2_splat(1.0));
340
+ v128_t const neg_half_f64x2 = wasm_f64x2_splat(-0.5);
341
+ v128_t const quotients_f64x2 = wasm_f64x2_relaxed_madd(angles_radians, pi_reciprocal_f64x2, neg_half_f64x2);
342
+ v128_t const rounded_f64x2 = wasm_f64x2_nearest(quotients_f64x2);
343
+ v128_t const rounded_quotients_f64x2 = wasm_f64x2_relaxed_madd(wasm_f64x2_splat(2.0), rounded_f64x2,
344
+ wasm_f64x2_splat(1.0));
344
345
  // relaxed_trunc: 1 instruction (cvttpd2dq) vs 7 (with NaN/overflow fixup) on x86.
345
- // Safe because rounded_quotients are small integers from nearest(), never NaN or out of i32 range.
346
- v128_t quotients_i32 = wasm_i32x4_relaxed_trunc_f64x2_zero(rounded_quotients);
346
+ // Safe because rounded_quotients_f64x2 are small integers from nearest(), never NaN or out of i32 range.
347
+ v128_t quotients_i32_f64x2 = wasm_i32x4_relaxed_trunc_f64x2_zero(rounded_quotients_f64x2);
347
348
 
348
349
  // Two-step Cody-Waite reduction
349
- v128_t angles = angles_radians;
350
- angles = wasm_f64x2_relaxed_nmadd(rounded_quotients, pi_high_half, angles);
351
- angles = wasm_f64x2_relaxed_nmadd(rounded_quotients, pi_low_half, angles);
350
+ v128_t angles_f64x2 = angles_radians;
351
+ angles_f64x2 = wasm_f64x2_relaxed_nmadd(rounded_quotients_f64x2, pi_high_half_f64x2, angles_f64x2);
352
+ angles_f64x2 = wasm_f64x2_relaxed_nmadd(rounded_quotients_f64x2, pi_low_half_f64x2, angles_f64x2);
352
353
 
353
354
  // Check bit 1 in i32, then widen to i64 mask for laneselect
354
- v128_t bit2_i32 = wasm_v128_and(quotients_i32, wasm_i32x4_splat(2));
355
- v128_t flip_i32 = wasm_i32x4_eq(bit2_i32, wasm_i32x4_splat(0));
356
- v128_t flip_mask = wasm_i32x4_shuffle(flip_i32, flip_i32, 0, 0, 1, 1);
357
- v128_t negated_angles = wasm_f64x2_neg(angles);
355
+ v128_t bit2_i32_i32x4 = wasm_v128_and(quotients_i32_f64x2, wasm_i32x4_splat(2));
356
+ v128_t flip_i32_i32x4 = wasm_i32x4_eq(bit2_i32_i32x4, wasm_i32x4_splat(0));
357
+ v128_t flip_mask_i32x4 = wasm_i32x4_shuffle(flip_i32_i32x4, flip_i32_i32x4, 0, 0, 1, 1);
358
+ v128_t negated_angles_f64x2 = wasm_f64x2_neg(angles_f64x2);
358
359
  // relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
359
360
  // Safe because mask is lane-granular at i64 width (all-ones or all-zeros per 64-bit lane).
360
- angles = wasm_i64x2_relaxed_laneselect(negated_angles, angles, flip_mask);
361
+ angles_f64x2 = wasm_i64x2_relaxed_laneselect(negated_angles_f64x2, angles_f64x2, flip_mask_i32x4);
361
362
 
362
- v128_t const angles_squared = wasm_f64x2_mul(angles, angles);
363
- v128_t const angles_cubed = wasm_f64x2_mul(angles, angles_squared);
364
- v128_t const angles_quadratic = wasm_f64x2_mul(angles_squared, angles_squared);
365
- v128_t const angles_octic = wasm_f64x2_mul(angles_quadratic, angles_quadratic);
363
+ v128_t const angles_squared_f64x2 = wasm_f64x2_mul(angles_f64x2, angles_f64x2);
364
+ v128_t const angles_cubed_f64x2 = wasm_f64x2_mul(angles_f64x2, angles_squared_f64x2);
365
+ v128_t const angles_quadratic_f64x2 = wasm_f64x2_mul(angles_squared_f64x2, angles_squared_f64x2);
366
+ v128_t const angles_octic_f64x2 = wasm_f64x2_mul(angles_quadratic_f64x2, angles_quadratic_f64x2);
366
367
 
367
368
  // Compute polynomial terms using Estrin's scheme
368
- v128_t const poly_67 = wasm_f64x2_relaxed_madd(angles_squared, coeff_7, coeff_6);
369
- v128_t const poly_45 = wasm_f64x2_relaxed_madd(angles_squared, coeff_5, coeff_4);
370
- v128_t const poly_4567 = wasm_f64x2_relaxed_madd(angles_quadratic, poly_67, poly_45);
369
+ v128_t const poly_67_f64x2 = wasm_f64x2_relaxed_madd(angles_squared_f64x2, coeff_7_f64x2, coeff_6_f64x2);
370
+ v128_t const poly_45_f64x2 = wasm_f64x2_relaxed_madd(angles_squared_f64x2, coeff_5_f64x2, coeff_4_f64x2);
371
+ v128_t const poly_4567_f64x2 = wasm_f64x2_relaxed_madd(angles_quadratic_f64x2, poly_67_f64x2, poly_45_f64x2);
371
372
 
372
- v128_t const poly_23 = wasm_f64x2_relaxed_madd(angles_squared, coeff_3, coeff_2);
373
- v128_t const poly_01 = wasm_f64x2_relaxed_madd(angles_squared, coeff_1, coeff_0);
374
- v128_t const poly_0123 = wasm_f64x2_relaxed_madd(angles_quadratic, poly_23, poly_01);
373
+ v128_t const poly_23_f64x2 = wasm_f64x2_relaxed_madd(angles_squared_f64x2, coeff_3_f64x2, coeff_2_f64x2);
374
+ v128_t const poly_01_f64x2 = wasm_f64x2_relaxed_madd(angles_squared_f64x2, coeff_1_f64x2, coeff_0_f64x2);
375
+ v128_t const poly_0123_f64x2 = wasm_f64x2_relaxed_madd(angles_quadratic_f64x2, poly_23_f64x2, poly_01_f64x2);
375
376
 
376
377
  // Combine polynomial terms
377
- v128_t results = wasm_f64x2_relaxed_madd(angles_octic, poly_4567, poly_0123);
378
- results = wasm_f64x2_relaxed_madd(results, angles_squared, coeff_8);
379
- results = wasm_f64x2_relaxed_madd(results, angles_cubed, angles);
380
- return results;
378
+ v128_t results_f64x2 = wasm_f64x2_relaxed_madd(angles_octic_f64x2, poly_4567_f64x2, poly_0123_f64x2);
379
+ results_f64x2 = wasm_f64x2_relaxed_madd(results_f64x2, angles_squared_f64x2, coeff_8_f64x2);
380
+ results_f64x2 = wasm_f64x2_relaxed_madd(results_f64x2, angles_cubed_f64x2, angles_f64x2);
381
+ return results_f64x2;
381
382
  }
382
383
 
383
384
  NK_INTERNAL v128_t nk_f64x2_atan_v128relaxed_(v128_t const inputs) {
384
385
  // Polynomial coefficients for atan approximation (19 terms)
385
- v128_t const coeff_19 = wasm_f64x2_splat(-1.88796008463073496563746e-05);
386
- v128_t const coeff_18 = wasm_f64x2_splat(+0.000209850076645816976906797);
387
- v128_t const coeff_17 = wasm_f64x2_splat(-0.00110611831486672482563471);
388
- v128_t const coeff_16 = wasm_f64x2_splat(+0.00370026744188713119232403);
389
- v128_t const coeff_15 = wasm_f64x2_splat(-0.00889896195887655491740809);
390
- v128_t const coeff_14 = wasm_f64x2_splat(+0.016599329773529201970117);
391
- v128_t const coeff_13 = wasm_f64x2_splat(-0.0254517624932312641616861);
392
- v128_t const coeff_12 = wasm_f64x2_splat(+0.0337852580001353069993897);
393
- v128_t const coeff_11 = wasm_f64x2_splat(-0.0407629191276836500001934);
394
- v128_t const coeff_10 = wasm_f64x2_splat(+0.0466667150077840625632675);
395
- v128_t const coeff_9 = wasm_f64x2_splat(-0.0523674852303482457616113);
396
- v128_t const coeff_8 = wasm_f64x2_splat(+0.0587666392926673580854313);
397
- v128_t const coeff_7 = wasm_f64x2_splat(-0.0666573579361080525984562);
398
- v128_t const coeff_6 = wasm_f64x2_splat(+0.0769219538311769618355029);
399
- v128_t const coeff_5 = wasm_f64x2_splat(-0.090908995008245008229153);
400
- v128_t const coeff_4 = wasm_f64x2_splat(+0.111111105648261418443745);
401
- v128_t const coeff_3 = wasm_f64x2_splat(-0.14285714266771329383765);
402
- v128_t const coeff_2 = wasm_f64x2_splat(+0.199999999996591265594148);
403
- v128_t const coeff_1 = wasm_f64x2_splat(-0.333333333333311110369124);
404
- v128_t const half_pi = wasm_f64x2_splat(1.5707963267948966);
405
- v128_t const zeros = wasm_f64x2_splat(0);
386
+ v128_t const coeff_19_f64x2 = wasm_f64x2_splat(-1.88796008463073496563746e-05);
387
+ v128_t const coeff_18_f64x2 = wasm_f64x2_splat(+0.000209850076645816976906797);
388
+ v128_t const coeff_17_f64x2 = wasm_f64x2_splat(-0.00110611831486672482563471);
389
+ v128_t const coeff_16_f64x2 = wasm_f64x2_splat(+0.00370026744188713119232403);
390
+ v128_t const coeff_15_f64x2 = wasm_f64x2_splat(-0.00889896195887655491740809);
391
+ v128_t const coeff_14_f64x2 = wasm_f64x2_splat(+0.016599329773529201970117);
392
+ v128_t const coeff_13_f64x2 = wasm_f64x2_splat(-0.0254517624932312641616861);
393
+ v128_t const coeff_12_f64x2 = wasm_f64x2_splat(+0.0337852580001353069993897);
394
+ v128_t const coeff_11_f64x2 = wasm_f64x2_splat(-0.0407629191276836500001934);
395
+ v128_t const coeff_10_f64x2 = wasm_f64x2_splat(+0.0466667150077840625632675);
396
+ v128_t const coeff_9_f64x2 = wasm_f64x2_splat(-0.0523674852303482457616113);
397
+ v128_t const coeff_8_f64x2 = wasm_f64x2_splat(+0.0587666392926673580854313);
398
+ v128_t const coeff_7_f64x2 = wasm_f64x2_splat(-0.0666573579361080525984562);
399
+ v128_t const coeff_6_f64x2 = wasm_f64x2_splat(+0.0769219538311769618355029);
400
+ v128_t const coeff_5_f64x2 = wasm_f64x2_splat(-0.090908995008245008229153);
401
+ v128_t const coeff_4_f64x2 = wasm_f64x2_splat(+0.111111105648261418443745);
402
+ v128_t const coeff_3_f64x2 = wasm_f64x2_splat(-0.14285714266771329383765);
403
+ v128_t const coeff_2_f64x2 = wasm_f64x2_splat(+0.199999999996591265594148);
404
+ v128_t const coeff_1_f64x2 = wasm_f64x2_splat(-0.333333333333311110369124);
405
+ v128_t const half_pi_f64x2 = wasm_f64x2_splat(1.5707963267948966);
406
+ v128_t const zeros_f64x2 = wasm_f64x2_splat(0);
406
407
 
407
408
  // Detect negative and take absolute value
408
- v128_t negative_mask = wasm_f64x2_lt(inputs, zeros);
409
- v128_t values = wasm_f64x2_abs(inputs);
409
+ v128_t negative_mask_f64x2 = wasm_f64x2_lt(inputs, zeros_f64x2);
410
+ v128_t values_f64x2 = wasm_f64x2_abs(inputs);
410
411
 
411
- // Check if values > 1 (need reciprocal) - use division for f64 precision
412
- v128_t reciprocal_mask = wasm_f64x2_gt(values, wasm_f64x2_splat(1.0));
413
- v128_t reciprocal_values = wasm_f64x2_div(wasm_f64x2_splat(1.0), values);
412
+ // Check if values_f64x2 > 1 (need reciprocal) - use division for f64 precision
413
+ v128_t reciprocal_mask_f64x2 = wasm_f64x2_gt(values_f64x2, wasm_f64x2_splat(1.0));
414
+ v128_t reciprocal_values_f64x2 = wasm_f64x2_div(wasm_f64x2_splat(1.0), values_f64x2);
414
415
  // relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
415
- // Safe because mask is from comparison (all-ones or all-zeros per lane).
416
- values = wasm_i64x2_relaxed_laneselect(reciprocal_values, values, reciprocal_mask);
416
+ // Safe because mask is from comparison (all-ones or all-zeros per lane).
417
+ values_f64x2 = wasm_i64x2_relaxed_laneselect(reciprocal_values_f64x2, values_f64x2, reciprocal_mask_f64x2);
417
418
 
418
419
  // Compute powers
419
- v128_t const values_squared = wasm_f64x2_mul(values, values);
420
- v128_t const values_cubed = wasm_f64x2_mul(values, values_squared);
420
+ v128_t const values_squared_f64x2 = wasm_f64x2_mul(values_f64x2, values_f64x2);
421
+ v128_t const values_cubed_f64x2 = wasm_f64x2_mul(values_f64x2, values_squared_f64x2);
421
422
 
422
423
  // Polynomial evaluation using Horner's method
423
- v128_t polynomials = coeff_19;
424
- polynomials = wasm_f64x2_relaxed_madd(polynomials, values_squared, coeff_18);
425
- polynomials = wasm_f64x2_relaxed_madd(polynomials, values_squared, coeff_17);
426
- polynomials = wasm_f64x2_relaxed_madd(polynomials, values_squared, coeff_16);
427
- polynomials = wasm_f64x2_relaxed_madd(polynomials, values_squared, coeff_15);
428
- polynomials = wasm_f64x2_relaxed_madd(polynomials, values_squared, coeff_14);
429
- polynomials = wasm_f64x2_relaxed_madd(polynomials, values_squared, coeff_13);
430
- polynomials = wasm_f64x2_relaxed_madd(polynomials, values_squared, coeff_12);
431
- polynomials = wasm_f64x2_relaxed_madd(polynomials, values_squared, coeff_11);
432
- polynomials = wasm_f64x2_relaxed_madd(polynomials, values_squared, coeff_10);
433
- polynomials = wasm_f64x2_relaxed_madd(polynomials, values_squared, coeff_9);
434
- polynomials = wasm_f64x2_relaxed_madd(polynomials, values_squared, coeff_8);
435
- polynomials = wasm_f64x2_relaxed_madd(polynomials, values_squared, coeff_7);
436
- polynomials = wasm_f64x2_relaxed_madd(polynomials, values_squared, coeff_6);
437
- polynomials = wasm_f64x2_relaxed_madd(polynomials, values_squared, coeff_5);
438
- polynomials = wasm_f64x2_relaxed_madd(polynomials, values_squared, coeff_4);
439
- polynomials = wasm_f64x2_relaxed_madd(polynomials, values_squared, coeff_3);
440
- polynomials = wasm_f64x2_relaxed_madd(polynomials, values_squared, coeff_2);
441
- polynomials = wasm_f64x2_relaxed_madd(polynomials, values_squared, coeff_1);
442
-
443
- // Compute result
444
- v128_t result = wasm_f64x2_relaxed_madd(values_cubed, polynomials, values);
445
-
446
- // Adjust for reciprocal: result = pi/2 - result
447
- v128_t adjusted = wasm_f64x2_sub(half_pi, result);
424
+ v128_t polynomials_f64x2 = coeff_19_f64x2;
425
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_18_f64x2);
426
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_17_f64x2);
427
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_16_f64x2);
428
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_15_f64x2);
429
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_14_f64x2);
430
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_13_f64x2);
431
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_12_f64x2);
432
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_11_f64x2);
433
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_10_f64x2);
434
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_9_f64x2);
435
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_8_f64x2);
436
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_7_f64x2);
437
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_6_f64x2);
438
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_5_f64x2);
439
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_4_f64x2);
440
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_3_f64x2);
441
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_2_f64x2);
442
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_1_f64x2);
443
+
444
+ // Compute result_f64x2
445
+ v128_t result_f64x2 = wasm_f64x2_relaxed_madd(values_cubed_f64x2, polynomials_f64x2, values_f64x2);
446
+
447
+ // Adjust for reciprocal: result_f64x2 = pi/2 - result_f64x2
448
+ v128_t adjusted_f64x2 = wasm_f64x2_sub(half_pi_f64x2, result_f64x2);
448
449
  // relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
449
- // Safe because mask is from comparison (all-ones or all-zeros per lane).
450
- result = wasm_i64x2_relaxed_laneselect(adjusted, result, reciprocal_mask);
450
+ // Safe because mask is from comparison (all-ones or all-zeros per lane).
451
+ result_f64x2 = wasm_i64x2_relaxed_laneselect(adjusted_f64x2, result_f64x2, reciprocal_mask_f64x2);
451
452
 
452
- // Adjust for negative: result = -result
453
- v128_t negated = wasm_f64x2_neg(result);
453
+ // Adjust for negative: result_f64x2 = -result_f64x2
454
+ v128_t negated_f64x2 = wasm_f64x2_neg(result_f64x2);
454
455
  // relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
455
- // Safe because mask is from comparison (all-ones or all-zeros per lane).
456
- result = wasm_i64x2_relaxed_laneselect(negated, result, negative_mask);
457
- return result;
456
+ // Safe because mask is from comparison (all-ones or all-zeros per lane).
457
+ result_f64x2 = wasm_i64x2_relaxed_laneselect(negated_f64x2, result_f64x2, negative_mask_f64x2);
458
+ return result_f64x2;
458
459
  }
459
460
 
460
461
  NK_INTERNAL v128_t nk_f64x2_atan2_v128relaxed_(v128_t const ys_inputs, v128_t const xs_inputs) {
461
462
  // Polynomial coefficients (same as atan)
462
- v128_t const coeff_19 = wasm_f64x2_splat(-1.88796008463073496563746e-05);
463
- v128_t const coeff_18 = wasm_f64x2_splat(+0.000209850076645816976906797);
464
- v128_t const coeff_17 = wasm_f64x2_splat(-0.00110611831486672482563471);
465
- v128_t const coeff_16 = wasm_f64x2_splat(+0.00370026744188713119232403);
466
- v128_t const coeff_15 = wasm_f64x2_splat(-0.00889896195887655491740809);
467
- v128_t const coeff_14 = wasm_f64x2_splat(+0.016599329773529201970117);
468
- v128_t const coeff_13 = wasm_f64x2_splat(-0.0254517624932312641616861);
469
- v128_t const coeff_12 = wasm_f64x2_splat(+0.0337852580001353069993897);
470
- v128_t const coeff_11 = wasm_f64x2_splat(-0.0407629191276836500001934);
471
- v128_t const coeff_10 = wasm_f64x2_splat(+0.0466667150077840625632675);
472
- v128_t const coeff_9 = wasm_f64x2_splat(-0.0523674852303482457616113);
473
- v128_t const coeff_8 = wasm_f64x2_splat(+0.0587666392926673580854313);
474
- v128_t const coeff_7 = wasm_f64x2_splat(-0.0666573579361080525984562);
475
- v128_t const coeff_6 = wasm_f64x2_splat(+0.0769219538311769618355029);
476
- v128_t const coeff_5 = wasm_f64x2_splat(-0.090908995008245008229153);
477
- v128_t const coeff_4 = wasm_f64x2_splat(+0.111111105648261418443745);
478
- v128_t const coeff_3 = wasm_f64x2_splat(-0.14285714266771329383765);
479
- v128_t const coeff_2 = wasm_f64x2_splat(+0.199999999996591265594148);
480
- v128_t const coeff_1 = wasm_f64x2_splat(-0.333333333333311110369124);
481
- v128_t const pi = wasm_f64x2_splat(3.14159265358979323846);
482
- v128_t const half_pi = wasm_f64x2_splat(1.5707963267948966);
483
- v128_t const zeros = wasm_f64x2_splat(0);
463
+ v128_t const coeff_19_f64x2 = wasm_f64x2_splat(-1.88796008463073496563746e-05);
464
+ v128_t const coeff_18_f64x2 = wasm_f64x2_splat(+0.000209850076645816976906797);
465
+ v128_t const coeff_17_f64x2 = wasm_f64x2_splat(-0.00110611831486672482563471);
466
+ v128_t const coeff_16_f64x2 = wasm_f64x2_splat(+0.00370026744188713119232403);
467
+ v128_t const coeff_15_f64x2 = wasm_f64x2_splat(-0.00889896195887655491740809);
468
+ v128_t const coeff_14_f64x2 = wasm_f64x2_splat(+0.016599329773529201970117);
469
+ v128_t const coeff_13_f64x2 = wasm_f64x2_splat(-0.0254517624932312641616861);
470
+ v128_t const coeff_12_f64x2 = wasm_f64x2_splat(+0.0337852580001353069993897);
471
+ v128_t const coeff_11_f64x2 = wasm_f64x2_splat(-0.0407629191276836500001934);
472
+ v128_t const coeff_10_f64x2 = wasm_f64x2_splat(+0.0466667150077840625632675);
473
+ v128_t const coeff_9_f64x2 = wasm_f64x2_splat(-0.0523674852303482457616113);
474
+ v128_t const coeff_8_f64x2 = wasm_f64x2_splat(+0.0587666392926673580854313);
475
+ v128_t const coeff_7_f64x2 = wasm_f64x2_splat(-0.0666573579361080525984562);
476
+ v128_t const coeff_6_f64x2 = wasm_f64x2_splat(+0.0769219538311769618355029);
477
+ v128_t const coeff_5_f64x2 = wasm_f64x2_splat(-0.090908995008245008229153);
478
+ v128_t const coeff_4_f64x2 = wasm_f64x2_splat(+0.111111105648261418443745);
479
+ v128_t const coeff_3_f64x2 = wasm_f64x2_splat(-0.14285714266771329383765);
480
+ v128_t const coeff_2_f64x2 = wasm_f64x2_splat(+0.199999999996591265594148);
481
+ v128_t const coeff_1_f64x2 = wasm_f64x2_splat(-0.333333333333311110369124);
482
+ v128_t const pi_f64x2 = wasm_f64x2_splat(3.14159265358979323846);
483
+ v128_t const half_pi_f64x2 = wasm_f64x2_splat(1.5707963267948966);
484
+ v128_t const zeros_f64x2 = wasm_f64x2_splat(0);
484
485
 
485
486
  // Quadrant adjustments - take absolute values
486
- v128_t xs_negative_mask = wasm_f64x2_lt(xs_inputs, zeros);
487
- v128_t xs = wasm_f64x2_abs(xs_inputs);
488
- v128_t ys = wasm_f64x2_abs(ys_inputs);
487
+ v128_t xs_negative_mask_f64x2 = wasm_f64x2_lt(xs_inputs, zeros_f64x2);
488
+ v128_t xs_f64x2 = wasm_f64x2_abs(xs_inputs);
489
+ v128_t ys_f64x2 = wasm_f64x2_abs(ys_inputs);
489
490
 
490
491
  // Ensure proper fraction where numerator < denominator
491
- v128_t swap_mask = wasm_f64x2_gt(ys, xs);
492
- v128_t temps = xs;
492
+ v128_t swap_mask_f64x2 = wasm_f64x2_gt(ys_f64x2, xs_f64x2);
493
+ v128_t temps_f64x2 = xs_f64x2;
493
494
  // relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
494
- // Safe because mask is from comparison (all-ones or all-zeros per lane).
495
- xs = wasm_i64x2_relaxed_laneselect(ys, xs, swap_mask);
496
- ys = wasm_i64x2_relaxed_laneselect(wasm_f64x2_neg(temps), ys, swap_mask);
495
+ // Safe because mask is from comparison (all-ones or all-zeros per lane).
496
+ xs_f64x2 = wasm_i64x2_relaxed_laneselect(ys_f64x2, xs_f64x2, swap_mask_f64x2);
497
+ ys_f64x2 = wasm_i64x2_relaxed_laneselect(wasm_f64x2_neg(temps_f64x2), ys_f64x2, swap_mask_f64x2);
497
498
 
498
499
  // Division for f64 precision
499
- v128_t const ratio = wasm_f64x2_div(ys, xs);
500
- v128_t const ratio_squared = wasm_f64x2_mul(ratio, ratio);
501
- v128_t const ratio_cubed = wasm_f64x2_mul(ratio, ratio_squared);
500
+ v128_t const ratio_f64x2 = wasm_f64x2_div(ys_f64x2, xs_f64x2);
501
+ v128_t const ratio_squared_f64x2 = wasm_f64x2_mul(ratio_f64x2, ratio_f64x2);
502
+ v128_t const ratio_cubed_f64x2 = wasm_f64x2_mul(ratio_f64x2, ratio_squared_f64x2);
502
503
 
503
504
  // Polynomial evaluation using Horner's method
504
- v128_t polynomials = coeff_19;
505
- polynomials = wasm_f64x2_relaxed_madd(polynomials, ratio_squared, coeff_18);
506
- polynomials = wasm_f64x2_relaxed_madd(polynomials, ratio_squared, coeff_17);
507
- polynomials = wasm_f64x2_relaxed_madd(polynomials, ratio_squared, coeff_16);
508
- polynomials = wasm_f64x2_relaxed_madd(polynomials, ratio_squared, coeff_15);
509
- polynomials = wasm_f64x2_relaxed_madd(polynomials, ratio_squared, coeff_14);
510
- polynomials = wasm_f64x2_relaxed_madd(polynomials, ratio_squared, coeff_13);
511
- polynomials = wasm_f64x2_relaxed_madd(polynomials, ratio_squared, coeff_12);
512
- polynomials = wasm_f64x2_relaxed_madd(polynomials, ratio_squared, coeff_11);
513
- polynomials = wasm_f64x2_relaxed_madd(polynomials, ratio_squared, coeff_10);
514
- polynomials = wasm_f64x2_relaxed_madd(polynomials, ratio_squared, coeff_9);
515
- polynomials = wasm_f64x2_relaxed_madd(polynomials, ratio_squared, coeff_8);
516
- polynomials = wasm_f64x2_relaxed_madd(polynomials, ratio_squared, coeff_7);
517
- polynomials = wasm_f64x2_relaxed_madd(polynomials, ratio_squared, coeff_6);
518
- polynomials = wasm_f64x2_relaxed_madd(polynomials, ratio_squared, coeff_5);
519
- polynomials = wasm_f64x2_relaxed_madd(polynomials, ratio_squared, coeff_4);
520
- polynomials = wasm_f64x2_relaxed_madd(polynomials, ratio_squared, coeff_3);
521
- polynomials = wasm_f64x2_relaxed_madd(polynomials, ratio_squared, coeff_2);
522
- polynomials = wasm_f64x2_relaxed_madd(polynomials, ratio_squared, coeff_1);
505
+ v128_t polynomials_f64x2 = coeff_19_f64x2;
506
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_18_f64x2);
507
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_17_f64x2);
508
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_16_f64x2);
509
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_15_f64x2);
510
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_14_f64x2);
511
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_13_f64x2);
512
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_12_f64x2);
513
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_11_f64x2);
514
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_10_f64x2);
515
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_9_f64x2);
516
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_8_f64x2);
517
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_7_f64x2);
518
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_6_f64x2);
519
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_5_f64x2);
520
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_4_f64x2);
521
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_3_f64x2);
522
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_2_f64x2);
523
+ polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_1_f64x2);
523
524
 
524
525
  // Compute the result
525
- v128_t results = wasm_f64x2_relaxed_madd(ratio_cubed, polynomials, ratio);
526
+ v128_t results_f64x2 = wasm_f64x2_relaxed_madd(ratio_cubed_f64x2, polynomials_f64x2, ratio_f64x2);
526
527
 
527
- // Compute quadrant value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
528
+ // Compute quadrant value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
528
529
  // -2 for x<0 && !swap, -1 for x<0 && swap
529
- v128_t quadrant = wasm_f64x2_splat(0.0);
530
- v128_t neg_two = wasm_f64x2_splat(-2.0);
530
+ v128_t quadrant_f64x2 = wasm_f64x2_splat(0.0);
531
+ v128_t neg_two_f64x2 = wasm_f64x2_splat(-2.0);
531
532
  // relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
532
- // Safe because mask is from comparison (all-ones or all-zeros per lane).
533
- quadrant = wasm_i64x2_relaxed_laneselect(neg_two, quadrant, xs_negative_mask);
534
- v128_t quadrant_incremented = wasm_f64x2_add(quadrant, wasm_f64x2_splat(1.0));
535
- quadrant = wasm_i64x2_relaxed_laneselect(quadrant_incremented, quadrant, swap_mask);
533
+ // Safe because mask is from comparison (all-ones or all-zeros per lane).
534
+ quadrant_f64x2 = wasm_i64x2_relaxed_laneselect(neg_two_f64x2, quadrant_f64x2, xs_negative_mask_f64x2);
535
+ v128_t quadrant_incremented_f64x2 = wasm_f64x2_add(quadrant_f64x2, wasm_f64x2_splat(1.0));
536
+ quadrant_f64x2 = wasm_i64x2_relaxed_laneselect(quadrant_incremented_f64x2, quadrant_f64x2, swap_mask_f64x2);
536
537
 
537
- // Adjust for quadrant: result += quadrant * pi/2
538
- results = wasm_f64x2_relaxed_madd(quadrant, half_pi, results);
538
+ // Adjust for quadrant: result += quadrant * pi/2
539
+ results_f64x2 = wasm_f64x2_relaxed_madd(quadrant_f64x2, half_pi_f64x2, results_f64x2);
539
540
 
540
541
  // Transfer sign from x and y by XOR with sign bits
541
- v128_t sign_mask = wasm_f64x2_splat(-0.0);
542
- v128_t xs_sign = wasm_v128_and(xs_inputs, sign_mask);
543
- v128_t ys_sign = wasm_v128_and(ys_inputs, sign_mask);
544
- results = wasm_v128_xor(results, xs_sign);
545
- results = wasm_v128_xor(results, ys_sign);
542
+ v128_t sign_mask_f64x2 = wasm_f64x2_splat(-0.0);
543
+ v128_t xs_sign_f64x2 = wasm_v128_and(xs_inputs, sign_mask_f64x2);
544
+ v128_t ys_sign_f64x2 = wasm_v128_and(ys_inputs, sign_mask_f64x2);
545
+ results_f64x2 = wasm_v128_xor(results_f64x2, xs_sign_f64x2);
546
+ results_f64x2 = wasm_v128_xor(results_f64x2, ys_sign_f64x2);
546
547
 
547
- return results;
548
+ return results_f64x2;
548
549
  }
549
550
 
550
551
  /* NK_PUBLIC wrappers — same loop+tail pattern as neon.h.
@@ -555,9 +556,9 @@ NK_INTERNAL v128_t nk_f64x2_atan2_v128relaxed_(v128_t const ys_inputs, v128_t co
555
556
  NK_PUBLIC void nk_each_sin_f32_v128relaxed(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
556
557
  nk_size_t i = 0;
557
558
  for (; i + 4 <= n; i += 4) {
558
- v128_t angles = wasm_v128_load(ins + i);
559
- v128_t results = nk_f32x4_sin_v128relaxed_(angles);
560
- wasm_v128_store(outs + i, results);
559
+ v128_t angles_f32x4 = wasm_v128_load(ins + i);
560
+ v128_t results_f32x4 = nk_f32x4_sin_v128relaxed_(angles_f32x4);
561
+ wasm_v128_store(outs + i, results_f32x4);
561
562
  }
562
563
  if (i < n) {
563
564
  nk_size_t remaining = n - i;
@@ -572,9 +573,9 @@ NK_PUBLIC void nk_each_sin_f32_v128relaxed(nk_f32_t const *ins, nk_size_t n, nk_
572
573
  NK_PUBLIC void nk_each_cos_f32_v128relaxed(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
573
574
  nk_size_t i = 0;
574
575
  for (; i + 4 <= n; i += 4) {
575
- v128_t angles = wasm_v128_load(ins + i);
576
- v128_t results = nk_f32x4_cos_v128relaxed_(angles);
577
- wasm_v128_store(outs + i, results);
576
+ v128_t angles_f32x4 = wasm_v128_load(ins + i);
577
+ v128_t results_f32x4 = nk_f32x4_cos_v128relaxed_(angles_f32x4);
578
+ wasm_v128_store(outs + i, results_f32x4);
578
579
  }
579
580
  if (i < n) {
580
581
  nk_size_t remaining = n - i;
@@ -589,9 +590,9 @@ NK_PUBLIC void nk_each_cos_f32_v128relaxed(nk_f32_t const *ins, nk_size_t n, nk_
589
590
  NK_PUBLIC void nk_each_atan_f32_v128relaxed(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
590
591
  nk_size_t i = 0;
591
592
  for (; i + 4 <= n; i += 4) {
592
- v128_t values = wasm_v128_load(ins + i);
593
- v128_t results = nk_f32x4_atan_v128relaxed_(values);
594
- wasm_v128_store(outs + i, results);
593
+ v128_t values_f32x4 = wasm_v128_load(ins + i);
594
+ v128_t results_f32x4 = nk_f32x4_atan_v128relaxed_(values_f32x4);
595
+ wasm_v128_store(outs + i, results_f32x4);
595
596
  }
596
597
  if (i < n) {
597
598
  nk_size_t remaining = n - i;
@@ -606,9 +607,9 @@ NK_PUBLIC void nk_each_atan_f32_v128relaxed(nk_f32_t const *ins, nk_size_t n, nk
606
607
  NK_PUBLIC void nk_each_sin_f64_v128relaxed(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
607
608
  nk_size_t i = 0;
608
609
  for (; i + 2 <= n; i += 2) {
609
- v128_t angles = wasm_v128_load(ins + i);
610
- v128_t results = nk_f64x2_sin_v128relaxed_(angles);
611
- wasm_v128_store(outs + i, results);
610
+ v128_t angles_f64x2 = wasm_v128_load(ins + i);
611
+ v128_t results_f64x2 = nk_f64x2_sin_v128relaxed_(angles_f64x2);
612
+ wasm_v128_store(outs + i, results_f64x2);
612
613
  }
613
614
  if (i < n) {
614
615
  nk_size_t remaining = n - i;
@@ -623,9 +624,9 @@ NK_PUBLIC void nk_each_sin_f64_v128relaxed(nk_f64_t const *ins, nk_size_t n, nk_
623
624
  NK_PUBLIC void nk_each_cos_f64_v128relaxed(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
624
625
  nk_size_t i = 0;
625
626
  for (; i + 2 <= n; i += 2) {
626
- v128_t angles = wasm_v128_load(ins + i);
627
- v128_t results = nk_f64x2_cos_v128relaxed_(angles);
628
- wasm_v128_store(outs + i, results);
627
+ v128_t angles_f64x2 = wasm_v128_load(ins + i);
628
+ v128_t results_f64x2 = nk_f64x2_cos_v128relaxed_(angles_f64x2);
629
+ wasm_v128_store(outs + i, results_f64x2);
629
630
  }
630
631
  if (i < n) {
631
632
  nk_size_t remaining = n - i;
@@ -640,9 +641,9 @@ NK_PUBLIC void nk_each_cos_f64_v128relaxed(nk_f64_t const *ins, nk_size_t n, nk_
640
641
  NK_PUBLIC void nk_each_atan_f64_v128relaxed(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
641
642
  nk_size_t i = 0;
642
643
  for (; i + 2 <= n; i += 2) {
643
- v128_t values = wasm_v128_load(ins + i);
644
- v128_t results = nk_f64x2_atan_v128relaxed_(values);
645
- wasm_v128_store(outs + i, results);
644
+ v128_t values_f64x2 = wasm_v128_load(ins + i);
645
+ v128_t results_f64x2 = nk_f64x2_atan_v128relaxed_(values_f64x2);
646
+ wasm_v128_store(outs + i, results_f64x2);
646
647
  }
647
648
  if (i < n) {
648
649
  nk_size_t remaining = n - i;