numkong 7.0.0 → 7.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +239 -122
- package/binding.gyp +25 -491
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
|
@@ -52,499 +52,500 @@ extern "C" {
|
|
|
52
52
|
|
|
53
53
|
NK_INTERNAL v128_t nk_f32x4_sin_v128relaxed_(v128_t const angles_radians) {
    // Approximates sin(x) lane-wise for 4 f32 lanes using Payne-style argument
    // reduction to [-pi/2, pi/2] followed by an odd polynomial (degree 7 in x,
    // i.e. x + x^3 * P(x^2)). Accuracy depends on the 3-term coefficients below;
    // intended for small/moderate |x| — TODO confirm the supported input range.
    // Constants for argument reduction
    v128_t const pi_f32x4 = wasm_f32x4_splat(3.14159265358979323846f);
    v128_t const pi_reciprocal_f32x4 = wasm_f32x4_splat(0.31830988618379067154f);
    // Minimax polynomial coefficients for sin on the reduced interval.
    v128_t const coeff_5_f32x4 = wasm_f32x4_splat(-0.0001881748176f);
    v128_t const coeff_3_f32x4 = wasm_f32x4_splat(+0.008323502727f);
    v128_t const coeff_1_f32x4 = wasm_f32x4_splat(-0.1666651368f);

    // Compute (multiples_of_pi_f32x4) = round(angle / pi_f32x4) using nearest rounding
    v128_t quotients_f32x4 = wasm_f32x4_mul(angles_radians, pi_reciprocal_f32x4);
    v128_t rounded_quotients_f32x4 = wasm_f32x4_nearest(quotients_f32x4);
    // relaxed_trunc: 1 instruction (cvttps2dq) vs 7 (with NaN/overflow fixup) on x86.
    // Safe because rounded_quotients_f32x4 are small integers from nearest(), never NaN or out of i32 range.
    v128_t multiples_of_pi_f32x4 = wasm_i32x4_relaxed_trunc_f32x4(rounded_quotients_f32x4);

    // Reduce the angle: angle - rounded_quotients_f32x4 * pi_f32x4
    // vfmsq_f32(acc, a, b) = acc - a*b -> wasm_f32x4_relaxed_nmadd(a, b, acc)
    v128_t const angles_f32x4 = wasm_f32x4_relaxed_nmadd(rounded_quotients_f32x4, pi_f32x4, angles_radians);
    v128_t const angles_squared_f32x4 = wasm_f32x4_mul(angles_f32x4, angles_f32x4);
    v128_t const angles_cubed_f32x4 = wasm_f32x4_mul(angles_f32x4, angles_squared_f32x4);

    // Compute the polynomial approximation (Horner's method in x^2)
    // vfmaq_f32(acc, a, b) = acc + a*b -> wasm_f32x4_relaxed_madd(a, b, acc)
    v128_t polynomials_f32x4 = coeff_5_f32x4;
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, angles_squared_f32x4, coeff_3_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, angles_squared_f32x4, coeff_1_f32x4);
    // sin(x) ~ x + x^3 * P(x^2)
    v128_t results_f32x4 = wasm_f32x4_relaxed_madd(angles_cubed_f32x4, polynomials_f32x4, angles_f32x4);

    // If multiples_of_pi_f32x4 is odd, flip the sign (sin(x + k*pi) = (-1)^k * sin(x))
    v128_t parity_i32x4 = wasm_v128_and(multiples_of_pi_f32x4, wasm_i32x4_splat(1));
    v128_t odd_mask_i32x4 = wasm_i32x4_eq(parity_i32x4, wasm_i32x4_splat(1));
    v128_t negated_f32x4 = wasm_f32x4_neg(results_f32x4);
    // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
    // Safe because mask is from comparison (all-ones or all-zeros per lane).
    results_f32x4 = wasm_i32x4_relaxed_laneselect(negated_f32x4, results_f32x4, odd_mask_i32x4);
    return results_f32x4;
}
|
|
90
90
|
|
|
91
91
|
NK_INTERNAL v128_t nk_f32x4_cos_v128relaxed_(v128_t const angles_radians) {
    // Approximates cos(x) lane-wise for 4 f32 lanes by evaluating the same sine
    // polynomial as nk_f32x4_sin_v128relaxed_ on the phase-shifted argument
    // (x - pi/2), with the parity-based sign flip inverted (even k negates).
    // Constants for argument reduction
    v128_t const pi_f32x4 = wasm_f32x4_splat(3.14159265358979323846f);
    v128_t const pi_half_f32x4 = wasm_f32x4_splat(1.57079632679489661923f);
    v128_t const pi_reciprocal_f32x4 = wasm_f32x4_splat(0.31830988618379067154f);
    // Minimax polynomial coefficients shared with the sine kernel.
    v128_t const coeff_5_f32x4 = wasm_f32x4_splat(-0.0001881748176f);
    v128_t const coeff_3_f32x4 = wasm_f32x4_splat(+0.008323502727f);
    v128_t const coeff_1_f32x4 = wasm_f32x4_splat(-0.1666651368f);

    // Compute round((angle / pi_f32x4) - 0.5)
    v128_t const neg_half_f32x4 = wasm_f32x4_splat(-0.5f);
    v128_t quotients_f32x4 = wasm_f32x4_relaxed_madd(angles_radians, pi_reciprocal_f32x4, neg_half_f32x4);
    v128_t rounded_quotients_f32x4 = wasm_f32x4_nearest(quotients_f32x4);
    // relaxed_trunc: 1 instruction (cvttps2dq) vs 7 (with NaN/overflow fixup) on x86.
    // Safe because rounded_quotients_f32x4 are small integers from nearest(), never NaN or out of i32 range.
    v128_t multiples_of_pi_f32x4 = wasm_i32x4_relaxed_trunc_f32x4(rounded_quotients_f32x4);

    // Reduce the angle: (angle - pi_f32x4/2) - rounded_quotients_f32x4 * pi_f32x4
    v128_t shifted_f32x4 = wasm_f32x4_sub(angles_radians, pi_half_f32x4);
    v128_t const angles_f32x4 = wasm_f32x4_relaxed_nmadd(rounded_quotients_f32x4, pi_f32x4, shifted_f32x4);
    v128_t const angles_squared_f32x4 = wasm_f32x4_mul(angles_f32x4, angles_f32x4);
    v128_t const angles_cubed_f32x4 = wasm_f32x4_mul(angles_f32x4, angles_squared_f32x4);

    // Compute the polynomial approximation (Horner's method in x^2)
    v128_t polynomials_f32x4 = coeff_5_f32x4;
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, angles_squared_f32x4, coeff_3_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, angles_squared_f32x4, coeff_1_f32x4);
    // sin(reduced) ~ x + x^3 * P(x^2); cos(angle) follows from the phase shift above.
    v128_t results_f32x4 = wasm_f32x4_relaxed_madd(angles_cubed_f32x4, polynomials_f32x4, angles_f32x4);

    // If multiples_of_pi_f32x4 is even, flip the sign
    v128_t parity_i32x4 = wasm_v128_and(multiples_of_pi_f32x4, wasm_i32x4_splat(1));
    v128_t even_mask_i32x4 = wasm_i32x4_eq(parity_i32x4, wasm_i32x4_splat(0));
    v128_t negated_f32x4 = wasm_f32x4_neg(results_f32x4);
    // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
    // Safe because mask is from comparison (all-ones or all-zeros per lane).
    results_f32x4 = wasm_i32x4_relaxed_laneselect(negated_f32x4, results_f32x4, even_mask_i32x4);
    return results_f32x4;
}
|
|
129
129
|
|
|
130
130
|
NK_INTERNAL v128_t nk_f32x4_atan_v128relaxed_(v128_t const inputs) {
    // Approximates atan(x) lane-wise for 4 f32 lanes:
    //   1. work on |x|, remembering the sign;
    //   2. if |x| > 1, evaluate on 1/|x| and use atan(x) = pi/2 - atan(1/x);
    //   3. odd polynomial x + x^3 * P(x^2) via Horner on the reduced value;
    //   4. undo the reciprocal and sign reductions with lane selects.
    // Polynomial coefficients for atan approximation (8 terms)
    v128_t const coeff_8_f32x4 = wasm_f32x4_splat(-0.333331018686294555664062f);
    v128_t const coeff_7_f32x4 = wasm_f32x4_splat(+0.199926957488059997558594f);
    v128_t const coeff_6_f32x4 = wasm_f32x4_splat(-0.142027363181114196777344f);
    v128_t const coeff_5_f32x4 = wasm_f32x4_splat(+0.106347933411598205566406f);
    v128_t const coeff_4_f32x4 = wasm_f32x4_splat(-0.0748900920152664184570312f);
    v128_t const coeff_3_f32x4 = wasm_f32x4_splat(+0.0425049886107444763183594f);
    v128_t const coeff_2_f32x4 = wasm_f32x4_splat(-0.0159569028764963150024414f);
    v128_t const coeff_1_f32x4 = wasm_f32x4_splat(+0.00282363896258175373077393f);
    v128_t const half_pi_f32x4 = wasm_f32x4_splat(1.5707963267948966f);

    // Detect negative values_f32x4 and take absolute value
    v128_t const zeros_f32x4 = wasm_f32x4_splat(0);
    v128_t negative_mask_f32x4 = wasm_f32x4_lt(inputs, zeros_f32x4);
    v128_t values_f32x4 = wasm_f32x4_abs(inputs);

    // Check if values_f32x4 > 1 (need reciprocal)
    v128_t reciprocal_mask_f32x4 = wasm_f32x4_gt(values_f32x4, wasm_f32x4_splat(1.0f));

    // No fast reciprocal in WASM — use division
    v128_t recip_f32x4 = wasm_f32x4_div(wasm_f32x4_splat(1.0f), values_f32x4);
    // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
    // Safe because mask is from comparison (all-ones or all-zeros_f32x4 per lane).
    values_f32x4 = wasm_i32x4_relaxed_laneselect(recip_f32x4, values_f32x4, reciprocal_mask_f32x4);

    // Compute powers
    v128_t const values_squared_f32x4 = wasm_f32x4_mul(values_f32x4, values_f32x4);
    v128_t const values_cubed_f32x4 = wasm_f32x4_mul(values_f32x4, values_squared_f32x4);

    // Polynomial evaluation using Horner's method
    v128_t polynomials_f32x4 = coeff_1_f32x4;
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, values_squared_f32x4, coeff_2_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, values_squared_f32x4, coeff_3_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, values_squared_f32x4, coeff_4_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, values_squared_f32x4, coeff_5_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, values_squared_f32x4, coeff_6_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, values_squared_f32x4, coeff_7_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, values_squared_f32x4, coeff_8_f32x4);

    // Compute result_f32x4: atan(x) ~ x + x^3 * P(x^2)
    v128_t result_f32x4 = wasm_f32x4_relaxed_madd(values_cubed_f32x4, polynomials_f32x4, values_f32x4);

    // Adjust for reciprocal: result_f32x4 = pi/2 - result_f32x4
    v128_t adjusted_f32x4 = wasm_f32x4_sub(half_pi_f32x4, result_f32x4);
    // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
    // Safe because mask is from comparison (all-ones or all-zeros_f32x4 per lane).
    result_f32x4 = wasm_i32x4_relaxed_laneselect(adjusted_f32x4, result_f32x4, reciprocal_mask_f32x4);

    // Adjust for negative: result_f32x4 = -result_f32x4
    v128_t negated_f32x4 = wasm_f32x4_neg(result_f32x4);
    // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
    // Safe because mask is from comparison (all-ones or all-zeros_f32x4 per lane).
    result_f32x4 = wasm_i32x4_relaxed_laneselect(negated_f32x4, result_f32x4, negative_mask_f32x4);
    return result_f32x4;
}
|
|
186
186
|
|
|
187
187
|
NK_INTERNAL v128_t nk_f32x4_atan2_v128relaxed_(v128_t const ys_inputs, v128_t const xs_inputs) {
    // Approximates atan2(y, x) lane-wise for 4 f32 lanes: reduce to a proper
    // fraction |y|/|x| (swapping operands when |y| > |x|), evaluate the shared
    // atan polynomial, then add a quadrant offset (multiples of pi/2) and
    // restore signs by XOR-ing in the sign bits of the original x and y.
    // NOTE(review): x == 0 lanes divide by zero and rely on IEEE inf/NaN
    // propagation through the blends — confirm intended behavior for (0, 0).
    // Polynomial coefficients (same as atan)
    v128_t const coeff_8_f32x4 = wasm_f32x4_splat(-0.333331018686294555664062f);
    v128_t const coeff_7_f32x4 = wasm_f32x4_splat(+0.199926957488059997558594f);
    v128_t const coeff_6_f32x4 = wasm_f32x4_splat(-0.142027363181114196777344f);
    v128_t const coeff_5_f32x4 = wasm_f32x4_splat(+0.106347933411598205566406f);
    v128_t const coeff_4_f32x4 = wasm_f32x4_splat(-0.0748900920152664184570312f);
    v128_t const coeff_3_f32x4 = wasm_f32x4_splat(+0.0425049886107444763183594f);
    v128_t const coeff_2_f32x4 = wasm_f32x4_splat(-0.0159569028764963150024414f);
    v128_t const coeff_1_f32x4 = wasm_f32x4_splat(+0.00282363896258175373077393f);
    v128_t const pi_f32x4 = wasm_f32x4_splat(3.14159265358979323846f);
    v128_t const half_pi_f32x4 = wasm_f32x4_splat(1.5707963267948966f);
    v128_t const zeros_f32x4 = wasm_f32x4_splat(0);

    // Quadrant adjustments - take absolute values
    v128_t xs_negative_mask_f32x4 = wasm_f32x4_lt(xs_inputs, zeros_f32x4);
    v128_t xs_f32x4 = wasm_f32x4_abs(xs_inputs);
    v128_t ys_f32x4 = wasm_f32x4_abs(ys_inputs);

    // Ensure proper fraction where numerator < denominator
    v128_t swap_mask_f32x4 = wasm_f32x4_gt(ys_f32x4, xs_f32x4);
    v128_t temps_f32x4 = xs_f32x4;
    // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
    // Safe because mask is from comparison (all-ones or all-zeros_f32x4 per lane).
    xs_f32x4 = wasm_i32x4_relaxed_laneselect(ys_f32x4, xs_f32x4, swap_mask_f32x4);
    // Swapped lanes use -x as the numerator so atan(-x/y) = pi/2 - atan(y/x) works out below.
    ys_f32x4 = wasm_i32x4_relaxed_laneselect(wasm_f32x4_neg(temps_f32x4), ys_f32x4, swap_mask_f32x4);

    // Division for ratio_f32x4: ratio_f32x4 = ys_f32x4 / xs_f32x4
    v128_t const ratio_f32x4 = wasm_f32x4_div(ys_f32x4, xs_f32x4);
    v128_t const ratio_squared_f32x4 = wasm_f32x4_mul(ratio_f32x4, ratio_f32x4);
    v128_t const ratio_cubed_f32x4 = wasm_f32x4_mul(ratio_f32x4, ratio_squared_f32x4);

    // Polynomial evaluation using Horner's method
    v128_t polynomials_f32x4 = coeff_1_f32x4;
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, ratio_squared_f32x4, coeff_2_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, ratio_squared_f32x4, coeff_3_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, ratio_squared_f32x4, coeff_4_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, ratio_squared_f32x4, coeff_5_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, ratio_squared_f32x4, coeff_6_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, ratio_squared_f32x4, coeff_7_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, ratio_squared_f32x4, coeff_8_f32x4);

    // Compute the result: atan(r) ~ r + r^3 * P(r^2)
    v128_t results_f32x4 = wasm_f32x4_relaxed_madd(ratio_cubed_f32x4, polynomials_f32x4, ratio_f32x4);

    // Compute quadrant_f32x4 value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
    // -2 for x<0 && !swap, -1 for x<0 && swap
    v128_t quadrant_f32x4 = wasm_f32x4_splat(0.0f);
    v128_t neg_two_f32x4 = wasm_f32x4_splat(-2.0f);
    // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
    // Safe because mask is from comparison (all-ones or all-zeros_f32x4 per lane).
    quadrant_f32x4 = wasm_i32x4_relaxed_laneselect(neg_two_f32x4, quadrant_f32x4, xs_negative_mask_f32x4);
    v128_t quadrant_incremented_f32x4 = wasm_f32x4_add(quadrant_f32x4, wasm_f32x4_splat(1.0f));
    quadrant_f32x4 = wasm_i32x4_relaxed_laneselect(quadrant_incremented_f32x4, quadrant_f32x4, swap_mask_f32x4);

    // Adjust for quadrant_f32x4: result += quadrant_f32x4 * pi_f32x4/2
    results_f32x4 = wasm_f32x4_relaxed_madd(quadrant_f32x4, half_pi_f32x4, results_f32x4);

    // Transfer sign from x and y by XOR with sign bits
    v128_t sign_mask_f32x4 = wasm_f32x4_splat(-0.0f);
    v128_t xs_sign_f32x4 = wasm_v128_and(xs_inputs, sign_mask_f32x4);
    v128_t ys_sign_f32x4 = wasm_v128_and(ys_inputs, sign_mask_f32x4);
    results_f32x4 = wasm_v128_xor(results_f32x4, xs_sign_f32x4);
    results_f32x4 = wasm_v128_xor(results_f32x4, ys_sign_f32x4);

    return results_f32x4;
}
|
|
254
254
|
|
|
255
255
|
NK_INTERNAL v128_t nk_f64x2_sin_v128relaxed_(v128_t const angles_radians) {
|
|
256
256
|
// Constants for argument reduction
|
|
257
|
-
v128_t const
|
|
258
|
-
v128_t const
|
|
259
|
-
v128_t const
|
|
257
|
+
v128_t const pi_high_f64x2 = wasm_f64x2_splat(3.141592653589793116);
|
|
258
|
+
v128_t const pi_low_f64x2 = wasm_f64x2_splat(1.2246467991473532072e-16);
|
|
259
|
+
v128_t const pi_reciprocal_f64x2 = wasm_f64x2_splat(0.31830988618379067154);
|
|
260
260
|
|
|
261
261
|
// Polynomial coefficients for sine approximation
|
|
262
|
-
v128_t const
|
|
263
|
-
v128_t const
|
|
264
|
-
v128_t const
|
|
265
|
-
v128_t const
|
|
266
|
-
v128_t const
|
|
267
|
-
v128_t const
|
|
268
|
-
v128_t const
|
|
269
|
-
v128_t const
|
|
270
|
-
v128_t const
|
|
262
|
+
v128_t const coeff_0_f64x2 = wasm_f64x2_splat(+0.00833333333333332974823815);
|
|
263
|
+
v128_t const coeff_1_f64x2 = wasm_f64x2_splat(-0.000198412698412696162806809);
|
|
264
|
+
v128_t const coeff_2_f64x2 = wasm_f64x2_splat(+2.75573192239198747630416e-06);
|
|
265
|
+
v128_t const coeff_3_f64x2 = wasm_f64x2_splat(-2.50521083763502045810755e-08);
|
|
266
|
+
v128_t const coeff_4_f64x2 = wasm_f64x2_splat(+1.60590430605664501629054e-10);
|
|
267
|
+
v128_t const coeff_5_f64x2 = wasm_f64x2_splat(-7.64712219118158833288484e-13);
|
|
268
|
+
v128_t const coeff_6_f64x2 = wasm_f64x2_splat(+2.81009972710863200091251e-15);
|
|
269
|
+
v128_t const coeff_7_f64x2 = wasm_f64x2_splat(-7.97255955009037868891952e-18);
|
|
270
|
+
v128_t const coeff_8_f64x2 = wasm_f64x2_splat(-0.166666666666666657414808);
|
|
271
271
|
|
|
272
272
|
// Compute round(angle / pi)
|
|
273
|
-
v128_t const
|
|
274
|
-
v128_t
|
|
273
|
+
v128_t const quotients_f64x2 = wasm_f64x2_mul(angles_radians, pi_reciprocal_f64x2);
|
|
274
|
+
v128_t rounded_quotients_f64x2 = wasm_f64x2_nearest(quotients_f64x2);
|
|
275
275
|
// relaxed_trunc: 1 instruction (cvttpd2dq) vs 7 (with NaN/overflow fixup) on x86.
|
|
276
|
-
// Safe because
|
|
277
|
-
v128_t
|
|
276
|
+
// Safe because rounded_quotients_f64x2 are small integers from nearest(), never NaN or out of i32 range.
|
|
277
|
+
v128_t multiples_i32_f64x2 = wasm_i32x4_relaxed_trunc_f64x2_zero(rounded_quotients_f64x2);
|
|
278
278
|
|
|
279
|
-
// Two-step Cody-Waite reduction: angle - rounded *
|
|
280
|
-
v128_t
|
|
281
|
-
|
|
282
|
-
|
|
279
|
+
// Two-step Cody-Waite reduction: angle - rounded * pi_high_f64x2 - rounded * pi_low_f64x2
|
|
280
|
+
v128_t angles_f64x2 = angles_radians;
|
|
281
|
+
angles_f64x2 = wasm_f64x2_relaxed_nmadd(rounded_quotients_f64x2, pi_high_f64x2, angles_f64x2);
|
|
282
|
+
angles_f64x2 = wasm_f64x2_relaxed_nmadd(rounded_quotients_f64x2, pi_low_f64x2, angles_f64x2);
|
|
283
283
|
|
|
284
284
|
// Check parity in i32, then widen to i64 mask for laneselect
|
|
285
|
-
v128_t
|
|
286
|
-
v128_t
|
|
285
|
+
v128_t parity_i32_i32x4 = wasm_v128_and(multiples_i32_f64x2, wasm_i32x4_splat(1));
|
|
286
|
+
v128_t odd_i32_i32x4 = wasm_i32x4_eq(parity_i32_i32x4, wasm_i32x4_splat(1));
|
|
287
287
|
// Widen: lane0 of i32 -> lanes 0-1 of i64, lane1 -> lanes 2-3
|
|
288
288
|
// Shuffle i32 lanes [0,0,1,1] to broadcast each i32 parity into both halves of each i64
|
|
289
|
-
v128_t
|
|
290
|
-
v128_t
|
|
289
|
+
v128_t odd_mask_i32x4 = wasm_i32x4_shuffle(odd_i32_i32x4, odd_i32_i32x4, 0, 0, 1, 1);
|
|
290
|
+
v128_t negated_angles_f64x2 = wasm_f64x2_neg(angles_f64x2);
|
|
291
291
|
// relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
|
|
292
292
|
// Safe because mask is lane-granular at i64 width (all-ones or all-zeros per 64-bit lane).
|
|
293
|
-
|
|
293
|
+
angles_f64x2 = wasm_i64x2_relaxed_laneselect(negated_angles_f64x2, angles_f64x2, odd_mask_i32x4);
|
|
294
294
|
|
|
295
|
-
v128_t const
|
|
296
|
-
v128_t const
|
|
297
|
-
v128_t const
|
|
298
|
-
v128_t const
|
|
295
|
+
v128_t const angles_squared_f64x2 = wasm_f64x2_mul(angles_f64x2, angles_f64x2);
|
|
296
|
+
v128_t const angles_cubed_f64x2 = wasm_f64x2_mul(angles_f64x2, angles_squared_f64x2);
|
|
297
|
+
v128_t const angles_quadratic_f64x2 = wasm_f64x2_mul(angles_squared_f64x2, angles_squared_f64x2);
|
|
298
|
+
v128_t const angles_octic_f64x2 = wasm_f64x2_mul(angles_quadratic_f64x2, angles_quadratic_f64x2);
|
|
299
299
|
|
|
300
300
|
// Compute polynomial terms using Estrin's scheme for better ILP
|
|
301
|
-
v128_t const
|
|
302
|
-
v128_t const
|
|
303
|
-
v128_t const
|
|
301
|
+
v128_t const poly_67_f64x2 = wasm_f64x2_relaxed_madd(angles_squared_f64x2, coeff_7_f64x2, coeff_6_f64x2);
|
|
302
|
+
v128_t const poly_45_f64x2 = wasm_f64x2_relaxed_madd(angles_squared_f64x2, coeff_5_f64x2, coeff_4_f64x2);
|
|
303
|
+
v128_t const poly_4567_f64x2 = wasm_f64x2_relaxed_madd(angles_quadratic_f64x2, poly_67_f64x2, poly_45_f64x2);
|
|
304
304
|
|
|
305
|
-
v128_t const
|
|
306
|
-
v128_t const
|
|
307
|
-
v128_t const
|
|
305
|
+
v128_t const poly_23_f64x2 = wasm_f64x2_relaxed_madd(angles_squared_f64x2, coeff_3_f64x2, coeff_2_f64x2);
|
|
306
|
+
v128_t const poly_01_f64x2 = wasm_f64x2_relaxed_madd(angles_squared_f64x2, coeff_1_f64x2, coeff_0_f64x2);
|
|
307
|
+
v128_t const poly_0123_f64x2 = wasm_f64x2_relaxed_madd(angles_quadratic_f64x2, poly_23_f64x2, poly_01_f64x2);
|
|
308
308
|
|
|
309
309
|
// Combine polynomial terms
|
|
310
|
-
v128_t
|
|
311
|
-
|
|
312
|
-
|
|
310
|
+
v128_t results_f64x2 = wasm_f64x2_relaxed_madd(angles_octic_f64x2, poly_4567_f64x2, poly_0123_f64x2);
|
|
311
|
+
results_f64x2 = wasm_f64x2_relaxed_madd(results_f64x2, angles_squared_f64x2, coeff_8_f64x2);
|
|
312
|
+
results_f64x2 = wasm_f64x2_relaxed_madd(results_f64x2, angles_cubed_f64x2, angles_f64x2);
|
|
313
313
|
|
|
314
314
|
// Handle zero input (preserve sign of zero)
|
|
315
|
-
v128_t const
|
|
315
|
+
v128_t const non_zero_mask_f64x2 = wasm_f64x2_eq(angles_radians, wasm_f64x2_splat(0));
|
|
316
316
|
// relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
|
|
317
317
|
// Safe because mask is from comparison (all-ones or all-zeros per lane).
|
|
318
|
-
|
|
319
|
-
return
|
|
318
|
+
results_f64x2 = wasm_i64x2_relaxed_laneselect(angles_radians, results_f64x2, non_zero_mask_f64x2);
|
|
319
|
+
return results_f64x2;
|
|
320
320
|
}
|
|
321
321
|
|
|
322
322
|
NK_INTERNAL v128_t nk_f64x2_cos_v128relaxed_(v128_t const angles_radians) {
|
|
323
323
|
// Constants for argument reduction
|
|
324
|
-
v128_t const
|
|
325
|
-
v128_t const
|
|
326
|
-
v128_t const
|
|
324
|
+
v128_t const pi_high_half_f64x2 = wasm_f64x2_splat(3.141592653589793116 * 0.5);
|
|
325
|
+
v128_t const pi_low_half_f64x2 = wasm_f64x2_splat(1.2246467991473532072e-16 * 0.5);
|
|
326
|
+
v128_t const pi_reciprocal_f64x2 = wasm_f64x2_splat(0.31830988618379067154);
|
|
327
327
|
|
|
328
328
|
// Polynomial coefficients for cosine approximation
|
|
329
|
-
v128_t const
|
|
330
|
-
v128_t const
|
|
331
|
-
v128_t const
|
|
332
|
-
v128_t const
|
|
333
|
-
v128_t const
|
|
334
|
-
v128_t const
|
|
335
|
-
v128_t const
|
|
336
|
-
v128_t const
|
|
337
|
-
v128_t const
|
|
329
|
+
v128_t const coeff_0_f64x2 = wasm_f64x2_splat(+0.00833333333333332974823815);
|
|
330
|
+
v128_t const coeff_1_f64x2 = wasm_f64x2_splat(-0.000198412698412696162806809);
|
|
331
|
+
v128_t const coeff_2_f64x2 = wasm_f64x2_splat(+2.75573192239198747630416e-06);
|
|
332
|
+
v128_t const coeff_3_f64x2 = wasm_f64x2_splat(-2.50521083763502045810755e-08);
|
|
333
|
+
v128_t const coeff_4_f64x2 = wasm_f64x2_splat(+1.60590430605664501629054e-10);
|
|
334
|
+
v128_t const coeff_5_f64x2 = wasm_f64x2_splat(-7.64712219118158833288484e-13);
|
|
335
|
+
v128_t const coeff_6_f64x2 = wasm_f64x2_splat(+2.81009972710863200091251e-15);
|
|
336
|
+
v128_t const coeff_7_f64x2 = wasm_f64x2_splat(-7.97255955009037868891952e-18);
|
|
337
|
+
v128_t const coeff_8_f64x2 = wasm_f64x2_splat(-0.166666666666666657414808);
|
|
338
338
|
|
|
339
339
|
// Compute 2 * round(angle / pi - 0.5) + 1
|
|
340
|
-
v128_t const
|
|
341
|
-
v128_t const
|
|
342
|
-
v128_t const
|
|
343
|
-
v128_t const
|
|
340
|
+
v128_t const neg_half_f64x2 = wasm_f64x2_splat(-0.5);
|
|
341
|
+
v128_t const quotients_f64x2 = wasm_f64x2_relaxed_madd(angles_radians, pi_reciprocal_f64x2, neg_half_f64x2);
|
|
342
|
+
v128_t const rounded_f64x2 = wasm_f64x2_nearest(quotients_f64x2);
|
|
343
|
+
v128_t const rounded_quotients_f64x2 = wasm_f64x2_relaxed_madd(wasm_f64x2_splat(2.0), rounded_f64x2,
|
|
344
|
+
wasm_f64x2_splat(1.0));
|
|
344
345
|
// relaxed_trunc: 1 instruction (cvttpd2dq) vs 7 (with NaN/overflow fixup) on x86.
|
|
345
|
-
// Safe because
|
|
346
|
-
v128_t
|
|
346
|
+
// Safe because rounded_quotients_f64x2 are small integers from nearest(), never NaN or out of i32 range.
|
|
347
|
+
v128_t quotients_i32_f64x2 = wasm_i32x4_relaxed_trunc_f64x2_zero(rounded_quotients_f64x2);
|
|
347
348
|
|
|
348
349
|
// Two-step Cody-Waite reduction
|
|
349
|
-
v128_t
|
|
350
|
-
|
|
351
|
-
|
|
350
|
+
v128_t angles_f64x2 = angles_radians;
|
|
351
|
+
angles_f64x2 = wasm_f64x2_relaxed_nmadd(rounded_quotients_f64x2, pi_high_half_f64x2, angles_f64x2);
|
|
352
|
+
angles_f64x2 = wasm_f64x2_relaxed_nmadd(rounded_quotients_f64x2, pi_low_half_f64x2, angles_f64x2);
|
|
352
353
|
|
|
353
354
|
// Check bit 1 in i32, then widen to i64 mask for laneselect
|
|
354
|
-
v128_t
|
|
355
|
-
v128_t
|
|
356
|
-
v128_t
|
|
357
|
-
v128_t
|
|
355
|
+
v128_t bit2_i32_i32x4 = wasm_v128_and(quotients_i32_f64x2, wasm_i32x4_splat(2));
|
|
356
|
+
v128_t flip_i32_i32x4 = wasm_i32x4_eq(bit2_i32_i32x4, wasm_i32x4_splat(0));
|
|
357
|
+
v128_t flip_mask_i32x4 = wasm_i32x4_shuffle(flip_i32_i32x4, flip_i32_i32x4, 0, 0, 1, 1);
|
|
358
|
+
v128_t negated_angles_f64x2 = wasm_f64x2_neg(angles_f64x2);
|
|
358
359
|
// relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
|
|
359
360
|
// Safe because mask is lane-granular at i64 width (all-ones or all-zeros per 64-bit lane).
|
|
360
|
-
|
|
361
|
+
angles_f64x2 = wasm_i64x2_relaxed_laneselect(negated_angles_f64x2, angles_f64x2, flip_mask_i32x4);
|
|
361
362
|
|
|
362
|
-
v128_t const
|
|
363
|
-
v128_t const
|
|
364
|
-
v128_t const
|
|
365
|
-
v128_t const
|
|
363
|
+
v128_t const angles_squared_f64x2 = wasm_f64x2_mul(angles_f64x2, angles_f64x2);
|
|
364
|
+
v128_t const angles_cubed_f64x2 = wasm_f64x2_mul(angles_f64x2, angles_squared_f64x2);
|
|
365
|
+
v128_t const angles_quadratic_f64x2 = wasm_f64x2_mul(angles_squared_f64x2, angles_squared_f64x2);
|
|
366
|
+
v128_t const angles_octic_f64x2 = wasm_f64x2_mul(angles_quadratic_f64x2, angles_quadratic_f64x2);
|
|
366
367
|
|
|
367
368
|
// Compute polynomial terms using Estrin's scheme
|
|
368
|
-
v128_t const
|
|
369
|
-
v128_t const
|
|
370
|
-
v128_t const
|
|
369
|
+
v128_t const poly_67_f64x2 = wasm_f64x2_relaxed_madd(angles_squared_f64x2, coeff_7_f64x2, coeff_6_f64x2);
|
|
370
|
+
v128_t const poly_45_f64x2 = wasm_f64x2_relaxed_madd(angles_squared_f64x2, coeff_5_f64x2, coeff_4_f64x2);
|
|
371
|
+
v128_t const poly_4567_f64x2 = wasm_f64x2_relaxed_madd(angles_quadratic_f64x2, poly_67_f64x2, poly_45_f64x2);
|
|
371
372
|
|
|
372
|
-
v128_t const
|
|
373
|
-
v128_t const
|
|
374
|
-
v128_t const
|
|
373
|
+
v128_t const poly_23_f64x2 = wasm_f64x2_relaxed_madd(angles_squared_f64x2, coeff_3_f64x2, coeff_2_f64x2);
|
|
374
|
+
v128_t const poly_01_f64x2 = wasm_f64x2_relaxed_madd(angles_squared_f64x2, coeff_1_f64x2, coeff_0_f64x2);
|
|
375
|
+
v128_t const poly_0123_f64x2 = wasm_f64x2_relaxed_madd(angles_quadratic_f64x2, poly_23_f64x2, poly_01_f64x2);
|
|
375
376
|
|
|
376
377
|
// Combine polynomial terms
|
|
377
|
-
v128_t
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
return
|
|
378
|
+
v128_t results_f64x2 = wasm_f64x2_relaxed_madd(angles_octic_f64x2, poly_4567_f64x2, poly_0123_f64x2);
|
|
379
|
+
results_f64x2 = wasm_f64x2_relaxed_madd(results_f64x2, angles_squared_f64x2, coeff_8_f64x2);
|
|
380
|
+
results_f64x2 = wasm_f64x2_relaxed_madd(results_f64x2, angles_cubed_f64x2, angles_f64x2);
|
|
381
|
+
return results_f64x2;
|
|
381
382
|
}
|
|
382
383
|
|
|
383
384
|
NK_INTERNAL v128_t nk_f64x2_atan_v128relaxed_(v128_t const inputs) {
|
|
384
385
|
// Polynomial coefficients for atan approximation (19 terms)
|
|
385
|
-
v128_t const
|
|
386
|
-
v128_t const
|
|
387
|
-
v128_t const
|
|
388
|
-
v128_t const
|
|
389
|
-
v128_t const
|
|
390
|
-
v128_t const
|
|
391
|
-
v128_t const
|
|
392
|
-
v128_t const
|
|
393
|
-
v128_t const
|
|
394
|
-
v128_t const
|
|
395
|
-
v128_t const
|
|
396
|
-
v128_t const
|
|
397
|
-
v128_t const
|
|
398
|
-
v128_t const
|
|
399
|
-
v128_t const
|
|
400
|
-
v128_t const
|
|
401
|
-
v128_t const
|
|
402
|
-
v128_t const
|
|
403
|
-
v128_t const
|
|
404
|
-
v128_t const
|
|
405
|
-
v128_t const
|
|
386
|
+
v128_t const coeff_19_f64x2 = wasm_f64x2_splat(-1.88796008463073496563746e-05);
|
|
387
|
+
v128_t const coeff_18_f64x2 = wasm_f64x2_splat(+0.000209850076645816976906797);
|
|
388
|
+
v128_t const coeff_17_f64x2 = wasm_f64x2_splat(-0.00110611831486672482563471);
|
|
389
|
+
v128_t const coeff_16_f64x2 = wasm_f64x2_splat(+0.00370026744188713119232403);
|
|
390
|
+
v128_t const coeff_15_f64x2 = wasm_f64x2_splat(-0.00889896195887655491740809);
|
|
391
|
+
v128_t const coeff_14_f64x2 = wasm_f64x2_splat(+0.016599329773529201970117);
|
|
392
|
+
v128_t const coeff_13_f64x2 = wasm_f64x2_splat(-0.0254517624932312641616861);
|
|
393
|
+
v128_t const coeff_12_f64x2 = wasm_f64x2_splat(+0.0337852580001353069993897);
|
|
394
|
+
v128_t const coeff_11_f64x2 = wasm_f64x2_splat(-0.0407629191276836500001934);
|
|
395
|
+
v128_t const coeff_10_f64x2 = wasm_f64x2_splat(+0.0466667150077840625632675);
|
|
396
|
+
v128_t const coeff_9_f64x2 = wasm_f64x2_splat(-0.0523674852303482457616113);
|
|
397
|
+
v128_t const coeff_8_f64x2 = wasm_f64x2_splat(+0.0587666392926673580854313);
|
|
398
|
+
v128_t const coeff_7_f64x2 = wasm_f64x2_splat(-0.0666573579361080525984562);
|
|
399
|
+
v128_t const coeff_6_f64x2 = wasm_f64x2_splat(+0.0769219538311769618355029);
|
|
400
|
+
v128_t const coeff_5_f64x2 = wasm_f64x2_splat(-0.090908995008245008229153);
|
|
401
|
+
v128_t const coeff_4_f64x2 = wasm_f64x2_splat(+0.111111105648261418443745);
|
|
402
|
+
v128_t const coeff_3_f64x2 = wasm_f64x2_splat(-0.14285714266771329383765);
|
|
403
|
+
v128_t const coeff_2_f64x2 = wasm_f64x2_splat(+0.199999999996591265594148);
|
|
404
|
+
v128_t const coeff_1_f64x2 = wasm_f64x2_splat(-0.333333333333311110369124);
|
|
405
|
+
v128_t const half_pi_f64x2 = wasm_f64x2_splat(1.5707963267948966);
|
|
406
|
+
v128_t const zeros_f64x2 = wasm_f64x2_splat(0);
|
|
406
407
|
|
|
407
408
|
// Detect negative and take absolute value
|
|
408
|
-
v128_t
|
|
409
|
-
v128_t
|
|
409
|
+
v128_t negative_mask_f64x2 = wasm_f64x2_lt(inputs, zeros_f64x2);
|
|
410
|
+
v128_t values_f64x2 = wasm_f64x2_abs(inputs);
|
|
410
411
|
|
|
411
|
-
// Check if
|
|
412
|
-
v128_t
|
|
413
|
-
v128_t
|
|
412
|
+
// Check if values_f64x2 > 1 (need reciprocal) - use division for f64 precision
|
|
413
|
+
v128_t reciprocal_mask_f64x2 = wasm_f64x2_gt(values_f64x2, wasm_f64x2_splat(1.0));
|
|
414
|
+
v128_t reciprocal_values_f64x2 = wasm_f64x2_div(wasm_f64x2_splat(1.0), values_f64x2);
|
|
414
415
|
// relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
|
|
415
|
-
// Safe because mask is from comparison (all-ones or all-
|
|
416
|
-
|
|
416
|
+
// Safe because mask is from comparison (all-ones or all-zeros_f64x2 per lane).
|
|
417
|
+
values_f64x2 = wasm_i64x2_relaxed_laneselect(reciprocal_values_f64x2, values_f64x2, reciprocal_mask_f64x2);
|
|
417
418
|
|
|
418
419
|
// Compute powers
|
|
419
|
-
v128_t const
|
|
420
|
-
v128_t const
|
|
420
|
+
v128_t const values_squared_f64x2 = wasm_f64x2_mul(values_f64x2, values_f64x2);
|
|
421
|
+
v128_t const values_cubed_f64x2 = wasm_f64x2_mul(values_f64x2, values_squared_f64x2);
|
|
421
422
|
|
|
422
423
|
// Polynomial evaluation using Horner's method
|
|
423
|
-
v128_t
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
// Compute
|
|
444
|
-
v128_t
|
|
445
|
-
|
|
446
|
-
// Adjust for reciprocal:
|
|
447
|
-
v128_t
|
|
424
|
+
v128_t polynomials_f64x2 = coeff_19_f64x2;
|
|
425
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_18_f64x2);
|
|
426
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_17_f64x2);
|
|
427
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_16_f64x2);
|
|
428
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_15_f64x2);
|
|
429
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_14_f64x2);
|
|
430
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_13_f64x2);
|
|
431
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_12_f64x2);
|
|
432
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_11_f64x2);
|
|
433
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_10_f64x2);
|
|
434
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_9_f64x2);
|
|
435
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_8_f64x2);
|
|
436
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_7_f64x2);
|
|
437
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_6_f64x2);
|
|
438
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_5_f64x2);
|
|
439
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_4_f64x2);
|
|
440
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_3_f64x2);
|
|
441
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_2_f64x2);
|
|
442
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_1_f64x2);
|
|
443
|
+
|
|
444
|
+
// Compute result_f64x2
|
|
445
|
+
v128_t result_f64x2 = wasm_f64x2_relaxed_madd(values_cubed_f64x2, polynomials_f64x2, values_f64x2);
|
|
446
|
+
|
|
447
|
+
// Adjust for reciprocal: result_f64x2 = pi/2 - result_f64x2
|
|
448
|
+
v128_t adjusted_f64x2 = wasm_f64x2_sub(half_pi_f64x2, result_f64x2);
|
|
448
449
|
// relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
|
|
449
|
-
// Safe because mask is from comparison (all-ones or all-
|
|
450
|
-
|
|
450
|
+
// Safe because mask is from comparison (all-ones or all-zeros_f64x2 per lane).
|
|
451
|
+
result_f64x2 = wasm_i64x2_relaxed_laneselect(adjusted_f64x2, result_f64x2, reciprocal_mask_f64x2);
|
|
451
452
|
|
|
452
|
-
// Adjust for negative:
|
|
453
|
-
v128_t
|
|
453
|
+
// Adjust for negative: result_f64x2 = -result_f64x2
|
|
454
|
+
v128_t negated_f64x2 = wasm_f64x2_neg(result_f64x2);
|
|
454
455
|
// relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
|
|
455
|
-
// Safe because mask is from comparison (all-ones or all-
|
|
456
|
-
|
|
457
|
-
return
|
|
456
|
+
// Safe because mask is from comparison (all-ones or all-zeros_f64x2 per lane).
|
|
457
|
+
result_f64x2 = wasm_i64x2_relaxed_laneselect(negated_f64x2, result_f64x2, negative_mask_f64x2);
|
|
458
|
+
return result_f64x2;
|
|
458
459
|
}
|
|
459
460
|
|
|
460
461
|
NK_INTERNAL v128_t nk_f64x2_atan2_v128relaxed_(v128_t const ys_inputs, v128_t const xs_inputs) {
|
|
461
462
|
// Polynomial coefficients (same as atan)
|
|
462
|
-
v128_t const
|
|
463
|
-
v128_t const
|
|
464
|
-
v128_t const
|
|
465
|
-
v128_t const
|
|
466
|
-
v128_t const
|
|
467
|
-
v128_t const
|
|
468
|
-
v128_t const
|
|
469
|
-
v128_t const
|
|
470
|
-
v128_t const
|
|
471
|
-
v128_t const
|
|
472
|
-
v128_t const
|
|
473
|
-
v128_t const
|
|
474
|
-
v128_t const
|
|
475
|
-
v128_t const
|
|
476
|
-
v128_t const
|
|
477
|
-
v128_t const
|
|
478
|
-
v128_t const
|
|
479
|
-
v128_t const
|
|
480
|
-
v128_t const
|
|
481
|
-
v128_t const
|
|
482
|
-
v128_t const
|
|
483
|
-
v128_t const
|
|
463
|
+
v128_t const coeff_19_f64x2 = wasm_f64x2_splat(-1.88796008463073496563746e-05);
|
|
464
|
+
v128_t const coeff_18_f64x2 = wasm_f64x2_splat(+0.000209850076645816976906797);
|
|
465
|
+
v128_t const coeff_17_f64x2 = wasm_f64x2_splat(-0.00110611831486672482563471);
|
|
466
|
+
v128_t const coeff_16_f64x2 = wasm_f64x2_splat(+0.00370026744188713119232403);
|
|
467
|
+
v128_t const coeff_15_f64x2 = wasm_f64x2_splat(-0.00889896195887655491740809);
|
|
468
|
+
v128_t const coeff_14_f64x2 = wasm_f64x2_splat(+0.016599329773529201970117);
|
|
469
|
+
v128_t const coeff_13_f64x2 = wasm_f64x2_splat(-0.0254517624932312641616861);
|
|
470
|
+
v128_t const coeff_12_f64x2 = wasm_f64x2_splat(+0.0337852580001353069993897);
|
|
471
|
+
v128_t const coeff_11_f64x2 = wasm_f64x2_splat(-0.0407629191276836500001934);
|
|
472
|
+
v128_t const coeff_10_f64x2 = wasm_f64x2_splat(+0.0466667150077840625632675);
|
|
473
|
+
v128_t const coeff_9_f64x2 = wasm_f64x2_splat(-0.0523674852303482457616113);
|
|
474
|
+
v128_t const coeff_8_f64x2 = wasm_f64x2_splat(+0.0587666392926673580854313);
|
|
475
|
+
v128_t const coeff_7_f64x2 = wasm_f64x2_splat(-0.0666573579361080525984562);
|
|
476
|
+
v128_t const coeff_6_f64x2 = wasm_f64x2_splat(+0.0769219538311769618355029);
|
|
477
|
+
v128_t const coeff_5_f64x2 = wasm_f64x2_splat(-0.090908995008245008229153);
|
|
478
|
+
v128_t const coeff_4_f64x2 = wasm_f64x2_splat(+0.111111105648261418443745);
|
|
479
|
+
v128_t const coeff_3_f64x2 = wasm_f64x2_splat(-0.14285714266771329383765);
|
|
480
|
+
v128_t const coeff_2_f64x2 = wasm_f64x2_splat(+0.199999999996591265594148);
|
|
481
|
+
v128_t const coeff_1_f64x2 = wasm_f64x2_splat(-0.333333333333311110369124);
|
|
482
|
+
v128_t const pi_f64x2 = wasm_f64x2_splat(3.14159265358979323846);
|
|
483
|
+
v128_t const half_pi_f64x2 = wasm_f64x2_splat(1.5707963267948966);
|
|
484
|
+
v128_t const zeros_f64x2 = wasm_f64x2_splat(0);
|
|
484
485
|
|
|
485
486
|
// Quadrant adjustments - take absolute values
|
|
486
|
-
v128_t
|
|
487
|
-
v128_t
|
|
488
|
-
v128_t
|
|
487
|
+
v128_t xs_negative_mask_f64x2 = wasm_f64x2_lt(xs_inputs, zeros_f64x2);
|
|
488
|
+
v128_t xs_f64x2 = wasm_f64x2_abs(xs_inputs);
|
|
489
|
+
v128_t ys_f64x2 = wasm_f64x2_abs(ys_inputs);
|
|
489
490
|
|
|
490
491
|
// Ensure proper fraction where numerator < denominator
|
|
491
|
-
v128_t
|
|
492
|
-
v128_t
|
|
492
|
+
v128_t swap_mask_f64x2 = wasm_f64x2_gt(ys_f64x2, xs_f64x2);
|
|
493
|
+
v128_t temps_f64x2 = xs_f64x2;
|
|
493
494
|
// relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
|
|
494
|
-
// Safe because mask is from comparison (all-ones or all-
|
|
495
|
-
|
|
496
|
-
|
|
495
|
+
// Safe because mask is from comparison (all-ones or all-zeros_f64x2 per lane).
|
|
496
|
+
xs_f64x2 = wasm_i64x2_relaxed_laneselect(ys_f64x2, xs_f64x2, swap_mask_f64x2);
|
|
497
|
+
ys_f64x2 = wasm_i64x2_relaxed_laneselect(wasm_f64x2_neg(temps_f64x2), ys_f64x2, swap_mask_f64x2);
|
|
497
498
|
|
|
498
499
|
// Division for f64 precision
|
|
499
|
-
v128_t const
|
|
500
|
-
v128_t const
|
|
501
|
-
v128_t const
|
|
500
|
+
v128_t const ratio_f64x2 = wasm_f64x2_div(ys_f64x2, xs_f64x2);
|
|
501
|
+
v128_t const ratio_squared_f64x2 = wasm_f64x2_mul(ratio_f64x2, ratio_f64x2);
|
|
502
|
+
v128_t const ratio_cubed_f64x2 = wasm_f64x2_mul(ratio_f64x2, ratio_squared_f64x2);
|
|
502
503
|
|
|
503
504
|
// Polynomial evaluation using Horner's method
|
|
504
|
-
v128_t
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
505
|
+
v128_t polynomials_f64x2 = coeff_19_f64x2;
|
|
506
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_18_f64x2);
|
|
507
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_17_f64x2);
|
|
508
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_16_f64x2);
|
|
509
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_15_f64x2);
|
|
510
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_14_f64x2);
|
|
511
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_13_f64x2);
|
|
512
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_12_f64x2);
|
|
513
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_11_f64x2);
|
|
514
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_10_f64x2);
|
|
515
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_9_f64x2);
|
|
516
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_8_f64x2);
|
|
517
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_7_f64x2);
|
|
518
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_6_f64x2);
|
|
519
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_5_f64x2);
|
|
520
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_4_f64x2);
|
|
521
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_3_f64x2);
|
|
522
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_2_f64x2);
|
|
523
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_1_f64x2);
|
|
523
524
|
|
|
524
525
|
// Compute the result
|
|
525
|
-
v128_t
|
|
526
|
+
v128_t results_f64x2 = wasm_f64x2_relaxed_madd(ratio_cubed_f64x2, polynomials_f64x2, ratio_f64x2);
|
|
526
527
|
|
|
527
|
-
// Compute
|
|
528
|
+
// Compute quadrant_f64x2 value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
|
|
528
529
|
// -2 for x<0 && !swap, -1 for x<0 && swap
|
|
529
|
-
v128_t
|
|
530
|
-
v128_t
|
|
530
|
+
v128_t quadrant_f64x2 = wasm_f64x2_splat(0.0);
|
|
531
|
+
v128_t neg_two_f64x2 = wasm_f64x2_splat(-2.0);
|
|
531
532
|
// relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
|
|
532
|
-
// Safe because mask is from comparison (all-ones or all-
|
|
533
|
-
|
|
534
|
-
v128_t
|
|
535
|
-
|
|
533
|
+
// Safe because mask is from comparison (all-ones or all-zeros_f64x2 per lane).
|
|
534
|
+
quadrant_f64x2 = wasm_i64x2_relaxed_laneselect(neg_two_f64x2, quadrant_f64x2, xs_negative_mask_f64x2);
|
|
535
|
+
v128_t quadrant_incremented_f64x2 = wasm_f64x2_add(quadrant_f64x2, wasm_f64x2_splat(1.0));
|
|
536
|
+
quadrant_f64x2 = wasm_i64x2_relaxed_laneselect(quadrant_incremented_f64x2, quadrant_f64x2, swap_mask_f64x2);
|
|
536
537
|
|
|
537
|
-
// Adjust for
|
|
538
|
-
|
|
538
|
+
// Adjust for quadrant_f64x2: result += quadrant_f64x2 * pi_f64x2/2
|
|
539
|
+
results_f64x2 = wasm_f64x2_relaxed_madd(quadrant_f64x2, half_pi_f64x2, results_f64x2);
|
|
539
540
|
|
|
540
541
|
// Transfer sign from x and y by XOR with sign bits
|
|
541
|
-
v128_t
|
|
542
|
-
v128_t
|
|
543
|
-
v128_t
|
|
544
|
-
|
|
545
|
-
|
|
542
|
+
v128_t sign_mask_f64x2 = wasm_f64x2_splat(-0.0);
|
|
543
|
+
v128_t xs_sign_f64x2 = wasm_v128_and(xs_inputs, sign_mask_f64x2);
|
|
544
|
+
v128_t ys_sign_f64x2 = wasm_v128_and(ys_inputs, sign_mask_f64x2);
|
|
545
|
+
results_f64x2 = wasm_v128_xor(results_f64x2, xs_sign_f64x2);
|
|
546
|
+
results_f64x2 = wasm_v128_xor(results_f64x2, ys_sign_f64x2);
|
|
546
547
|
|
|
547
|
-
return
|
|
548
|
+
return results_f64x2;
|
|
548
549
|
}
|
|
549
550
|
|
|
550
551
|
/* NK_PUBLIC wrappers — same loop+tail pattern as neon.h.
|
|
@@ -555,9 +556,9 @@ NK_INTERNAL v128_t nk_f64x2_atan2_v128relaxed_(v128_t const ys_inputs, v128_t co
|
|
|
555
556
|
NK_PUBLIC void nk_each_sin_f32_v128relaxed(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
|
|
556
557
|
nk_size_t i = 0;
|
|
557
558
|
for (; i + 4 <= n; i += 4) {
|
|
558
|
-
v128_t
|
|
559
|
-
v128_t
|
|
560
|
-
wasm_v128_store(outs + i,
|
|
559
|
+
v128_t angles_f32x4 = wasm_v128_load(ins + i);
|
|
560
|
+
v128_t results_f32x4 = nk_f32x4_sin_v128relaxed_(angles_f32x4);
|
|
561
|
+
wasm_v128_store(outs + i, results_f32x4);
|
|
561
562
|
}
|
|
562
563
|
if (i < n) {
|
|
563
564
|
nk_size_t remaining = n - i;
|
|
@@ -572,9 +573,9 @@ NK_PUBLIC void nk_each_sin_f32_v128relaxed(nk_f32_t const *ins, nk_size_t n, nk_
|
|
|
572
573
|
NK_PUBLIC void nk_each_cos_f32_v128relaxed(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
|
|
573
574
|
nk_size_t i = 0;
|
|
574
575
|
for (; i + 4 <= n; i += 4) {
|
|
575
|
-
v128_t
|
|
576
|
-
v128_t
|
|
577
|
-
wasm_v128_store(outs + i,
|
|
576
|
+
v128_t angles_f32x4 = wasm_v128_load(ins + i);
|
|
577
|
+
v128_t results_f32x4 = nk_f32x4_cos_v128relaxed_(angles_f32x4);
|
|
578
|
+
wasm_v128_store(outs + i, results_f32x4);
|
|
578
579
|
}
|
|
579
580
|
if (i < n) {
|
|
580
581
|
nk_size_t remaining = n - i;
|
|
@@ -589,9 +590,9 @@ NK_PUBLIC void nk_each_cos_f32_v128relaxed(nk_f32_t const *ins, nk_size_t n, nk_
|
|
|
589
590
|
NK_PUBLIC void nk_each_atan_f32_v128relaxed(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
|
|
590
591
|
nk_size_t i = 0;
|
|
591
592
|
for (; i + 4 <= n; i += 4) {
|
|
592
|
-
v128_t
|
|
593
|
-
v128_t
|
|
594
|
-
wasm_v128_store(outs + i,
|
|
593
|
+
v128_t values_f32x4 = wasm_v128_load(ins + i);
|
|
594
|
+
v128_t results_f32x4 = nk_f32x4_atan_v128relaxed_(values_f32x4);
|
|
595
|
+
wasm_v128_store(outs + i, results_f32x4);
|
|
595
596
|
}
|
|
596
597
|
if (i < n) {
|
|
597
598
|
nk_size_t remaining = n - i;
|
|
@@ -606,9 +607,9 @@ NK_PUBLIC void nk_each_atan_f32_v128relaxed(nk_f32_t const *ins, nk_size_t n, nk
|
|
|
606
607
|
NK_PUBLIC void nk_each_sin_f64_v128relaxed(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
|
|
607
608
|
nk_size_t i = 0;
|
|
608
609
|
for (; i + 2 <= n; i += 2) {
|
|
609
|
-
v128_t
|
|
610
|
-
v128_t
|
|
611
|
-
wasm_v128_store(outs + i,
|
|
610
|
+
v128_t angles_f64x2 = wasm_v128_load(ins + i);
|
|
611
|
+
v128_t results_f64x2 = nk_f64x2_sin_v128relaxed_(angles_f64x2);
|
|
612
|
+
wasm_v128_store(outs + i, results_f64x2);
|
|
612
613
|
}
|
|
613
614
|
if (i < n) {
|
|
614
615
|
nk_size_t remaining = n - i;
|
|
@@ -623,9 +624,9 @@ NK_PUBLIC void nk_each_sin_f64_v128relaxed(nk_f64_t const *ins, nk_size_t n, nk_
|
|
|
623
624
|
NK_PUBLIC void nk_each_cos_f64_v128relaxed(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
|
|
624
625
|
nk_size_t i = 0;
|
|
625
626
|
for (; i + 2 <= n; i += 2) {
|
|
626
|
-
v128_t
|
|
627
|
-
v128_t
|
|
628
|
-
wasm_v128_store(outs + i,
|
|
627
|
+
v128_t angles_f64x2 = wasm_v128_load(ins + i);
|
|
628
|
+
v128_t results_f64x2 = nk_f64x2_cos_v128relaxed_(angles_f64x2);
|
|
629
|
+
wasm_v128_store(outs + i, results_f64x2);
|
|
629
630
|
}
|
|
630
631
|
if (i < n) {
|
|
631
632
|
nk_size_t remaining = n - i;
|
|
@@ -640,9 +641,9 @@ NK_PUBLIC void nk_each_cos_f64_v128relaxed(nk_f64_t const *ins, nk_size_t n, nk_
|
|
|
640
641
|
NK_PUBLIC void nk_each_atan_f64_v128relaxed(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
|
|
641
642
|
nk_size_t i = 0;
|
|
642
643
|
for (; i + 2 <= n; i += 2) {
|
|
643
|
-
v128_t
|
|
644
|
-
v128_t
|
|
645
|
-
wasm_v128_store(outs + i,
|
|
644
|
+
v128_t values_f64x2 = wasm_v128_load(ins + i);
|
|
645
|
+
v128_t results_f64x2 = nk_f64x2_atan_v128relaxed_(values_f64x2);
|
|
646
|
+
wasm_v128_store(outs + i, results_f64x2);
|
|
646
647
|
}
|
|
647
648
|
if (i < n) {
|
|
648
649
|
nk_size_t remaining = n - i;
|