numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315)
  1. package/README.md +197 -124
  2. package/binding.gyp +34 -484
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -9,12 +9,12 @@
9
9
  *
10
10
  * @section haswell_trig_instructions Key AVX2 Trigonometry Instructions
11
11
  *
12
- * Intrinsic Instruction Latency Throughput Ports
13
- * _mm256_fmadd_ps/pd VFMADD (YMM, YMM, YMM) 5cy 0.5/cy p01
14
- * _mm256_mul_ps/pd VMULPS/PD (YMM, YMM, YMM) 5cy 0.5/cy p01
15
- * _mm256_blendv_ps/pd VBLENDVPS/PD (YMM, YMM, YMM) 2cy 1/cy p015
16
- * _mm256_round_ps/pd VROUNDPS/PD (YMM, YMM, I8) 6cy 1/cy p01
17
- * _mm256_div_ps VDIVPS (YMM, YMM, YMM) 13cy 5/cy p0
12
+ * Intrinsic Instruction Haswell Genoa
13
+ * _mm256_fmadd_ps/pd VFMADD (YMM, YMM, YMM) 5cy @ p01 4cy @ p01
14
+ * _mm256_mul_ps/pd VMULPS/PD (YMM, YMM, YMM) 5cy @ p01 3cy @ p01
15
+ * _mm256_blendv_ps/pd VBLENDVPS/PD (YMM, YMM, YMM) 2cy @ p015 1cy @ p01
16
+ * _mm256_round_ps/pd VROUNDPS/PD (YMM, YMM, I8) 6cy @ p01 3cy @ p23
17
+ * _mm256_div_ps VDIVPS (YMM, YMM, YMM) 13cy @ p0 11cy @ p01
18
18
  *
19
19
  * Polynomial evaluation uses Horner's method with FMA for sin/cos/atan approximation. For large
20
20
  * arrays, out-of-order execution across loop iterations hides FMA latency better than Estrin's
@@ -46,501 +46,502 @@ extern "C" {
46
46
 
47
47
  NK_INTERNAL __m256 nk_sin_f32x8_haswell_(__m256 const angles_radians) {
48
48
  // Cody-Waite constants for argument reduction
49
- __m256 const pi_hi_f32x8 = _mm256_set1_ps(3.1415927f);
50
- __m256 const pi_lo_f32x8 = _mm256_set1_ps(-8.742278e-8f);
51
- __m256 const pi_reciprocal = _mm256_set1_ps(0.31830988618379067154f); // 1/π
49
+ __m256 const pi_high_f32x8 = _mm256_set1_ps(3.1415927f);
50
+ __m256 const pi_low_f32x8 = _mm256_set1_ps(-8.742278e-8f);
51
+ __m256 const pi_reciprocal_f32x8 = _mm256_set1_ps(0.31830988618379067154f); // 1/π
52
52
  // Degree-9 minimax coefficients
53
- __m256 const coeff_9 = _mm256_set1_ps(+2.7557319224e-6f);
54
- __m256 const coeff_7 = _mm256_set1_ps(-1.9841269841e-4f);
55
- __m256 const coeff_5 = _mm256_set1_ps(+8.3333293855e-3f);
56
- __m256 const coeff_3 = _mm256_set1_ps(-1.6666666641e-1f);
53
+ __m256 const coeff_9_f32x8 = _mm256_set1_ps(+2.7557319224e-6f);
54
+ __m256 const coeff_7_f32x8 = _mm256_set1_ps(-1.9841269841e-4f);
55
+ __m256 const coeff_5_f32x8 = _mm256_set1_ps(+8.3333293855e-3f);
56
+ __m256 const coeff_3_f32x8 = _mm256_set1_ps(-1.6666666641e-1f);
57
57
 
58
- // Compute (multiples_of_pi) = round(angle / π)
59
- __m256 quotients = _mm256_mul_ps(angles_radians, pi_reciprocal);
60
- __m256 rounded_quotients = _mm256_round_ps(quotients, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
61
- // Use truncation (MXCSR-independent) since rounded_quotients is already integer-valued
62
- __m256i multiples_of_pi = _mm256_cvttps_epi32(rounded_quotients);
58
+ // Compute (multiples_of_pi_i32x8) = round(angle / π)
59
+ __m256 quotients_f32x8 = _mm256_mul_ps(angles_radians, pi_reciprocal_f32x8);
60
+ __m256 rounded_quotients_f32x8 = _mm256_round_ps(quotients_f32x8, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
61
+ // Use truncation (MXCSR-independent) since rounded_quotients_f32x8 is already integer-valued
62
+ __m256i multiples_of_pi_i32x8 = _mm256_cvttps_epi32(rounded_quotients_f32x8);
63
63
 
64
64
  // Cody-Waite range reduction
65
- __m256 angles = _mm256_fnmadd_ps(rounded_quotients, pi_hi_f32x8, angles_radians);
66
- angles = _mm256_fnmadd_ps(rounded_quotients, pi_lo_f32x8, angles);
67
- __m256 const angles_squared = _mm256_mul_ps(angles, angles);
68
- __m256 const angles_cubed = _mm256_mul_ps(angles, angles_squared);
65
+ __m256 angles_f32x8 = _mm256_fnmadd_ps(rounded_quotients_f32x8, pi_high_f32x8, angles_radians);
66
+ angles_f32x8 = _mm256_fnmadd_ps(rounded_quotients_f32x8, pi_low_f32x8, angles_f32x8);
67
+ __m256 const angles_squared_f32x8 = _mm256_mul_ps(angles_f32x8, angles_f32x8);
68
+ __m256 const angles_cubed_f32x8 = _mm256_mul_ps(angles_f32x8, angles_squared_f32x8);
69
69
 
70
70
  // Degree-9 polynomial via Horner's method
71
- __m256 polynomials = coeff_9;
72
- polynomials = _mm256_fmadd_ps(polynomials, angles_squared, coeff_7);
73
- polynomials = _mm256_fmadd_ps(polynomials, angles_squared, coeff_5);
74
- polynomials = _mm256_fmadd_ps(polynomials, angles_squared, coeff_3);
75
- __m256 results = _mm256_fmadd_ps(angles_cubed, polynomials, angles);
76
-
77
- // If multiples_of_pi is odd, flip the sign of the results
78
- __m256i parity = _mm256_and_si256(multiples_of_pi, _mm256_set1_epi32(1));
79
- __m256i odd_mask = _mm256_cmpeq_epi32(parity, _mm256_set1_epi32(1));
80
- __m256 float_mask = _mm256_castsi256_ps(odd_mask);
81
- __m256 negated = _mm256_sub_ps(_mm256_setzero_ps(), results);
82
- results = _mm256_blendv_ps(results, negated, float_mask);
83
- return results;
71
+ __m256 polynomials_f32x8 = coeff_9_f32x8;
72
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, angles_squared_f32x8, coeff_7_f32x8);
73
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, angles_squared_f32x8, coeff_5_f32x8);
74
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, angles_squared_f32x8, coeff_3_f32x8);
75
+ __m256 results_f32x8 = _mm256_fmadd_ps(angles_cubed_f32x8, polynomials_f32x8, angles_f32x8);
76
+
77
+ // If multiples_of_pi_i32x8 is odd, flip the sign of the results_f32x8
78
+ __m256i parity_i32x8 = _mm256_and_si256(multiples_of_pi_i32x8, _mm256_set1_epi32(1));
79
+ __m256i odd_mask_i32x8 = _mm256_cmpeq_epi32(parity_i32x8, _mm256_set1_epi32(1));
80
+ __m256 float_mask_f32x8 = _mm256_castsi256_ps(odd_mask_i32x8);
81
+ __m256 negated_f32x8 = _mm256_sub_ps(_mm256_setzero_ps(), results_f32x8);
82
+ results_f32x8 = _mm256_blendv_ps(results_f32x8, negated_f32x8, float_mask_f32x8);
83
+ return results_f32x8;
84
84
  }
85
85
 
86
86
  NK_INTERNAL __m256 nk_cos_f32x8_haswell_(__m256 const angles_radians) {
87
87
  // Cody-Waite constants for argument reduction
88
- __m256 const pi_hi_f32x8 = _mm256_set1_ps(3.1415927f);
89
- __m256 const pi_lo_f32x8 = _mm256_set1_ps(-8.742278e-8f);
90
- __m256 const pi_half = _mm256_set1_ps(1.57079632679489661923f); // π/2
91
- __m256 const pi_reciprocal = _mm256_set1_ps(0.31830988618379067154f); // 1/π
88
+ __m256 const pi_high_f32x8 = _mm256_set1_ps(3.1415927f);
89
+ __m256 const pi_low_f32x8 = _mm256_set1_ps(-8.742278e-8f);
90
+ __m256 const pi_half_f32x8 = _mm256_set1_ps(1.57079632679489661923f); // π/2
91
+ __m256 const pi_reciprocal_f32x8 = _mm256_set1_ps(0.31830988618379067154f); // 1/π
92
92
  // Degree-9 minimax coefficients
93
- __m256 const coeff_9 = _mm256_set1_ps(+2.7557319224e-6f);
94
- __m256 const coeff_7 = _mm256_set1_ps(-1.9841269841e-4f);
95
- __m256 const coeff_5 = _mm256_set1_ps(+8.3333293855e-3f);
96
- __m256 const coeff_3 = _mm256_set1_ps(-1.6666666641e-1f);
93
+ __m256 const coeff_9_f32x8 = _mm256_set1_ps(+2.7557319224e-6f);
94
+ __m256 const coeff_7_f32x8 = _mm256_set1_ps(-1.9841269841e-4f);
95
+ __m256 const coeff_5_f32x8 = _mm256_set1_ps(+8.3333293855e-3f);
96
+ __m256 const coeff_3_f32x8 = _mm256_set1_ps(-1.6666666641e-1f);
97
97
 
98
- // Compute (multiples_of_pi) = round((angle / π) - 0.5)
99
- __m256 quotients = _mm256_fmsub_ps(angles_radians, pi_reciprocal, _mm256_set1_ps(0.5f));
100
- __m256 rounded_quotients = _mm256_round_ps(quotients, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
101
- // Use truncation (MXCSR-independent) since rounded_quotients is already integer-valued
102
- __m256i multiples_of_pi = _mm256_cvttps_epi32(rounded_quotients);
98
+ // Compute (multiples_of_pi_i32x8) = round((angle / π) - 0.5)
99
+ __m256 quotients_f32x8 = _mm256_fmsub_ps(angles_radians, pi_reciprocal_f32x8, _mm256_set1_ps(0.5f));
100
+ __m256 rounded_quotients_f32x8 = _mm256_round_ps(quotients_f32x8, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
101
+ // Use truncation (MXCSR-independent) since rounded_quotients_f32x8 is already integer-valued
102
+ __m256i multiples_of_pi_i32x8 = _mm256_cvttps_epi32(rounded_quotients_f32x8);
103
103
 
104
104
  // Cody-Waite range reduction: angle = angle_radians - (multiples * pi + pi/2)
105
- __m256 const offset = _mm256_fmadd_ps(rounded_quotients, pi_hi_f32x8, pi_half);
106
- __m256 angles = _mm256_sub_ps(angles_radians, offset);
107
- angles = _mm256_fnmadd_ps(rounded_quotients, pi_lo_f32x8, angles);
108
- __m256 const angles_squared = _mm256_mul_ps(angles, angles);
109
- __m256 const angles_cubed = _mm256_mul_ps(angles, angles_squared);
105
+ __m256 const offset_f32x8 = _mm256_fmadd_ps(rounded_quotients_f32x8, pi_high_f32x8, pi_half_f32x8);
106
+ __m256 angles_f32x8 = _mm256_sub_ps(angles_radians, offset_f32x8);
107
+ angles_f32x8 = _mm256_fnmadd_ps(rounded_quotients_f32x8, pi_low_f32x8, angles_f32x8);
108
+ __m256 const angles_squared_f32x8 = _mm256_mul_ps(angles_f32x8, angles_f32x8);
109
+ __m256 const angles_cubed_f32x8 = _mm256_mul_ps(angles_f32x8, angles_squared_f32x8);
110
110
 
111
111
  // Degree-9 polynomial via Horner's method
112
- __m256 polynomials = coeff_9;
113
- polynomials = _mm256_fmadd_ps(polynomials, angles_squared, coeff_7);
114
- polynomials = _mm256_fmadd_ps(polynomials, angles_squared, coeff_5);
115
- polynomials = _mm256_fmadd_ps(polynomials, angles_squared, coeff_3);
116
- __m256 results = _mm256_fmadd_ps(angles_cubed, polynomials, angles);
117
-
118
- // If multiples_of_pi is even, flip the sign of the results
119
- __m256i parity = _mm256_and_si256(multiples_of_pi, _mm256_set1_epi32(1));
120
- __m256i even_mask = _mm256_cmpeq_epi32(parity, _mm256_setzero_si256());
121
- __m256 float_mask = _mm256_castsi256_ps(even_mask);
122
- __m256 negated = _mm256_sub_ps(_mm256_setzero_ps(), results);
123
- results = _mm256_blendv_ps(results, negated, float_mask);
124
- return results;
112
+ __m256 polynomials_f32x8 = coeff_9_f32x8;
113
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, angles_squared_f32x8, coeff_7_f32x8);
114
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, angles_squared_f32x8, coeff_5_f32x8);
115
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, angles_squared_f32x8, coeff_3_f32x8);
116
+ __m256 results_f32x8 = _mm256_fmadd_ps(angles_cubed_f32x8, polynomials_f32x8, angles_f32x8);
117
+
118
+ // If multiples_of_pi_i32x8 is even, flip the sign of the results_f32x8
119
+ __m256i parity_i32x8 = _mm256_and_si256(multiples_of_pi_i32x8, _mm256_set1_epi32(1));
120
+ __m256i even_mask_i32x8 = _mm256_cmpeq_epi32(parity_i32x8, _mm256_setzero_si256());
121
+ __m256 float_mask_f32x8 = _mm256_castsi256_ps(even_mask_i32x8);
122
+ __m256 negated_f32x8 = _mm256_sub_ps(_mm256_setzero_ps(), results_f32x8);
123
+ results_f32x8 = _mm256_blendv_ps(results_f32x8, negated_f32x8, float_mask_f32x8);
124
+ return results_f32x8;
125
125
  }
126
126
 
127
127
  NK_INTERNAL __m256 nk_atan_f32x8_haswell_(__m256 const inputs) {
128
128
  // Polynomial coefficients for atan approximation (8 terms)
129
129
  // These coefficients approximate: atan(x) ≈ x + c8 × x³ + c7 × x⁵ + c6 × x⁷ + ... + c1 × x¹⁵
130
- __m256 const coeff_8 = _mm256_set1_ps(-0.333331018686294555664062f);
131
- __m256 const coeff_7 = _mm256_set1_ps(+0.199926957488059997558594f);
132
- __m256 const coeff_6 = _mm256_set1_ps(-0.142027363181114196777344f);
133
- __m256 const coeff_5 = _mm256_set1_ps(+0.106347933411598205566406f);
134
- __m256 const coeff_4 = _mm256_set1_ps(-0.0748900920152664184570312f);
135
- __m256 const coeff_3 = _mm256_set1_ps(+0.0425049886107444763183594f);
136
- __m256 const coeff_2 = _mm256_set1_ps(-0.0159569028764963150024414f);
137
- __m256 const coeff_1 = _mm256_set1_ps(+0.00282363896258175373077393f);
138
- __m256 const sign_mask = _mm256_set1_ps(-0.0f);
139
-
140
- // Adjust for quadrant - detect negative values
141
- __m256 values = inputs;
142
- __m256 negative_mask = _mm256_cmp_ps(values, _mm256_setzero_ps(), _CMP_LT_OS);
143
- values = _mm256_andnot_ps(sign_mask, values); // abs(values)
144
-
145
- // Check if values > 1 (need reciprocal)
146
- __m256 reciprocal_mask = _mm256_cmp_ps(values, _mm256_set1_ps(1.0f), _CMP_GT_OS);
147
- __m256 reciprocal_values = _mm256_div_ps(_mm256_set1_ps(1.0f), values);
148
- values = _mm256_blendv_ps(values, reciprocal_values, reciprocal_mask);
130
+ __m256 const coeff_8_f32x8 = _mm256_set1_ps(-0.333331018686294555664062f);
131
+ __m256 const coeff_7_f32x8 = _mm256_set1_ps(+0.199926957488059997558594f);
132
+ __m256 const coeff_6_f32x8 = _mm256_set1_ps(-0.142027363181114196777344f);
133
+ __m256 const coeff_5_f32x8 = _mm256_set1_ps(+0.106347933411598205566406f);
134
+ __m256 const coeff_4_f32x8 = _mm256_set1_ps(-0.0748900920152664184570312f);
135
+ __m256 const coeff_3_f32x8 = _mm256_set1_ps(+0.0425049886107444763183594f);
136
+ __m256 const coeff_2_f32x8 = _mm256_set1_ps(-0.0159569028764963150024414f);
137
+ __m256 const coeff_1_f32x8 = _mm256_set1_ps(+0.00282363896258175373077393f);
138
+ __m256 const sign_mask_f32x8 = _mm256_set1_ps(-0.0f);
139
+
140
+ // Adjust for quadrant - detect negative values_f32x8
141
+ __m256 values_f32x8 = inputs;
142
+ __m256 negative_mask_f32x8 = _mm256_cmp_ps(values_f32x8, _mm256_setzero_ps(), _CMP_LT_OS);
143
+ values_f32x8 = _mm256_andnot_ps(sign_mask_f32x8, values_f32x8); // abs(values_f32x8)
144
+
145
+ // Check if values_f32x8 > 1 (need reciprocal)
146
+ __m256 reciprocal_mask_f32x8 = _mm256_cmp_ps(values_f32x8, _mm256_set1_ps(1.0f), _CMP_GT_OS);
147
+ __m256 reciprocal_values_f32x8 = _mm256_div_ps(_mm256_set1_ps(1.0f), values_f32x8);
148
+ values_f32x8 = _mm256_blendv_ps(values_f32x8, reciprocal_values_f32x8, reciprocal_mask_f32x8);
149
149
 
150
150
  // Argument reduction
151
- __m256 const values_squared = _mm256_mul_ps(values, values);
152
- __m256 const values_cubed = _mm256_mul_ps(values, values_squared);
151
+ __m256 const values_squared_f32x8 = _mm256_mul_ps(values_f32x8, values_f32x8);
152
+ __m256 const values_cubed_f32x8 = _mm256_mul_ps(values_f32x8, values_squared_f32x8);
153
153
 
154
154
  // Polynomial evaluation using Horner's method.
155
155
  // For large arrays, out-of-order execution across loop iterations already hides
156
156
  // FMA latency. Estrin's scheme was tested but showed ~20% regression because
157
157
  // the extra power computations (y², y⁴) hurt throughput more than the reduced
158
158
  // dependency depth helps latency.
159
- __m256 polynomials = coeff_1;
160
- polynomials = _mm256_fmadd_ps(polynomials, values_squared, coeff_2);
161
- polynomials = _mm256_fmadd_ps(polynomials, values_squared, coeff_3);
162
- polynomials = _mm256_fmadd_ps(polynomials, values_squared, coeff_4);
163
- polynomials = _mm256_fmadd_ps(polynomials, values_squared, coeff_5);
164
- polynomials = _mm256_fmadd_ps(polynomials, values_squared, coeff_6);
165
- polynomials = _mm256_fmadd_ps(polynomials, values_squared, coeff_7);
166
- polynomials = _mm256_fmadd_ps(polynomials, values_squared, coeff_8);
167
-
168
- // Compute result: atan(x) ≈ x + x³ * P(x²)
169
- __m256 result = _mm256_fmadd_ps(values_cubed, polynomials, values);
170
-
171
- // Adjust for reciprocal: result = π/2 - result
172
- __m256 adjusted = _mm256_sub_ps(_mm256_set1_ps(1.5707963267948966f), result);
173
- result = _mm256_blendv_ps(result, adjusted, reciprocal_mask);
174
-
175
- // Adjust for negative: result = -result
176
- __m256 negated = _mm256_sub_ps(_mm256_setzero_ps(), result);
177
- result = _mm256_blendv_ps(result, negated, negative_mask);
178
- return result;
159
+ __m256 polynomials_f32x8 = coeff_1_f32x8;
160
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, values_squared_f32x8, coeff_2_f32x8);
161
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, values_squared_f32x8, coeff_3_f32x8);
162
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, values_squared_f32x8, coeff_4_f32x8);
163
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, values_squared_f32x8, coeff_5_f32x8);
164
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, values_squared_f32x8, coeff_6_f32x8);
165
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, values_squared_f32x8, coeff_7_f32x8);
166
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, values_squared_f32x8, coeff_8_f32x8);
167
+
168
+ // Compute result_f32x8: atan(x) ≈ x + x³ * P(x²)
169
+ __m256 result_f32x8 = _mm256_fmadd_ps(values_cubed_f32x8, polynomials_f32x8, values_f32x8);
170
+
171
+ // Adjust for reciprocal: result_f32x8 = π/2 - result_f32x8
172
+ __m256 adjusted_f32x8 = _mm256_sub_ps(_mm256_set1_ps(1.5707963267948966f), result_f32x8);
173
+ result_f32x8 = _mm256_blendv_ps(result_f32x8, adjusted_f32x8, reciprocal_mask_f32x8);
174
+
175
+ // Adjust for negative: result_f32x8 = -result_f32x8
176
+ __m256 negated_f32x8 = _mm256_sub_ps(_mm256_setzero_ps(), result_f32x8);
177
+ result_f32x8 = _mm256_blendv_ps(result_f32x8, negated_f32x8, negative_mask_f32x8);
178
+ return result_f32x8;
179
179
  }
180
180
 
181
181
  NK_INTERNAL __m256 nk_atan2_f32x8_haswell_(__m256 const ys_inputs, __m256 const xs_inputs) {
182
182
  // Polynomial coefficients (same as atan)
183
- __m256 const coeff_8 = _mm256_set1_ps(-0.333331018686294555664062f);
184
- __m256 const coeff_7 = _mm256_set1_ps(+0.199926957488059997558594f);
185
- __m256 const coeff_6 = _mm256_set1_ps(-0.142027363181114196777344f);
186
- __m256 const coeff_5 = _mm256_set1_ps(+0.106347933411598205566406f);
187
- __m256 const coeff_4 = _mm256_set1_ps(-0.0748900920152664184570312f);
188
- __m256 const coeff_3 = _mm256_set1_ps(+0.0425049886107444763183594f);
189
- __m256 const coeff_2 = _mm256_set1_ps(-0.0159569028764963150024414f);
190
- __m256 const coeff_1 = _mm256_set1_ps(+0.00282363896258175373077393f);
191
- __m256 const sign_mask = _mm256_set1_ps(-0.0f);
183
+ __m256 const coeff_8_f32x8 = _mm256_set1_ps(-0.333331018686294555664062f);
184
+ __m256 const coeff_7_f32x8 = _mm256_set1_ps(+0.199926957488059997558594f);
185
+ __m256 const coeff_6_f32x8 = _mm256_set1_ps(-0.142027363181114196777344f);
186
+ __m256 const coeff_5_f32x8 = _mm256_set1_ps(+0.106347933411598205566406f);
187
+ __m256 const coeff_4_f32x8 = _mm256_set1_ps(-0.0748900920152664184570312f);
188
+ __m256 const coeff_3_f32x8 = _mm256_set1_ps(+0.0425049886107444763183594f);
189
+ __m256 const coeff_2_f32x8 = _mm256_set1_ps(-0.0159569028764963150024414f);
190
+ __m256 const coeff_1_f32x8 = _mm256_set1_ps(+0.00282363896258175373077393f);
191
+ __m256 const sign_mask_f32x8 = _mm256_set1_ps(-0.0f);
192
192
 
193
193
  // Quadrant adjustments normalizing to absolute values of x and y
194
- __m256 xs_negative_mask = _mm256_cmp_ps(xs_inputs, _mm256_setzero_ps(), _CMP_LT_OS);
195
- __m256 xs = _mm256_andnot_ps(sign_mask, xs_inputs); // abs(xs_inputs)
196
- __m256 ys = _mm256_andnot_ps(sign_mask, ys_inputs); // abs(ys_inputs)
194
+ __m256 xs_negative_mask_f32x8 = _mm256_cmp_ps(xs_inputs, _mm256_setzero_ps(), _CMP_LT_OS);
195
+ __m256 xs_f32x8 = _mm256_andnot_ps(sign_mask_f32x8, xs_inputs); // abs(xs_inputs)
196
+ __m256 ys_f32x8 = _mm256_andnot_ps(sign_mask_f32x8, ys_inputs); // abs(ys_inputs)
197
197
 
198
198
  // Ensure proper fraction where the numerator is smaller than the denominator
199
- __m256 swap_mask = _mm256_cmp_ps(ys, xs, _CMP_GT_OS);
200
- __m256 temps = xs;
201
- xs = _mm256_blendv_ps(xs, ys, swap_mask);
202
- __m256 neg_temps = _mm256_sub_ps(_mm256_setzero_ps(), temps);
203
- ys = _mm256_blendv_ps(ys, neg_temps, swap_mask);
199
+ __m256 swap_mask_f32x8 = _mm256_cmp_ps(ys_f32x8, xs_f32x8, _CMP_GT_OS);
200
+ __m256 temps_f32x8 = xs_f32x8;
201
+ xs_f32x8 = _mm256_blendv_ps(xs_f32x8, ys_f32x8, swap_mask_f32x8);
202
+ __m256 neg_temps_f32x8 = _mm256_sub_ps(_mm256_setzero_ps(), temps_f32x8);
203
+ ys_f32x8 = _mm256_blendv_ps(ys_f32x8, neg_temps_f32x8, swap_mask_f32x8);
204
204
 
205
- // Compute ratio and powers
206
- __m256 const ratio = _mm256_div_ps(ys, xs);
207
- __m256 const ratio_squared = _mm256_mul_ps(ratio, ratio);
208
- __m256 const ratio_cubed = _mm256_mul_ps(ratio, ratio_squared);
205
+ // Compute ratio_f32x8 and powers
206
+ __m256 const ratio_f32x8 = _mm256_div_ps(ys_f32x8, xs_f32x8);
207
+ __m256 const ratio_squared_f32x8 = _mm256_mul_ps(ratio_f32x8, ratio_f32x8);
208
+ __m256 const ratio_cubed_f32x8 = _mm256_mul_ps(ratio_f32x8, ratio_squared_f32x8);
209
209
 
210
210
  // Polynomial evaluation using Horner's method
211
- __m256 polynomials = coeff_1;
212
- polynomials = _mm256_fmadd_ps(polynomials, ratio_squared, coeff_2);
213
- polynomials = _mm256_fmadd_ps(polynomials, ratio_squared, coeff_3);
214
- polynomials = _mm256_fmadd_ps(polynomials, ratio_squared, coeff_4);
215
- polynomials = _mm256_fmadd_ps(polynomials, ratio_squared, coeff_5);
216
- polynomials = _mm256_fmadd_ps(polynomials, ratio_squared, coeff_6);
217
- polynomials = _mm256_fmadd_ps(polynomials, ratio_squared, coeff_7);
218
- polynomials = _mm256_fmadd_ps(polynomials, ratio_squared, coeff_8);
219
-
220
- // Compute the result using masks for quadrant adjustments
221
- __m256 results = _mm256_fmadd_ps(ratio_cubed, polynomials, ratio);
222
-
223
- // Compute quadrant value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
211
+ __m256 polynomials_f32x8 = coeff_1_f32x8;
212
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, ratio_squared_f32x8, coeff_2_f32x8);
213
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, ratio_squared_f32x8, coeff_3_f32x8);
214
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, ratio_squared_f32x8, coeff_4_f32x8);
215
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, ratio_squared_f32x8, coeff_5_f32x8);
216
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, ratio_squared_f32x8, coeff_6_f32x8);
217
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, ratio_squared_f32x8, coeff_7_f32x8);
218
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, ratio_squared_f32x8, coeff_8_f32x8);
219
+
220
+ // Compute the result using masks for quadrant_f32x8 adjustments
221
+ __m256 results_f32x8 = _mm256_fmadd_ps(ratio_cubed_f32x8, polynomials_f32x8, ratio_f32x8);
222
+
223
+ // Compute quadrant_f32x8 value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
224
224
  // -2 for x<0 && !swap, -1 for x<0 && swap
225
- __m256 quadrant = _mm256_setzero_ps();
226
- __m256 neg_two = _mm256_set1_ps(-2.0f);
227
- quadrant = _mm256_blendv_ps(quadrant, neg_two, xs_negative_mask);
228
- __m256 one = _mm256_set1_ps(1.0f);
229
- __m256 quadrant_incremented = _mm256_add_ps(quadrant, one);
230
- quadrant = _mm256_blendv_ps(quadrant, quadrant_incremented, swap_mask);
225
+ __m256 quadrant_f32x8 = _mm256_setzero_ps();
226
+ __m256 neg_two_f32x8 = _mm256_set1_ps(-2.0f);
227
+ quadrant_f32x8 = _mm256_blendv_ps(quadrant_f32x8, neg_two_f32x8, xs_negative_mask_f32x8);
228
+ __m256 one_f32x8 = _mm256_set1_ps(1.0f);
229
+ __m256 quadrant_incremented_f32x8 = _mm256_add_ps(quadrant_f32x8, one_f32x8);
230
+ quadrant_f32x8 = _mm256_blendv_ps(quadrant_f32x8, quadrant_incremented_f32x8, swap_mask_f32x8);
231
231
 
232
- // Adjust for quadrant: result += quadrant * π/2
233
- __m256 pi_half = _mm256_set1_ps(1.5707963267948966f);
234
- results = _mm256_fmadd_ps(quadrant, pi_half, results);
232
+ // Adjust for quadrant_f32x8: result += quadrant_f32x8 * π/2
233
+ __m256 pi_half_f32x8 = _mm256_set1_ps(1.5707963267948966f);
234
+ results_f32x8 = _mm256_fmadd_ps(quadrant_f32x8, pi_half_f32x8, results_f32x8);
235
235
 
236
236
  // Transfer sign from x (XOR with sign bit of x_input)
237
- __m256 xs_sign_bits = _mm256_and_ps(xs_inputs, sign_mask);
238
- results = _mm256_xor_ps(results, xs_sign_bits);
237
+ __m256 xs_sign_bits_f32x8 = _mm256_and_ps(xs_inputs, sign_mask_f32x8);
238
+ results_f32x8 = _mm256_xor_ps(results_f32x8, xs_sign_bits_f32x8);
239
239
 
240
240
  // Transfer sign from y (XOR with sign bit of y_input)
241
- __m256 ys_sign_bits = _mm256_and_ps(ys_inputs, sign_mask);
242
- results = _mm256_xor_ps(results, ys_sign_bits);
241
+ __m256 ys_sign_bits_f32x8 = _mm256_and_ps(ys_inputs, sign_mask_f32x8);
242
+ results_f32x8 = _mm256_xor_ps(results_f32x8, ys_sign_bits_f32x8);
243
243
 
244
- return results;
244
+ return results_f32x8;
245
245
  }
246
246
 
247
247
  NK_INTERNAL __m256d nk_sin_f64x4_haswell_(__m256d const angles_radians) {
248
248
  // Constants for argument reduction
249
- __m256d const pi_high = _mm256_set1_pd(3.141592653589793116); // High-digits part of π
250
- __m256d const pi_low = _mm256_set1_pd(1.2246467991473532072e-16); // Low-digits part of π
251
- __m256d const pi_reciprocal = _mm256_set1_pd(0.31830988618379067154); // 1/π
249
+ __m256d const pi_high_f64x4 = _mm256_set1_pd(3.141592653589793116); // High-digits part of π
250
+ __m256d const pi_low_f64x4 = _mm256_set1_pd(1.2246467991473532072e-16); // Low-digits part of π
251
+ __m256d const pi_reciprocal_f64x4 = _mm256_set1_pd(0.31830988618379067154); // 1/π
252
252
 
253
253
  // Polynomial coefficients for sine approximation (minimax polynomial)
254
- __m256d const coeff_0 = _mm256_set1_pd(+0.00833333333333332974823815);
255
- __m256d const coeff_1 = _mm256_set1_pd(-0.000198412698412696162806809);
256
- __m256d const coeff_2 = _mm256_set1_pd(+2.75573192239198747630416e-06);
257
- __m256d const coeff_3 = _mm256_set1_pd(-2.50521083763502045810755e-08);
258
- __m256d const coeff_4 = _mm256_set1_pd(+1.60590430605664501629054e-10);
259
- __m256d const coeff_5 = _mm256_set1_pd(-7.64712219118158833288484e-13);
260
- __m256d const coeff_6 = _mm256_set1_pd(+2.81009972710863200091251e-15);
261
- __m256d const coeff_7 = _mm256_set1_pd(-7.97255955009037868891952e-18);
262
- __m256d const coeff_8 = _mm256_set1_pd(-0.166666666666666657414808);
263
-
264
- // Compute (rounded_quotients) = round(angle / π)
265
- __m256d const quotients = _mm256_mul_pd(angles_radians, pi_reciprocal);
266
- __m256d const rounded_quotients = _mm256_round_pd(quotients, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
267
-
268
- // Reduce the angle: angle - (rounded_quotients * π_high + rounded_quotients * π_low)
269
- __m256d angles = angles_radians;
270
- angles = _mm256_fnmadd_pd(rounded_quotients, pi_high, angles);
271
- angles = _mm256_fnmadd_pd(rounded_quotients, pi_low, angles);
272
-
273
- // If rounded_quotients is odd (bit 0 set), negate the angle
254
+ __m256d const coeff_0_f64x4 = _mm256_set1_pd(+0.00833333333333332974823815);
255
+ __m256d const coeff_1_f64x4 = _mm256_set1_pd(-0.000198412698412696162806809);
256
+ __m256d const coeff_2_f64x4 = _mm256_set1_pd(+2.75573192239198747630416e-06);
257
+ __m256d const coeff_3_f64x4 = _mm256_set1_pd(-2.50521083763502045810755e-08);
258
+ __m256d const coeff_4_f64x4 = _mm256_set1_pd(+1.60590430605664501629054e-10);
259
+ __m256d const coeff_5_f64x4 = _mm256_set1_pd(-7.64712219118158833288484e-13);
260
+ __m256d const coeff_6_f64x4 = _mm256_set1_pd(+2.81009972710863200091251e-15);
261
+ __m256d const coeff_7_f64x4 = _mm256_set1_pd(-7.97255955009037868891952e-18);
262
+ __m256d const coeff_8_f64x4 = _mm256_set1_pd(-0.166666666666666657414808);
263
+
264
+ // Compute (rounded_quotients_f64x4) = round(angle / π)
265
+ __m256d const quotients_f64x4 = _mm256_mul_pd(angles_radians, pi_reciprocal_f64x4);
266
+ __m256d const rounded_quotients_f64x4 = _mm256_round_pd(quotients_f64x4,
267
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
268
+
269
+ // Reduce the angle: angle - (rounded_quotients_f64x4 * π_high + rounded_quotients_f64x4 * π_low)
270
+ __m256d angles_f64x4 = angles_radians;
271
+ angles_f64x4 = _mm256_fnmadd_pd(rounded_quotients_f64x4, pi_high_f64x4, angles_f64x4);
272
+ angles_f64x4 = _mm256_fnmadd_pd(rounded_quotients_f64x4, pi_low_f64x4, angles_f64x4);
273
+
274
+ // If rounded_quotients_f64x4 is odd (bit 0 set), negate the angle
274
275
  // Convert to 32-bit int (returns __m128i with 4 x 32-bit ints)
275
- // Use truncation (MXCSR-independent) since rounded_quotients is already integer-valued
276
- __m128i quotients_i32 = _mm256_cvttpd_epi32(rounded_quotients);
277
- __m128i parity = _mm_and_si128(quotients_i32, _mm_set1_epi32(1));
278
- __m128i odd_mask_i32 = _mm_cmpeq_epi32(parity, _mm_set1_epi32(1));
276
+ // Use truncation (MXCSR-independent) since rounded_quotients_f64x4 is already integer-valued
277
+ __m128i quotients_i32_i32x4 = _mm256_cvttpd_epi32(rounded_quotients_f64x4);
278
+ __m128i parity_i32x4 = _mm_and_si128(quotients_i32_i32x4, _mm_set1_epi32(1));
279
+ __m128i odd_mask_i32_i32x4 = _mm_cmpeq_epi32(parity_i32x4, _mm_set1_epi32(1));
279
280
  // Expand 32-bit mask to 64-bit by shuffling
280
- __m256i odd_mask_i64 = _mm256_cvtepi32_epi64(odd_mask_i32);
281
- __m256d float_mask = _mm256_castsi256_pd(odd_mask_i64);
282
- __m256d negated_angles = _mm256_sub_pd(_mm256_setzero_pd(), angles);
283
- angles = _mm256_blendv_pd(angles, negated_angles, float_mask);
281
+ __m256i odd_mask_i64_i32x8 = _mm256_cvtepi32_epi64(odd_mask_i32_i32x4);
282
+ __m256d float_mask_f64x4 = _mm256_castsi256_pd(odd_mask_i64_i32x8);
283
+ __m256d negated_angles_f64x4 = _mm256_sub_pd(_mm256_setzero_pd(), angles_f64x4);
284
+ angles_f64x4 = _mm256_blendv_pd(angles_f64x4, negated_angles_f64x4, float_mask_f64x4);
284
285
 
285
- __m256d const angles_squared = _mm256_mul_pd(angles, angles);
286
- __m256d const angles_cubed = _mm256_mul_pd(angles, angles_squared);
287
- __m256d const angles_quadratic = _mm256_mul_pd(angles_squared, angles_squared);
288
- __m256d const angles_octic = _mm256_mul_pd(angles_quadratic, angles_quadratic);
286
+ __m256d const angles_squared_f64x4 = _mm256_mul_pd(angles_f64x4, angles_f64x4);
287
+ __m256d const angles_cubed_f64x4 = _mm256_mul_pd(angles_f64x4, angles_squared_f64x4);
288
+ __m256d const angles_quadratic_f64x4 = _mm256_mul_pd(angles_squared_f64x4, angles_squared_f64x4);
289
+ __m256d const angles_octic_f64x4 = _mm256_mul_pd(angles_quadratic_f64x4, angles_quadratic_f64x4);
289
290
 
290
291
  // Compute higher-degree polynomial terms
291
- __m256d const poly_67 = _mm256_fmadd_pd(angles_squared, coeff_7, coeff_6);
292
- __m256d const poly_45 = _mm256_fmadd_pd(angles_squared, coeff_5, coeff_4);
293
- __m256d const poly_4567 = _mm256_fmadd_pd(angles_quadratic, poly_67, poly_45);
292
+ __m256d const poly_67_f64x4 = _mm256_fmadd_pd(angles_squared_f64x4, coeff_7_f64x4, coeff_6_f64x4);
293
+ __m256d const poly_45_f64x4 = _mm256_fmadd_pd(angles_squared_f64x4, coeff_5_f64x4, coeff_4_f64x4);
294
+ __m256d const poly_4567_f64x4 = _mm256_fmadd_pd(angles_quadratic_f64x4, poly_67_f64x4, poly_45_f64x4);
294
295
 
295
296
  // Compute lower-degree polynomial terms
296
- __m256d const poly_23 = _mm256_fmadd_pd(angles_squared, coeff_3, coeff_2);
297
- __m256d const poly_01 = _mm256_fmadd_pd(angles_squared, coeff_1, coeff_0);
298
- __m256d const poly_0123 = _mm256_fmadd_pd(angles_quadratic, poly_23, poly_01);
297
+ __m256d const poly_23_f64x4 = _mm256_fmadd_pd(angles_squared_f64x4, coeff_3_f64x4, coeff_2_f64x4);
298
+ __m256d const poly_01_f64x4 = _mm256_fmadd_pd(angles_squared_f64x4, coeff_1_f64x4, coeff_0_f64x4);
299
+ __m256d const poly_0123_f64x4 = _mm256_fmadd_pd(angles_quadratic_f64x4, poly_23_f64x4, poly_01_f64x4);
299
300
 
300
301
  // Combine polynomial terms
301
- __m256d results = _mm256_fmadd_pd(angles_octic, poly_4567, poly_0123);
302
- results = _mm256_fmadd_pd(results, angles_squared, coeff_8);
303
- results = _mm256_fmadd_pd(results, angles_cubed, angles);
302
+ __m256d results_f64x4 = _mm256_fmadd_pd(angles_octic_f64x4, poly_4567_f64x4, poly_0123_f64x4);
303
+ results_f64x4 = _mm256_fmadd_pd(results_f64x4, angles_squared_f64x4, coeff_8_f64x4);
304
+ results_f64x4 = _mm256_fmadd_pd(results_f64x4, angles_cubed_f64x4, angles_f64x4);
304
305
 
305
306
  // Handle the special case of negative zero input
306
- __m256d const non_zero_mask = _mm256_cmp_pd(angles_radians, _mm256_setzero_pd(), _CMP_NEQ_UQ);
307
- results = _mm256_and_pd(results, non_zero_mask);
308
- return results;
307
+ __m256d const non_zero_mask_f64x4 = _mm256_cmp_pd(angles_radians, _mm256_setzero_pd(), _CMP_NEQ_UQ);
308
+ results_f64x4 = _mm256_and_pd(results_f64x4, non_zero_mask_f64x4);
309
+ return results_f64x4;
309
310
  }
310
311
 
311
312
  NK_INTERNAL __m256d nk_cos_f64x4_haswell_(__m256d const angles_radians) {
312
313
  // Constants for argument reduction
313
- __m256d const pi_high_half = _mm256_set1_pd(3.141592653589793116 * 0.5); // High-digits part of π/2
314
- __m256d const pi_low_half = _mm256_set1_pd(1.2246467991473532072e-16 * 0.5); // Low-digits part of π/2
315
- __m256d const pi_reciprocal = _mm256_set1_pd(0.31830988618379067154); // 1/π
314
+ __m256d const pi_high_half_f64x4 = _mm256_set1_pd(3.141592653589793116 * 0.5); // High-digits part of π/2
315
+ __m256d const pi_low_half_f64x4 = _mm256_set1_pd(1.2246467991473532072e-16 * 0.5); // Low-digits part of π/2
316
+ __m256d const pi_reciprocal_f64x4 = _mm256_set1_pd(0.31830988618379067154); // 1/π
316
317
 
317
318
  // Polynomial coefficients for cosine approximation
318
- __m256d const coeff_0 = _mm256_set1_pd(+0.00833333333333332974823815);
319
- __m256d const coeff_1 = _mm256_set1_pd(-0.000198412698412696162806809);
320
- __m256d const coeff_2 = _mm256_set1_pd(+2.75573192239198747630416e-06);
321
- __m256d const coeff_3 = _mm256_set1_pd(-2.50521083763502045810755e-08);
322
- __m256d const coeff_4 = _mm256_set1_pd(+1.60590430605664501629054e-10);
323
- __m256d const coeff_5 = _mm256_set1_pd(-7.64712219118158833288484e-13);
324
- __m256d const coeff_6 = _mm256_set1_pd(+2.81009972710863200091251e-15);
325
- __m256d const coeff_7 = _mm256_set1_pd(-7.97255955009037868891952e-18);
326
- __m256d const coeff_8 = _mm256_set1_pd(-0.166666666666666657414808);
327
-
328
- // Compute (rounded_quotients) = 2 * round(angle / π - 0.5) + 1
329
- // Use fmsub: a*b - c = angles * (1/π) - 0.5
330
- __m256d const quotients = _mm256_fmsub_pd(angles_radians, pi_reciprocal, _mm256_set1_pd(0.5));
331
- __m256d const rounded_quotients = _mm256_fmadd_pd( //
332
- _mm256_set1_pd(2.0), //
333
- _mm256_round_pd(quotients, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), //
319
+ __m256d const coeff_0_f64x4 = _mm256_set1_pd(+0.00833333333333332974823815);
320
+ __m256d const coeff_1_f64x4 = _mm256_set1_pd(-0.000198412698412696162806809);
321
+ __m256d const coeff_2_f64x4 = _mm256_set1_pd(+2.75573192239198747630416e-06);
322
+ __m256d const coeff_3_f64x4 = _mm256_set1_pd(-2.50521083763502045810755e-08);
323
+ __m256d const coeff_4_f64x4 = _mm256_set1_pd(+1.60590430605664501629054e-10);
324
+ __m256d const coeff_5_f64x4 = _mm256_set1_pd(-7.64712219118158833288484e-13);
325
+ __m256d const coeff_6_f64x4 = _mm256_set1_pd(+2.81009972710863200091251e-15);
326
+ __m256d const coeff_7_f64x4 = _mm256_set1_pd(-7.97255955009037868891952e-18);
327
+ __m256d const coeff_8_f64x4 = _mm256_set1_pd(-0.166666666666666657414808);
328
+
329
+ // Compute (rounded_quotients_f64x4) = 2 * round(angle / π - 0.5) + 1
330
+ // Use fmsub: a*b - c = angles_f64x4 * (1/π) - 0.5
331
+ __m256d const quotients_f64x4 = _mm256_fmsub_pd(angles_radians, pi_reciprocal_f64x4, _mm256_set1_pd(0.5));
332
+ __m256d const rounded_quotients_f64x4 = _mm256_fmadd_pd( //
333
+ _mm256_set1_pd(2.0), //
334
+ _mm256_round_pd(quotients_f64x4, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), //
334
335
  _mm256_set1_pd(1.0));
335
336
 
336
- // Reduce the angle: angle - (rounded_quotients * π_high_half + rounded_quotients * π_low_half)
337
- __m256d angles = angles_radians;
338
- angles = _mm256_fnmadd_pd(rounded_quotients, pi_high_half, angles);
339
- angles = _mm256_fnmadd_pd(rounded_quotients, pi_low_half, angles);
340
-
341
- // If (rounded_quotients & 2) == 0, negate the angle
342
- // Use truncation (MXCSR-independent) since rounded_quotients is already integer-valued
343
- __m128i quotients_i32 = _mm256_cvttpd_epi32(rounded_quotients);
344
- __m128i bit2 = _mm_and_si128(quotients_i32, _mm_set1_epi32(2));
345
- __m128i flip_mask_i32 = _mm_cmpeq_epi32(bit2, _mm_setzero_si128());
346
- __m256i flip_mask_i64 = _mm256_cvtepi32_epi64(flip_mask_i32);
347
- __m256d float_mask = _mm256_castsi256_pd(flip_mask_i64);
348
- __m256d negated_angles = _mm256_sub_pd(_mm256_setzero_pd(), angles);
349
- angles = _mm256_blendv_pd(angles, negated_angles, float_mask);
350
-
351
- __m256d const angles_squared = _mm256_mul_pd(angles, angles);
352
- __m256d const angles_cubed = _mm256_mul_pd(angles, angles_squared);
353
- __m256d const angles_quadratic = _mm256_mul_pd(angles_squared, angles_squared);
354
- __m256d const angles_octic = _mm256_mul_pd(angles_quadratic, angles_quadratic);
337
+ // Reduce the angle: angle - (rounded_quotients_f64x4 * π_high_half + rounded_quotients_f64x4 * π_low_half)
338
+ __m256d angles_f64x4 = angles_radians;
339
+ angles_f64x4 = _mm256_fnmadd_pd(rounded_quotients_f64x4, pi_high_half_f64x4, angles_f64x4);
340
+ angles_f64x4 = _mm256_fnmadd_pd(rounded_quotients_f64x4, pi_low_half_f64x4, angles_f64x4);
341
+
342
+ // If (rounded_quotients_f64x4 & 2) == 0, negate the angle
343
+ // Use truncation (MXCSR-independent) since rounded_quotients_f64x4 is already integer-valued
344
+ __m128i quotients_i32_i32x4 = _mm256_cvttpd_epi32(rounded_quotients_f64x4);
345
+ __m128i bit2_i32x4 = _mm_and_si128(quotients_i32_i32x4, _mm_set1_epi32(2));
346
+ __m128i flip_mask_i32_i32x4 = _mm_cmpeq_epi32(bit2_i32x4, _mm_setzero_si128());
347
+ __m256i flip_mask_i64_i32x8 = _mm256_cvtepi32_epi64(flip_mask_i32_i32x4);
348
+ __m256d float_mask_f64x4 = _mm256_castsi256_pd(flip_mask_i64_i32x8);
349
+ __m256d negated_angles_f64x4 = _mm256_sub_pd(_mm256_setzero_pd(), angles_f64x4);
350
+ angles_f64x4 = _mm256_blendv_pd(angles_f64x4, negated_angles_f64x4, float_mask_f64x4);
351
+
352
+ __m256d const angles_squared_f64x4 = _mm256_mul_pd(angles_f64x4, angles_f64x4);
353
+ __m256d const angles_cubed_f64x4 = _mm256_mul_pd(angles_f64x4, angles_squared_f64x4);
354
+ __m256d const angles_quadratic_f64x4 = _mm256_mul_pd(angles_squared_f64x4, angles_squared_f64x4);
355
+ __m256d const angles_octic_f64x4 = _mm256_mul_pd(angles_quadratic_f64x4, angles_quadratic_f64x4);
355
356
 
356
357
  // Compute higher-degree polynomial terms
357
- __m256d const poly_67 = _mm256_fmadd_pd(angles_squared, coeff_7, coeff_6);
358
- __m256d const poly_45 = _mm256_fmadd_pd(angles_squared, coeff_5, coeff_4);
359
- __m256d const poly_4567 = _mm256_fmadd_pd(angles_quadratic, poly_67, poly_45);
358
+ __m256d const poly_67_f64x4 = _mm256_fmadd_pd(angles_squared_f64x4, coeff_7_f64x4, coeff_6_f64x4);
359
+ __m256d const poly_45_f64x4 = _mm256_fmadd_pd(angles_squared_f64x4, coeff_5_f64x4, coeff_4_f64x4);
360
+ __m256d const poly_4567_f64x4 = _mm256_fmadd_pd(angles_quadratic_f64x4, poly_67_f64x4, poly_45_f64x4);
360
361
 
361
362
  // Compute lower-degree polynomial terms
362
- __m256d const poly_23 = _mm256_fmadd_pd(angles_squared, coeff_3, coeff_2);
363
- __m256d const poly_01 = _mm256_fmadd_pd(angles_squared, coeff_1, coeff_0);
364
- __m256d const poly_0123 = _mm256_fmadd_pd(angles_quadratic, poly_23, poly_01);
363
+ __m256d const poly_23_f64x4 = _mm256_fmadd_pd(angles_squared_f64x4, coeff_3_f64x4, coeff_2_f64x4);
364
+ __m256d const poly_01_f64x4 = _mm256_fmadd_pd(angles_squared_f64x4, coeff_1_f64x4, coeff_0_f64x4);
365
+ __m256d const poly_0123_f64x4 = _mm256_fmadd_pd(angles_quadratic_f64x4, poly_23_f64x4, poly_01_f64x4);
365
366
 
366
367
  // Combine polynomial terms
367
- __m256d results = _mm256_fmadd_pd(angles_octic, poly_4567, poly_0123);
368
- results = _mm256_fmadd_pd(results, angles_squared, coeff_8);
369
- results = _mm256_fmadd_pd(results, angles_cubed, angles);
370
- return results;
368
+ __m256d results_f64x4 = _mm256_fmadd_pd(angles_octic_f64x4, poly_4567_f64x4, poly_0123_f64x4);
369
+ results_f64x4 = _mm256_fmadd_pd(results_f64x4, angles_squared_f64x4, coeff_8_f64x4);
370
+ results_f64x4 = _mm256_fmadd_pd(results_f64x4, angles_cubed_f64x4, angles_f64x4);
371
+ return results_f64x4;
371
372
  }
372
373
 
373
374
  NK_INTERNAL __m256d nk_atan_f64x4_haswell_(__m256d const inputs) {
374
375
  // Polynomial coefficients for atan approximation (19 coefficients)
375
376
  // The polynomial approximates: atan(x) ≈ x + x³ * P(x²) where P has 19 terms
376
- __m256d const coeff_19 = _mm256_set1_pd(-1.88796008463073496563746e-05);
377
- __m256d const coeff_18 = _mm256_set1_pd(+0.000209850076645816976906797);
378
- __m256d const coeff_17 = _mm256_set1_pd(-0.00110611831486672482563471);
379
- __m256d const coeff_16 = _mm256_set1_pd(+0.00370026744188713119232403);
380
- __m256d const coeff_15 = _mm256_set1_pd(-0.00889896195887655491740809);
381
- __m256d const coeff_14 = _mm256_set1_pd(+0.016599329773529201970117);
382
- __m256d const coeff_13 = _mm256_set1_pd(-0.0254517624932312641616861);
383
- __m256d const coeff_12 = _mm256_set1_pd(+0.0337852580001353069993897);
384
- __m256d const coeff_11 = _mm256_set1_pd(-0.0407629191276836500001934);
385
- __m256d const coeff_10 = _mm256_set1_pd(+0.0466667150077840625632675);
386
- __m256d const coeff_9 = _mm256_set1_pd(-0.0523674852303482457616113);
387
- __m256d const coeff_8 = _mm256_set1_pd(+0.0587666392926673580854313);
388
- __m256d const coeff_7 = _mm256_set1_pd(-0.0666573579361080525984562);
389
- __m256d const coeff_6 = _mm256_set1_pd(+0.0769219538311769618355029);
390
- __m256d const coeff_5 = _mm256_set1_pd(-0.090908995008245008229153);
391
- __m256d const coeff_4 = _mm256_set1_pd(+0.111111105648261418443745);
392
- __m256d const coeff_3 = _mm256_set1_pd(-0.14285714266771329383765);
393
- __m256d const coeff_2 = _mm256_set1_pd(+0.199999999996591265594148);
394
- __m256d const coeff_1 = _mm256_set1_pd(-0.333333333333311110369124);
395
- __m256d const sign_mask = _mm256_set1_pd(-0.0);
396
-
397
- // Adjust for quadrant - detect negative values
398
- __m256d values = inputs;
399
- __m256d negative_mask = _mm256_cmp_pd(values, _mm256_setzero_pd(), _CMP_LT_OS);
400
- values = _mm256_andnot_pd(sign_mask, values); // abs(values)
401
-
402
- // Check if values > 1 (need reciprocal)
377
+ __m256d const coeff_19_f64x4 = _mm256_set1_pd(-1.88796008463073496563746e-05);
378
+ __m256d const coeff_18_f64x4 = _mm256_set1_pd(+0.000209850076645816976906797);
379
+ __m256d const coeff_17_f64x4 = _mm256_set1_pd(-0.00110611831486672482563471);
380
+ __m256d const coeff_16_f64x4 = _mm256_set1_pd(+0.00370026744188713119232403);
381
+ __m256d const coeff_15_f64x4 = _mm256_set1_pd(-0.00889896195887655491740809);
382
+ __m256d const coeff_14_f64x4 = _mm256_set1_pd(+0.016599329773529201970117);
383
+ __m256d const coeff_13_f64x4 = _mm256_set1_pd(-0.0254517624932312641616861);
384
+ __m256d const coeff_12_f64x4 = _mm256_set1_pd(+0.0337852580001353069993897);
385
+ __m256d const coeff_11_f64x4 = _mm256_set1_pd(-0.0407629191276836500001934);
386
+ __m256d const coeff_10_f64x4 = _mm256_set1_pd(+0.0466667150077840625632675);
387
+ __m256d const coeff_9_f64x4 = _mm256_set1_pd(-0.0523674852303482457616113);
388
+ __m256d const coeff_8_f64x4 = _mm256_set1_pd(+0.0587666392926673580854313);
389
+ __m256d const coeff_7_f64x4 = _mm256_set1_pd(-0.0666573579361080525984562);
390
+ __m256d const coeff_6_f64x4 = _mm256_set1_pd(+0.0769219538311769618355029);
391
+ __m256d const coeff_5_f64x4 = _mm256_set1_pd(-0.090908995008245008229153);
392
+ __m256d const coeff_4_f64x4 = _mm256_set1_pd(+0.111111105648261418443745);
393
+ __m256d const coeff_3_f64x4 = _mm256_set1_pd(-0.14285714266771329383765);
394
+ __m256d const coeff_2_f64x4 = _mm256_set1_pd(+0.199999999996591265594148);
395
+ __m256d const coeff_1_f64x4 = _mm256_set1_pd(-0.333333333333311110369124);
396
+ __m256d const sign_mask_f64x4 = _mm256_set1_pd(-0.0);
397
+
398
+ // Adjust for quadrant - detect negative values_f64x4
399
+ __m256d values_f64x4 = inputs;
400
+ __m256d negative_mask_f64x4 = _mm256_cmp_pd(values_f64x4, _mm256_setzero_pd(), _CMP_LT_OS);
401
+ values_f64x4 = _mm256_andnot_pd(sign_mask_f64x4, values_f64x4); // abs(values_f64x4)
402
+
403
+ // Check if values_f64x4 > 1 (need reciprocal)
403
404
  // Note: For f64, we keep VDIVPD since RCPPD doesn't exist and Newton-Raphson
404
405
  // would need 2 iterations for sufficient precision (~44 bits needed for f64)
405
- __m256d reciprocal_mask = _mm256_cmp_pd(values, _mm256_set1_pd(1.0), _CMP_GT_OS);
406
- __m256d reciprocal_values = _mm256_div_pd(_mm256_set1_pd(1.0), values);
407
- values = _mm256_blendv_pd(values, reciprocal_values, reciprocal_mask);
406
+ __m256d reciprocal_mask_f64x4 = _mm256_cmp_pd(values_f64x4, _mm256_set1_pd(1.0), _CMP_GT_OS);
407
+ __m256d reciprocal_values_f64x4 = _mm256_div_pd(_mm256_set1_pd(1.0), values_f64x4);
408
+ values_f64x4 = _mm256_blendv_pd(values_f64x4, reciprocal_values_f64x4, reciprocal_mask_f64x4);
408
409
 
409
410
  // Argument reduction
410
- __m256d const values_squared = _mm256_mul_pd(values, values);
411
- __m256d const values_cubed = _mm256_mul_pd(values, values_squared);
411
+ __m256d const values_squared_f64x4 = _mm256_mul_pd(values_f64x4, values_f64x4);
412
+ __m256d const values_cubed_f64x4 = _mm256_mul_pd(values_f64x4, values_squared_f64x4);
412
413
 
413
414
  // Polynomial evaluation using Horner's method.
414
415
  // For large arrays, out-of-order execution across loop iterations already hides
415
416
  // FMA latency. Estrin's scheme was tested but showed minimal improvement (~1%)
416
417
  // while adding complexity. Keeping Horner for maintainability.
417
- __m256d polynomials = coeff_19;
418
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_18);
419
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_17);
420
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_16);
421
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_15);
422
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_14);
423
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_13);
424
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_12);
425
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_11);
426
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_10);
427
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_9);
428
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_8);
429
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_7);
430
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_6);
431
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_5);
432
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_4);
433
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_3);
434
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_2);
435
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_1);
436
-
437
- // Compute result
438
- __m256d result = _mm256_fmadd_pd(values_cubed, polynomials, values);
439
-
440
- // Adjust for reciprocal: result = π/2 - result
441
- __m256d adjusted = _mm256_sub_pd(_mm256_set1_pd(1.5707963267948966), result);
442
- result = _mm256_blendv_pd(result, adjusted, reciprocal_mask);
443
-
444
- // Adjust for negative: result = -result
445
- __m256d negated = _mm256_sub_pd(_mm256_setzero_pd(), result);
446
- result = _mm256_blendv_pd(result, negated, negative_mask);
447
- return result;
418
+ __m256d polynomials_f64x4 = coeff_19_f64x4;
419
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_18_f64x4);
420
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_17_f64x4);
421
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_16_f64x4);
422
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_15_f64x4);
423
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_14_f64x4);
424
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_13_f64x4);
425
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_12_f64x4);
426
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_11_f64x4);
427
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_10_f64x4);
428
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_9_f64x4);
429
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_8_f64x4);
430
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_7_f64x4);
431
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_6_f64x4);
432
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_5_f64x4);
433
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_4_f64x4);
434
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_3_f64x4);
435
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_2_f64x4);
436
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_1_f64x4);
437
+
438
+ // Compute result_f64x4
439
+ __m256d result_f64x4 = _mm256_fmadd_pd(values_cubed_f64x4, polynomials_f64x4, values_f64x4);
440
+
441
+ // Adjust for reciprocal: result_f64x4 = π/2 - result_f64x4
442
+ __m256d adjusted_f64x4 = _mm256_sub_pd(_mm256_set1_pd(1.5707963267948966), result_f64x4);
443
+ result_f64x4 = _mm256_blendv_pd(result_f64x4, adjusted_f64x4, reciprocal_mask_f64x4);
444
+
445
+ // Adjust for negative: result_f64x4 = -result_f64x4
446
+ __m256d negated_f64x4 = _mm256_sub_pd(_mm256_setzero_pd(), result_f64x4);
447
+ result_f64x4 = _mm256_blendv_pd(result_f64x4, negated_f64x4, negative_mask_f64x4);
448
+ return result_f64x4;
448
449
  }
449
450
 
450
451
NK_INTERNAL __m256d nk_atan2_f64x4_haswell_(__m256d const ys_inputs, __m256d const xs_inputs) {
    // atan2 over 4 packed doubles: reduce to atan on a ratio in [0, 1], evaluate a
    // degree-39 odd minimax polynomial (19 coefficients in r^2) via Horner's scheme,
    // then undo the reductions with quadrant offsets and sign transfer.
    __m256d const c19 = _mm256_set1_pd(-1.88796008463073496563746e-05);
    __m256d const c18 = _mm256_set1_pd(+0.000209850076645816976906797);
    __m256d const c17 = _mm256_set1_pd(-0.00110611831486672482563471);
    __m256d const c16 = _mm256_set1_pd(+0.00370026744188713119232403);
    __m256d const c15 = _mm256_set1_pd(-0.00889896195887655491740809);
    __m256d const c14 = _mm256_set1_pd(+0.016599329773529201970117);
    __m256d const c13 = _mm256_set1_pd(-0.0254517624932312641616861);
    __m256d const c12 = _mm256_set1_pd(+0.0337852580001353069993897);
    __m256d const c11 = _mm256_set1_pd(-0.0407629191276836500001934);
    __m256d const c10 = _mm256_set1_pd(+0.0466667150077840625632675);
    __m256d const c9 = _mm256_set1_pd(-0.0523674852303482457616113);
    __m256d const c8 = _mm256_set1_pd(+0.0587666392926673580854313);
    __m256d const c7 = _mm256_set1_pd(-0.0666573579361080525984562);
    __m256d const c6 = _mm256_set1_pd(+0.0769219538311769618355029);
    __m256d const c5 = _mm256_set1_pd(-0.090908995008245008229153);
    __m256d const c4 = _mm256_set1_pd(+0.111111105648261418443745);
    __m256d const c3 = _mm256_set1_pd(-0.14285714266771329383765);
    __m256d const c2 = _mm256_set1_pd(+0.199999999996591265594148);
    __m256d const c1 = _mm256_set1_pd(-0.333333333333311110369124);
    __m256d const sign_bit = _mm256_set1_pd(-0.0); // only the IEEE-754 sign bit set

    // Remember which x lanes are negative, then strip both signs for the reduction.
    __m256d x_is_negative = _mm256_cmp_pd(xs_inputs, _mm256_setzero_pd(), _CMP_LT_OS);
    __m256d abs_xs = _mm256_andnot_pd(sign_bit, xs_inputs); // |x|
    __m256d abs_ys = _mm256_andnot_pd(sign_bit, ys_inputs); // |y|

    // Where |y| > |x|, substitute (x, y) <- (|y|, -|x|) so the ratio below
    // stays inside the polynomial's convergence range.
    __m256d swapped = _mm256_cmp_pd(abs_ys, abs_xs, _CMP_GT_OS);
    __m256d saved_xs = abs_xs;
    abs_xs = _mm256_blendv_pd(abs_xs, abs_ys, swapped);
    __m256d negated_saved = _mm256_sub_pd(_mm256_setzero_pd(), saved_xs);
    abs_ys = _mm256_blendv_pd(abs_ys, negated_saved, swapped);

    // The ratio and its powers feed Horner evaluation.
    __m256d const r = _mm256_div_pd(abs_ys, abs_xs);
    __m256d const r2 = _mm256_mul_pd(r, r);
    __m256d const r3 = _mm256_mul_pd(r, r2);

    // Horner's method in r^2, highest coefficient first.
    __m256d poly = c19;
    poly = _mm256_fmadd_pd(poly, r2, c18);
    poly = _mm256_fmadd_pd(poly, r2, c17);
    poly = _mm256_fmadd_pd(poly, r2, c16);
    poly = _mm256_fmadd_pd(poly, r2, c15);
    poly = _mm256_fmadd_pd(poly, r2, c14);
    poly = _mm256_fmadd_pd(poly, r2, c13);
    poly = _mm256_fmadd_pd(poly, r2, c12);
    poly = _mm256_fmadd_pd(poly, r2, c11);
    poly = _mm256_fmadd_pd(poly, r2, c10);
    poly = _mm256_fmadd_pd(poly, r2, c9);
    poly = _mm256_fmadd_pd(poly, r2, c8);
    poly = _mm256_fmadd_pd(poly, r2, c7);
    poly = _mm256_fmadd_pd(poly, r2, c6);
    poly = _mm256_fmadd_pd(poly, r2, c5);
    poly = _mm256_fmadd_pd(poly, r2, c4);
    poly = _mm256_fmadd_pd(poly, r2, c3);
    poly = _mm256_fmadd_pd(poly, r2, c2);
    poly = _mm256_fmadd_pd(poly, r2, c1);

    // atan(r) ~= r + r^3 * poly(r^2)
    __m256d outs = _mm256_fmadd_pd(r3, poly, r);

    // Quadrant index per lane: 0 (x>=0, no swap), 1 (x>=0, swap),
    // -2 (x<0, no swap), -1 (x<0, swap).
    __m256d quadrants = _mm256_setzero_pd();
    __m256d minus_two = _mm256_set1_pd(-2.0);
    quadrants = _mm256_blendv_pd(quadrants, minus_two, x_is_negative);
    __m256d ones = _mm256_set1_pd(1.0);
    __m256d quadrants_plus_one = _mm256_add_pd(quadrants, ones);
    quadrants = _mm256_blendv_pd(quadrants, quadrants_plus_one, swapped);

    // Fold the quadrant back in: outs += quadrants * pi/2.
    __m256d half_pi = _mm256_set1_pd(1.5707963267948966);
    outs = _mm256_fmadd_pd(quadrants, half_pi, outs);

    // Transfer the sign of the raw x input (XOR with its sign bit).
    __m256d x_signs = _mm256_and_pd(xs_inputs, sign_bit);
    outs = _mm256_xor_pd(outs, x_signs);

    // Transfer the sign of the raw y input the same way.
    __m256d y_signs = _mm256_and_pd(ys_inputs, sign_bit);
    outs = _mm256_xor_pd(outs, y_signs);

    return outs;
}
537
538
 
538
539
  NK_PUBLIC void nk_each_sin_f32_haswell(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
539
540
  nk_size_t i = 0;
540
541
  for (; i + 8 <= n; i += 8) {
541
- __m256 angles = _mm256_loadu_ps(ins + i);
542
- __m256 results = nk_sin_f32x8_haswell_(angles);
543
- _mm256_storeu_ps(outs + i, results);
542
+ __m256 angles_f32x8 = _mm256_loadu_ps(ins + i);
543
+ __m256 results_f32x8 = nk_sin_f32x8_haswell_(angles_f32x8);
544
+ _mm256_storeu_ps(outs + i, results_f32x8);
544
545
  }
545
546
  if (i < n) {
546
547
  nk_size_t remaining = n - i;
@@ -555,9 +556,9 @@ NK_PUBLIC void nk_each_sin_f32_haswell(nk_f32_t const *ins, nk_size_t n, nk_f32_
555
556
  NK_PUBLIC void nk_each_cos_f32_haswell(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
556
557
  nk_size_t i = 0;
557
558
  for (; i + 8 <= n; i += 8) {
558
- __m256 angles = _mm256_loadu_ps(ins + i);
559
- __m256 results = nk_cos_f32x8_haswell_(angles);
560
- _mm256_storeu_ps(outs + i, results);
559
+ __m256 angles_f32x8 = _mm256_loadu_ps(ins + i);
560
+ __m256 results_f32x8 = nk_cos_f32x8_haswell_(angles_f32x8);
561
+ _mm256_storeu_ps(outs + i, results_f32x8);
561
562
  }
562
563
  if (i < n) {
563
564
  nk_size_t remaining = n - i;
@@ -572,9 +573,9 @@ NK_PUBLIC void nk_each_cos_f32_haswell(nk_f32_t const *ins, nk_size_t n, nk_f32_
572
573
  NK_PUBLIC void nk_each_atan_f32_haswell(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
573
574
  nk_size_t i = 0;
574
575
  for (; i + 8 <= n; i += 8) {
575
- __m256 values = _mm256_loadu_ps(ins + i);
576
- __m256 results = nk_atan_f32x8_haswell_(values);
577
- _mm256_storeu_ps(outs + i, results);
576
+ __m256 values_f32x8 = _mm256_loadu_ps(ins + i);
577
+ __m256 results_f32x8 = nk_atan_f32x8_haswell_(values_f32x8);
578
+ _mm256_storeu_ps(outs + i, results_f32x8);
578
579
  }
579
580
  if (i < n) {
580
581
  nk_size_t remaining = n - i;
@@ -589,9 +590,9 @@ NK_PUBLIC void nk_each_atan_f32_haswell(nk_f32_t const *ins, nk_size_t n, nk_f32
589
590
  NK_PUBLIC void nk_each_sin_f64_haswell(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
590
591
  nk_size_t i = 0;
591
592
  for (; i + 4 <= n; i += 4) {
592
- __m256d angles = _mm256_loadu_pd(ins + i);
593
- __m256d results = nk_sin_f64x4_haswell_(angles);
594
- _mm256_storeu_pd(outs + i, results);
593
+ __m256d angles_f64x4 = _mm256_loadu_pd(ins + i);
594
+ __m256d results_f64x4 = nk_sin_f64x4_haswell_(angles_f64x4);
595
+ _mm256_storeu_pd(outs + i, results_f64x4);
595
596
  }
596
597
  if (i < n) {
597
598
  nk_size_t remaining = n - i;
@@ -606,9 +607,9 @@ NK_PUBLIC void nk_each_sin_f64_haswell(nk_f64_t const *ins, nk_size_t n, nk_f64_
606
607
  NK_PUBLIC void nk_each_cos_f64_haswell(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
607
608
  nk_size_t i = 0;
608
609
  for (; i + 4 <= n; i += 4) {
609
- __m256d angles = _mm256_loadu_pd(ins + i);
610
- __m256d results = nk_cos_f64x4_haswell_(angles);
611
- _mm256_storeu_pd(outs + i, results);
610
+ __m256d angles_f64x4 = _mm256_loadu_pd(ins + i);
611
+ __m256d results_f64x4 = nk_cos_f64x4_haswell_(angles_f64x4);
612
+ _mm256_storeu_pd(outs + i, results_f64x4);
612
613
  }
613
614
  if (i < n) {
614
615
  nk_size_t remaining = n - i;
@@ -623,9 +624,9 @@ NK_PUBLIC void nk_each_cos_f64_haswell(nk_f64_t const *ins, nk_size_t n, nk_f64_
623
624
  NK_PUBLIC void nk_each_atan_f64_haswell(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
624
625
  nk_size_t i = 0;
625
626
  for (; i + 4 <= n; i += 4) {
626
- __m256d values = _mm256_loadu_pd(ins + i);
627
- __m256d results = nk_atan_f64x4_haswell_(values);
628
- _mm256_storeu_pd(outs + i, results);
627
+ __m256d values_f64x4 = _mm256_loadu_pd(ins + i);
628
+ __m256d results_f64x4 = nk_atan_f64x4_haswell_(values_f64x4);
629
+ _mm256_storeu_pd(outs + i, results_f64x4);
629
630
  }
630
631
  if (i < n) {
631
632
  nk_size_t remaining = n - i;