npm - numkong - Versions diffs - 7.0.0 → 7.4.1 - Mend

numkong 7.0.0 → 7.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (315)

package/README.md +239 -122
package/binding.gyp +25 -491
package/c/dispatch_bf16.c +59 -1
package/c/dispatch_e2m3.c +41 -8
package/c/dispatch_e3m2.c +49 -8
package/c/dispatch_e4m3.c +51 -9
package/c/dispatch_e5m2.c +45 -1
package/c/dispatch_f16.c +79 -26
package/c/dispatch_f16c.c +5 -5
package/c/dispatch_f32.c +56 -0
package/c/dispatch_f64.c +52 -0
package/c/dispatch_i4.c +3 -0
package/c/dispatch_i8.c +62 -3
package/c/dispatch_other.c +18 -0
package/c/dispatch_u1.c +54 -9
package/c/dispatch_u4.c +3 -0
package/c/dispatch_u8.c +64 -3
package/c/numkong.c +3 -0
package/include/README.md +79 -9
package/include/numkong/attention/sapphireamx.h +278 -276
package/include/numkong/attention/sme.h +983 -977
package/include/numkong/attention.h +1 -1
package/include/numkong/capabilities.h +289 -94
package/include/numkong/cast/README.md +40 -40
package/include/numkong/cast/diamond.h +64 -0
package/include/numkong/cast/haswell.h +42 -194
package/include/numkong/cast/icelake.h +42 -37
package/include/numkong/cast/loongsonasx.h +252 -0
package/include/numkong/cast/neon.h +216 -249
package/include/numkong/cast/powervsx.h +449 -0
package/include/numkong/cast/rvv.h +223 -274
package/include/numkong/cast/sapphire.h +18 -18
package/include/numkong/cast/serial.h +1018 -944
package/include/numkong/cast/skylake.h +82 -23
package/include/numkong/cast/v128relaxed.h +462 -105
package/include/numkong/cast.h +24 -0
package/include/numkong/cast.hpp +44 -0
package/include/numkong/curved/README.md +17 -17
package/include/numkong/curved/neon.h +131 -7
package/include/numkong/curved/neonbfdot.h +6 -7
package/include/numkong/curved/rvv.h +26 -26
package/include/numkong/curved/smef64.h +186 -182
package/include/numkong/curved.h +14 -18
package/include/numkong/dot/README.md +154 -137
package/include/numkong/dot/alder.h +43 -43
package/include/numkong/dot/diamond.h +158 -0
package/include/numkong/dot/genoa.h +4 -30
package/include/numkong/dot/haswell.h +215 -180
package/include/numkong/dot/icelake.h +190 -76
package/include/numkong/dot/loongsonasx.h +671 -0
package/include/numkong/dot/neon.h +124 -73
package/include/numkong/dot/neonbfdot.h +11 -12
package/include/numkong/dot/neonfhm.h +44 -46
package/include/numkong/dot/neonfp8.h +323 -0
package/include/numkong/dot/neonsdot.h +190 -76
package/include/numkong/dot/powervsx.h +752 -0
package/include/numkong/dot/rvv.h +92 -84
package/include/numkong/dot/rvvbf16.h +12 -12
package/include/numkong/dot/rvvhalf.h +12 -12
package/include/numkong/dot/sapphire.h +4 -4
package/include/numkong/dot/serial.h +66 -30
package/include/numkong/dot/sierra.h +31 -31
package/include/numkong/dot/skylake.h +142 -110
package/include/numkong/dot/sve.h +217 -177
package/include/numkong/dot/svebfdot.h +10 -10
package/include/numkong/dot/svehalf.h +85 -41
package/include/numkong/dot/svesdot.h +89 -0
package/include/numkong/dot/v128relaxed.h +124 -89
package/include/numkong/dot.h +114 -48
package/include/numkong/dots/README.md +203 -203
package/include/numkong/dots/alder.h +12 -9
package/include/numkong/dots/diamond.h +86 -0
package/include/numkong/dots/genoa.h +10 -4
package/include/numkong/dots/haswell.h +63 -48
package/include/numkong/dots/icelake.h +27 -18
package/include/numkong/dots/loongsonasx.h +176 -0
package/include/numkong/dots/neon.h +14 -11
package/include/numkong/dots/neonbfdot.h +4 -3
package/include/numkong/dots/neonfhm.h +11 -9
package/include/numkong/dots/neonfp8.h +99 -0
package/include/numkong/dots/neonsdot.h +48 -12
package/include/numkong/dots/powervsx.h +194 -0
package/include/numkong/dots/rvv.h +451 -344
package/include/numkong/dots/sapphireamx.h +1028 -984
package/include/numkong/dots/serial.h +213 -197
package/include/numkong/dots/sierra.h +10 -7
package/include/numkong/dots/skylake.h +47 -36
package/include/numkong/dots/sme.h +2001 -2364
package/include/numkong/dots/smebi32.h +175 -162
package/include/numkong/dots/smef64.h +328 -323
package/include/numkong/dots/v128relaxed.h +64 -41
package/include/numkong/dots.h +573 -293
package/include/numkong/dots.hpp +45 -43
package/include/numkong/each/README.md +133 -137
package/include/numkong/each/haswell.h +6 -6
package/include/numkong/each/icelake.h +7 -7
package/include/numkong/each/neon.h +76 -42
package/include/numkong/each/neonbfdot.h +11 -12
package/include/numkong/each/neonhalf.h +24 -116
package/include/numkong/each/rvv.h +28 -28
package/include/numkong/each/sapphire.h +27 -161
package/include/numkong/each/serial.h +6 -6
package/include/numkong/each/skylake.h +7 -7
package/include/numkong/each/v128relaxed.h +562 -0
package/include/numkong/each.h +148 -62
package/include/numkong/each.hpp +2 -2
package/include/numkong/geospatial/README.md +18 -18
package/include/numkong/geospatial/haswell.h +365 -325
package/include/numkong/geospatial/neon.h +350 -306
package/include/numkong/geospatial/rvv.h +4 -4
package/include/numkong/geospatial/skylake.h +376 -340
package/include/numkong/geospatial/v128relaxed.h +366 -327
package/include/numkong/geospatial.h +17 -17
package/include/numkong/matrix.hpp +4 -4
package/include/numkong/maxsim/README.md +14 -14
package/include/numkong/maxsim/alder.h +6 -6
package/include/numkong/maxsim/genoa.h +4 -4
package/include/numkong/maxsim/haswell.h +6 -6
package/include/numkong/maxsim/icelake.h +18 -18
package/include/numkong/maxsim/neonsdot.h +21 -21
package/include/numkong/maxsim/sapphireamx.h +14 -14
package/include/numkong/maxsim/serial.h +6 -6
package/include/numkong/maxsim/sme.h +221 -196
package/include/numkong/maxsim/v128relaxed.h +6 -6
package/include/numkong/mesh/README.md +62 -56
package/include/numkong/mesh/haswell.h +339 -464
package/include/numkong/mesh/neon.h +1100 -519
package/include/numkong/mesh/neonbfdot.h +36 -68
package/include/numkong/mesh/rvv.h +530 -435
package/include/numkong/mesh/serial.h +75 -91
package/include/numkong/mesh/skylake.h +1627 -302
package/include/numkong/mesh/v128relaxed.h +443 -330
package/include/numkong/mesh.h +63 -49
package/include/numkong/mesh.hpp +4 -4
package/include/numkong/numkong.h +3 -3
package/include/numkong/numkong.hpp +1 -0
package/include/numkong/probability/README.md +23 -19
package/include/numkong/probability/neon.h +82 -52
package/include/numkong/probability/rvv.h +28 -23
package/include/numkong/probability/serial.h +51 -39
package/include/numkong/probability.h +20 -23
package/include/numkong/random.h +1 -1
package/include/numkong/reduce/README.md +143 -138
package/include/numkong/reduce/alder.h +81 -77
package/include/numkong/reduce/haswell.h +222 -220
package/include/numkong/reduce/neon.h +629 -519
package/include/numkong/reduce/neonbfdot.h +7 -218
package/include/numkong/reduce/neonfhm.h +9 -381
package/include/numkong/reduce/neonsdot.h +9 -9
package/include/numkong/reduce/rvv.h +928 -802
package/include/numkong/reduce/serial.h +23 -27
package/include/numkong/reduce/sierra.h +20 -20
package/include/numkong/reduce/skylake.h +326 -324
package/include/numkong/reduce/v128relaxed.h +52 -52
package/include/numkong/reduce.h +4 -23
package/include/numkong/reduce.hpp +156 -11
package/include/numkong/scalar/README.md +6 -6
package/include/numkong/scalar/haswell.h +26 -17
package/include/numkong/scalar/loongsonasx.h +74 -0
package/include/numkong/scalar/neon.h +9 -9
package/include/numkong/scalar/powervsx.h +96 -0
package/include/numkong/scalar/rvv.h +2 -2
package/include/numkong/scalar/sapphire.h +21 -10
package/include/numkong/scalar/serial.h +21 -21
package/include/numkong/scalar.h +13 -0
package/include/numkong/set/README.md +28 -28
package/include/numkong/set/haswell.h +12 -12
package/include/numkong/set/icelake.h +14 -14
package/include/numkong/set/loongsonasx.h +181 -0
package/include/numkong/set/neon.h +17 -18
package/include/numkong/set/powervsx.h +326 -0
package/include/numkong/set/rvv.h +4 -4
package/include/numkong/set/serial.h +6 -6
package/include/numkong/set/sve.h +60 -59
package/include/numkong/set/v128relaxed.h +6 -6
package/include/numkong/set.h +21 -7
package/include/numkong/sets/README.md +26 -26
package/include/numkong/sets/loongsonasx.h +52 -0
package/include/numkong/sets/powervsx.h +65 -0
package/include/numkong/sets/smebi32.h +395 -364
package/include/numkong/sets.h +83 -40
package/include/numkong/sparse/README.md +4 -4
package/include/numkong/sparse/icelake.h +101 -101
package/include/numkong/sparse/serial.h +1 -1
package/include/numkong/sparse/sve2.h +137 -141
package/include/numkong/sparse/turin.h +12 -12
package/include/numkong/sparse.h +10 -10
package/include/numkong/spatial/README.md +230 -226
package/include/numkong/spatial/alder.h +113 -116
package/include/numkong/spatial/diamond.h +240 -0
package/include/numkong/spatial/genoa.h +0 -68
package/include/numkong/spatial/haswell.h +74 -55
package/include/numkong/spatial/icelake.h +539 -58
package/include/numkong/spatial/loongsonasx.h +483 -0
package/include/numkong/spatial/neon.h +125 -52
package/include/numkong/spatial/neonbfdot.h +8 -9
package/include/numkong/spatial/neonfp8.h +258 -0
package/include/numkong/spatial/neonsdot.h +180 -12
package/include/numkong/spatial/powervsx.h +738 -0
package/include/numkong/spatial/rvv.h +146 -139
package/include/numkong/spatial/rvvbf16.h +17 -12
package/include/numkong/spatial/rvvhalf.h +13 -10
package/include/numkong/spatial/serial.h +13 -12
package/include/numkong/spatial/sierra.h +232 -39
package/include/numkong/spatial/skylake.h +73 -74
package/include/numkong/spatial/sve.h +93 -72
package/include/numkong/spatial/svebfdot.h +29 -29
package/include/numkong/spatial/svehalf.h +52 -26
package/include/numkong/spatial/svesdot.h +142 -0
package/include/numkong/spatial/v128relaxed.h +293 -41
package/include/numkong/spatial.h +338 -82
package/include/numkong/spatials/README.md +194 -194
package/include/numkong/spatials/diamond.h +82 -0
package/include/numkong/spatials/haswell.h +2 -2
package/include/numkong/spatials/loongsonasx.h +153 -0
package/include/numkong/spatials/neonfp8.h +111 -0
package/include/numkong/spatials/neonsdot.h +34 -0
package/include/numkong/spatials/powervsx.h +153 -0
package/include/numkong/spatials/rvv.h +259 -243
package/include/numkong/spatials/sapphireamx.h +173 -173
package/include/numkong/spatials/serial.h +2 -2
package/include/numkong/spatials/skylake.h +2 -2
package/include/numkong/spatials/sme.h +590 -605
package/include/numkong/spatials/smef64.h +139 -130
package/include/numkong/spatials/v128relaxed.h +2 -2
package/include/numkong/spatials.h +820 -500
package/include/numkong/spatials.hpp +49 -48
package/include/numkong/tensor.hpp +406 -17
package/include/numkong/trigonometry/README.md +19 -19
package/include/numkong/trigonometry/haswell.h +402 -401
package/include/numkong/trigonometry/neon.h +386 -387
package/include/numkong/trigonometry/rvv.h +52 -51
package/include/numkong/trigonometry/serial.h +13 -13
package/include/numkong/trigonometry/skylake.h +373 -369
package/include/numkong/trigonometry/v128relaxed.h +375 -374
package/include/numkong/trigonometry.h +13 -13
package/include/numkong/trigonometry.hpp +2 -2
package/include/numkong/types.h +287 -49
package/include/numkong/types.hpp +436 -12
package/include/numkong/vector.hpp +82 -14
package/javascript/dist/cjs/numkong-wasm.js +6 -12
package/javascript/dist/cjs/numkong.d.ts +7 -1
package/javascript/dist/cjs/numkong.js +37 -11
package/javascript/dist/cjs/types.d.ts +9 -0
package/javascript/dist/cjs/types.js +96 -0
package/javascript/dist/esm/numkong-browser.d.ts +14 -0
package/javascript/dist/esm/numkong-browser.js +23 -0
package/javascript/dist/esm/numkong-wasm.js +6 -12
package/javascript/dist/esm/numkong.d.ts +7 -1
package/javascript/dist/esm/numkong.js +37 -11
package/javascript/dist/esm/types.d.ts +9 -0
package/javascript/dist/esm/types.js +96 -0
package/javascript/node-gyp-build.d.ts +4 -1
package/javascript/numkong-browser.ts +40 -0
package/javascript/numkong-wasm.ts +7 -13
package/javascript/numkong.c +5 -26
package/javascript/numkong.ts +36 -11
package/javascript/tsconfig-base.json +1 -0
package/javascript/tsconfig-cjs.json +6 -1
package/javascript/types.ts +110 -0
package/numkong.gypi +101 -0
package/package.json +34 -13
package/probes/arm_neon.c +8 -0
package/probes/arm_neon_bfdot.c +9 -0
package/probes/arm_neon_fhm.c +9 -0
package/probes/arm_neon_half.c +8 -0
package/probes/arm_neon_sdot.c +9 -0
package/probes/arm_neonfp8.c +9 -0
package/probes/arm_sme.c +16 -0
package/probes/arm_sme2.c +16 -0
package/probes/arm_sme2p1.c +16 -0
package/probes/arm_sme_bf16.c +16 -0
package/probes/arm_sme_bi32.c +16 -0
package/probes/arm_sme_f64.c +16 -0
package/probes/arm_sme_fa64.c +14 -0
package/probes/arm_sme_half.c +16 -0
package/probes/arm_sme_lut2.c +15 -0
package/probes/arm_sve.c +18 -0
package/probes/arm_sve2.c +20 -0
package/probes/arm_sve2p1.c +18 -0
package/probes/arm_sve_bfdot.c +20 -0
package/probes/arm_sve_half.c +18 -0
package/probes/arm_sve_sdot.c +21 -0
package/probes/loongarch_lasx.c +12 -0
package/probes/power_vsx.c +12 -0
package/probes/probe.js +127 -0
package/probes/riscv_rvv.c +14 -0
package/probes/riscv_rvv_bb.c +15 -0
package/probes/riscv_rvv_bf16.c +17 -0
package/probes/riscv_rvv_half.c +14 -0
package/probes/wasm_v128relaxed.c +11 -0
package/probes/x86_alder.c +17 -0
package/probes/x86_diamond.c +17 -0
package/probes/x86_genoa.c +17 -0
package/probes/x86_graniteamx.c +19 -0
package/probes/x86_haswell.c +11 -0
package/probes/x86_icelake.c +17 -0
package/probes/x86_sapphire.c +16 -0
package/probes/x86_sapphireamx.c +18 -0
package/probes/x86_sierra.c +17 -0
package/probes/x86_skylake.c +15 -0
package/probes/x86_turin.c +17 -0
package/wasm/numkong-emscripten.js +2 -0
package/wasm/numkong.d.ts +14 -0
package/wasm/numkong.js +1124 -0
package/wasm/numkong.wasm +0 -0
package/include/numkong/curved/neonhalf.h +0 -212
package/include/numkong/dot/neonhalf.h +0 -198
package/include/numkong/dots/neonhalf.h +0 -57
package/include/numkong/mesh/neonhalf.h +0 -616
package/include/numkong/reduce/neonhalf.h +0 -157
package/include/numkong/spatial/neonhalf.h +0 -118
package/include/numkong/spatial/sapphire.h +0 -343
package/include/numkong/spatials/neonhalf.h +0 -58
package/javascript/README.md +0 -246

package/include/numkong/trigonometry/skylake.h CHANGED

@@ -9,12 +9,12 @@
  *
  *  @section skylake_trig_instructions Key AVX-512 Trigonometry Instructions
  *
- *      Intrinsic                   Instruction                     Latency     Throughput  Ports
- *      _mm512_fmadd_ps             VFMADD132PS (ZMM, ZMM, ZMM)     4cy         0.5/cy      p05
- *      _mm512_mul_ps               VMULPS (ZMM, ZMM, ZMM)          4cy         0.5/cy      p05
- *      _mm512_and_ps               VANDPS (ZMM, ZMM, ZMM)          1cy         0.33/cy     p015
- *      _mm512_cmp_ps_mask          VCMPPS (K, ZMM, ZMM, I8)        3cy         1/cy        p01
- *      _mm512_roundscale_ps        VRNDSCALEPS (ZMM, ZMM, I8)      8cy         0.5/cy      p01
+ *      Intrinsic             Instruction                  Skylake-X      Genoa
+ *      _mm512_fmadd_ps       VFMADD132PS (ZMM, ZMM, ZMM)  4cy @ p05      4cy @ p01
+ *      _mm512_mul_ps         VMULPS (ZMM, ZMM, ZMM)       4cy @ p05      3cy @ p01
+ *      _mm512_and_ps         VANDPS (ZMM, ZMM, ZMM)       1cy @ p05      1cy @ p0123
+ *      _mm512_cmp_ps_mask    VCMPPS (K, ZMM, ZMM, I8)     4cy @ p5       5cy @ p01
+ *      _mm512_roundscale_ps  VRNDSCALEPS (ZMM, ZMM, I8)   8cy @ p05+p05  3cy @ p23
  *
  *  Trigonometric functions use polynomial approximations evaluated via Horner's method with FMA chains.
  *  AVX-512 mask registers enable branchless range reduction and sign handling without blend overhead.
@@ -42,394 +42,398 @@ extern "C" {
 NK_INTERNAL __m512 nk_sin_f32x16_skylake_(__m512 const angles_radians) {
     // Cody-Waite constants for argument reduction
-    __m512 const pi_hi_f32x16 = _mm512_set1_ps(3.1415927f);
-    __m512 const pi_lo_f32x16 = _mm512_set1_ps(-8.742278e-8f);
-    __m512 const pi_reciprocal = _mm512_set1_ps(0.31830988618379067154f); // 1/π
+    __m512 const pi_high_f32x16 = _mm512_set1_ps(3.1415927f);
+    __m512 const pi_low_f32x16 = _mm512_set1_ps(-8.742278e-8f);
+    __m512 const pi_reciprocal_f32x16 = _mm512_set1_ps(0.31830988618379067154f); // 1/π
     // Degree-9 minimax coefficients
-    __m512 const coeff_9 = _mm512_set1_ps(+2.7557319224e-6f);
-    __m512 const coeff_7 = _mm512_set1_ps(-1.9841269841e-4f);
-    __m512 const coeff_5 = _mm512_set1_ps(+8.3333293855e-3f);
-    __m512 const coeff_3 = _mm512_set1_ps(-1.6666666641e-1f);
-    // Compute (multiples_of_pi) = round(angle / π)
-    __m512 quotients = _mm512_mul_ps(angles_radians, pi_reciprocal);
-    __m512 rounded_quotients = _mm512_roundscale_ps(quotients, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+    __m512 const coeff_9_f32x16 = _mm512_set1_ps(+2.7557319224e-6f);
+    __m512 const coeff_7_f32x16 = _mm512_set1_ps(-1.9841269841e-4f);
+    __m512 const coeff_5_f32x16 = _mm512_set1_ps(+8.3333293855e-3f);
+    __m512 const coeff_3_f32x16 = _mm512_set1_ps(-1.6666666641e-1f);
+    // Compute (multiples_of_pi_i32x16) = round(angle / π)
+    __m512 quotients_f32x16 = _mm512_mul_ps(angles_radians, pi_reciprocal_f32x16);
+    __m512 rounded_quotients_f32x16 = _mm512_roundscale_ps(quotients_f32x16,
+                                                           _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
     // Use explicit rounding to match roundscale (MXCSR-independent)
-    __m512i multiples_of_pi = _mm512_cvt_roundps_epi32(rounded_quotients,
-                                                       _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+    __m512i multiples_of_pi_i32x16 = _mm512_cvt_roundps_epi32(rounded_quotients_f32x16,
+                                                              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
     // Cody-Waite range reduction
-    __m512 angles = _mm512_fnmadd_ps(rounded_quotients, pi_hi_f32x16, angles_radians);
-    angles = _mm512_fnmadd_ps(rounded_quotients, pi_lo_f32x16, angles);
-    __m512 const angles_squared = _mm512_mul_ps(angles, angles);
-    __m512 const angles_cubed = _mm512_mul_ps(angles, angles_squared);
+    __m512 angles_f32x16 = _mm512_fnmadd_ps(rounded_quotients_f32x16, pi_high_f32x16, angles_radians);
+    angles_f32x16 = _mm512_fnmadd_ps(rounded_quotients_f32x16, pi_low_f32x16, angles_f32x16);
+    __m512 const angles_squared_f32x16 = _mm512_mul_ps(angles_f32x16, angles_f32x16);
+    __m512 const angles_cubed_f32x16 = _mm512_mul_ps(angles_f32x16, angles_squared_f32x16);
     // Degree-9 polynomial via Horner's method
-    __m512 polynomials = coeff_9;
-    polynomials = _mm512_fmadd_ps(polynomials, angles_squared, coeff_7);
-    polynomials = _mm512_fmadd_ps(polynomials, angles_squared, coeff_5);
-    polynomials = _mm512_fmadd_ps(polynomials, angles_squared, coeff_3);
-    // If multiples_of_pi is odd, flip the sign of the results
-    __mmask16 odd_mask = _mm512_test_epi32_mask(multiples_of_pi, _mm512_set1_epi32(1));
-    __m512 results = _mm512_fmadd_ps(angles_cubed, polynomials, angles);
-    results = _mm512_mask_sub_ps(results, odd_mask, _mm512_setzero_ps(), results);
-    return results;
+    __m512 polynomials_f32x16 = coeff_9_f32x16;
+    polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, angles_squared_f32x16, coeff_7_f32x16);
+    polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, angles_squared_f32x16, coeff_5_f32x16);
+    polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, angles_squared_f32x16, coeff_3_f32x16);
+    // If multiples_of_pi_i32x16 is odd, flip the sign of the results_f32x16
+    __mmask16 odd_mask = _mm512_test_epi32_mask(multiples_of_pi_i32x16, _mm512_set1_epi32(1));
+    __m512 results_f32x16 = _mm512_fmadd_ps(angles_cubed_f32x16, polynomials_f32x16, angles_f32x16);
+    results_f32x16 = _mm512_mask_sub_ps(results_f32x16, odd_mask, _mm512_setzero_ps(), results_f32x16);
+    return results_f32x16;
 }
 NK_INTERNAL __m512 nk_cos_f32x16_skylake_(__m512 const angles_radians) {
     // Cody-Waite constants for argument reduction
-    __m512 const pi_hi_f32x16 = _mm512_set1_ps(3.1415927f);
-    __m512 const pi_lo_f32x16 = _mm512_set1_ps(-8.742278e-8f);
-    __m512 const pi_half = _mm512_set1_ps(1.57079632679489661923f);       // π/2
-    __m512 const pi_reciprocal = _mm512_set1_ps(0.31830988618379067154f); // 1/π
+    __m512 const pi_high_f32x16 = _mm512_set1_ps(3.1415927f);
+    __m512 const pi_low_f32x16 = _mm512_set1_ps(-8.742278e-8f);
+    __m512 const pi_half_f32x16 = _mm512_set1_ps(1.57079632679489661923f);       // π/2
+    __m512 const pi_reciprocal_f32x16 = _mm512_set1_ps(0.31830988618379067154f); // 1/π
     // Degree-9 minimax coefficients
-    __m512 const coeff_9 = _mm512_set1_ps(+2.7557319224e-6f);
-    __m512 const coeff_7 = _mm512_set1_ps(-1.9841269841e-4f);
-    __m512 const coeff_5 = _mm512_set1_ps(+8.3333293855e-3f);
-    __m512 const coeff_3 = _mm512_set1_ps(-1.6666666641e-1f);
-    // Compute (multiples_of_pi) = round((angle / π) - 0.5)
-    __m512 quotients = _mm512_fmsub_ps(angles_radians, pi_reciprocal, _mm512_set1_ps(0.5f));
-    __m512 rounded_quotients = _mm512_roundscale_ps(quotients, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+    __m512 const coeff_9_f32x16 = _mm512_set1_ps(+2.7557319224e-6f);
+    __m512 const coeff_7_f32x16 = _mm512_set1_ps(-1.9841269841e-4f);
+    __m512 const coeff_5_f32x16 = _mm512_set1_ps(+8.3333293855e-3f);
+    __m512 const coeff_3_f32x16 = _mm512_set1_ps(-1.6666666641e-1f);
+    // Compute (multiples_of_pi_i32x16) = round((angle / π) - 0.5)
+    __m512 quotients_f32x16 = _mm512_fmsub_ps(angles_radians, pi_reciprocal_f32x16, _mm512_set1_ps(0.5f));
+    __m512 rounded_quotients_f32x16 = _mm512_roundscale_ps(quotients_f32x16,
+                                                           _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
     // Use explicit rounding to match roundscale (MXCSR-independent)
-    __m512i multiples_of_pi = _mm512_cvt_roundps_epi32(rounded_quotients,
-                                                       _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+    __m512i multiples_of_pi_i32x16 = _mm512_cvt_roundps_epi32(rounded_quotients_f32x16,
+                                                              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
     // Cody-Waite range reduction: angle = angle_radians - (multiples * pi + pi/2)
-    __m512 const offset = _mm512_fmadd_ps(rounded_quotients, pi_hi_f32x16, pi_half);
-    __m512 angles = _mm512_sub_ps(angles_radians, offset);
-    angles = _mm512_fnmadd_ps(rounded_quotients, pi_lo_f32x16, angles);
-    __m512 const angles_squared = _mm512_mul_ps(angles, angles);
-    __m512 const angles_cubed = _mm512_mul_ps(angles, angles_squared);
+    __m512 const offset_f32x16 = _mm512_fmadd_ps(rounded_quotients_f32x16, pi_high_f32x16, pi_half_f32x16);
+    __m512 angles_f32x16 = _mm512_sub_ps(angles_radians, offset_f32x16);
+    angles_f32x16 = _mm512_fnmadd_ps(rounded_quotients_f32x16, pi_low_f32x16, angles_f32x16);
+    __m512 const angles_squared_f32x16 = _mm512_mul_ps(angles_f32x16, angles_f32x16);
+    __m512 const angles_cubed_f32x16 = _mm512_mul_ps(angles_f32x16, angles_squared_f32x16);
     // Degree-9 polynomial via Horner's method
-    __m512 polynomials = coeff_9;
-    polynomials = _mm512_fmadd_ps(polynomials, angles_squared, coeff_7);
-    polynomials = _mm512_fmadd_ps(polynomials, angles_squared, coeff_5);
-    polynomials = _mm512_fmadd_ps(polynomials, angles_squared, coeff_3);
-    __m512 results = _mm512_fmadd_ps(angles_cubed, polynomials, angles);
-    // If multiples_of_pi is even, flip the sign of the results
-    __mmask16 even_mask = _mm512_testn_epi32_mask(multiples_of_pi, _mm512_set1_epi32(1));
-    results = _mm512_mask_sub_ps(results, even_mask, _mm512_setzero_ps(), results);
-    return results;
+    __m512 polynomials_f32x16 = coeff_9_f32x16;
+    polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, angles_squared_f32x16, coeff_7_f32x16);
+    polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, angles_squared_f32x16, coeff_5_f32x16);
+    polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, angles_squared_f32x16, coeff_3_f32x16);
+    __m512 results_f32x16 = _mm512_fmadd_ps(angles_cubed_f32x16, polynomials_f32x16, angles_f32x16);
+    // If multiples_of_pi_i32x16 is even, flip the sign of the results_f32x16
+    __mmask16 even_mask = _mm512_testn_epi32_mask(multiples_of_pi_i32x16, _mm512_set1_epi32(1));
+    results_f32x16 = _mm512_mask_sub_ps(results_f32x16, even_mask, _mm512_setzero_ps(), results_f32x16);
+    return results_f32x16;
 }
 NK_INTERNAL __m512 nk_atan_f32x16_skylake_(__m512 const inputs) {
     // Polynomial coefficients
-    __m512 const coeff_8 = _mm512_set1_ps(-0.333331018686294555664062f);
-    __m512 const coeff_7 = _mm512_set1_ps(+0.199926957488059997558594f);
-    __m512 const coeff_6 = _mm512_set1_ps(-0.142027363181114196777344f);
-    __m512 const coeff_5 = _mm512_set1_ps(+0.106347933411598205566406f);
-    __m512 const coeff_4 = _mm512_set1_ps(-0.0748900920152664184570312f);
-    __m512 const coeff_3 = _mm512_set1_ps(+0.0425049886107444763183594f);
-    __m512 const coeff_2 = _mm512_set1_ps(-0.0159569028764963150024414f);
-    __m512 const coeff_1 = _mm512_set1_ps(+0.00282363896258175373077393f);
+    __m512 const coeff_8_f32x16 = _mm512_set1_ps(-0.333331018686294555664062f);
+    __m512 const coeff_7_f32x16 = _mm512_set1_ps(+0.199926957488059997558594f);
+    __m512 const coeff_6_f32x16 = _mm512_set1_ps(-0.142027363181114196777344f);
+    __m512 const coeff_5_f32x16 = _mm512_set1_ps(+0.106347933411598205566406f);
+    __m512 const coeff_4_f32x16 = _mm512_set1_ps(-0.0748900920152664184570312f);
+    __m512 const coeff_3_f32x16 = _mm512_set1_ps(+0.0425049886107444763183594f);
+    __m512 const coeff_2_f32x16 = _mm512_set1_ps(-0.0159569028764963150024414f);
+    __m512 const coeff_1_f32x16 = _mm512_set1_ps(+0.00282363896258175373077393f);
     // Adjust for quadrant
-    __m512 values = inputs;
-    __mmask16 const negative_mask = _mm512_fpclass_ps_mask(values, 0x40);
-    values = _mm512_abs_ps(values);
-    __mmask16 const reciprocal_mask = _mm512_cmp_ps_mask(values, _mm512_set1_ps(1.0f), _CMP_GT_OS);
-    values = _mm512_mask_div_ps(values, reciprocal_mask, _mm512_set1_ps(1.0f), values);
+    __m512 values_f32x16 = inputs;
+    __mmask16 const negative_mask = _mm512_fpclass_ps_mask(values_f32x16, 0x40);
+    values_f32x16 = _mm512_abs_ps(values_f32x16);
+    __mmask16 const reciprocal_mask = _mm512_cmp_ps_mask(values_f32x16, _mm512_set1_ps(1.0f), _CMP_GT_OS);
+    values_f32x16 = _mm512_mask_div_ps(values_f32x16, reciprocal_mask, _mm512_set1_ps(1.0f), values_f32x16);
     // Argument reduction
-    __m512 const values_squared = _mm512_mul_ps(values, values);
-    __m512 const values_cubed = _mm512_mul_ps(values, values_squared);
+    __m512 const values_squared_f32x16 = _mm512_mul_ps(values_f32x16, values_f32x16);
+    __m512 const values_cubed_f32x16 = _mm512_mul_ps(values_f32x16, values_squared_f32x16);
     // Polynomial evaluation
-    __m512 polynomials = coeff_1;
-    polynomials = _mm512_fmadd_ps(polynomials, values_squared, coeff_2);
-    polynomials = _mm512_fmadd_ps(polynomials, values_squared, coeff_3);
-    polynomials = _mm512_fmadd_ps(polynomials, values_squared, coeff_4);
-    polynomials = _mm512_fmadd_ps(polynomials, values_squared, coeff_5);
-    polynomials = _mm512_fmadd_ps(polynomials, values_squared, coeff_6);
-    polynomials = _mm512_fmadd_ps(polynomials, values_squared, coeff_7);
-    polynomials = _mm512_fmadd_ps(polynomials, values_squared, coeff_8);
-    // Adjust result for quadrants
-    __m512 result = _mm512_fmadd_ps(values_cubed, polynomials, values);
-    result = _mm512_mask_sub_ps(result, reciprocal_mask, _mm512_set1_ps(1.5707963267948966f), result);
-    result = _mm512_mask_sub_ps(result, negative_mask, _mm512_setzero_ps(), result);
-    return result;
+    __m512 polynomials_f32x16 = coeff_1_f32x16;
+    polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, values_squared_f32x16, coeff_2_f32x16);
+    polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, values_squared_f32x16, coeff_3_f32x16);
+    polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, values_squared_f32x16, coeff_4_f32x16);
+    polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, values_squared_f32x16, coeff_5_f32x16);
+    polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, values_squared_f32x16, coeff_6_f32x16);
+    polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, values_squared_f32x16, coeff_7_f32x16);
+    polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, values_squared_f32x16, coeff_8_f32x16);
+    // Adjust result_f32x16 for quadrants
+    __m512 result_f32x16 = _mm512_fmadd_ps(values_cubed_f32x16, polynomials_f32x16, values_f32x16);
+    result_f32x16 = _mm512_mask_sub_ps(result_f32x16, reciprocal_mask, _mm512_set1_ps(1.5707963267948966f),
+                                       result_f32x16);
+    result_f32x16 = _mm512_mask_sub_ps(result_f32x16, negative_mask, _mm512_setzero_ps(), result_f32x16);
+    return result_f32x16;
 }
 NK_INTERNAL __m512 nk_atan2_f32x16_skylake_(__m512 const ys_inputs, __m512 const xs_inputs) {
     // Polynomial coefficients
-    __m512 const coeff_8 = _mm512_set1_ps(-0.333331018686294555664062f);
-    __m512 const coeff_7 = _mm512_set1_ps(+0.199926957488059997558594f);
-    __m512 const coeff_6 = _mm512_set1_ps(-0.142027363181114196777344f);
-    __m512 const coeff_5 = _mm512_set1_ps(+0.106347933411598205566406f);
-    __m512 const coeff_4 = _mm512_set1_ps(-0.0748900920152664184570312f);
-    __m512 const coeff_3 = _mm512_set1_ps(+0.0425049886107444763183594f);
-    __m512 const coeff_2 = _mm512_set1_ps(-0.0159569028764963150024414f);
-    __m512 const coeff_1 = _mm512_set1_ps(+0.00282363896258175373077393f);
+    __m512 const coeff_8_f32x16 = _mm512_set1_ps(-0.333331018686294555664062f);
+    __m512 const coeff_7_f32x16 = _mm512_set1_ps(+0.199926957488059997558594f);
+    __m512 const coeff_6_f32x16 = _mm512_set1_ps(-0.142027363181114196777344f);
+    __m512 const coeff_5_f32x16 = _mm512_set1_ps(+0.106347933411598205566406f);
+    __m512 const coeff_4_f32x16 = _mm512_set1_ps(-0.0748900920152664184570312f);
+    __m512 const coeff_3_f32x16 = _mm512_set1_ps(+0.0425049886107444763183594f);
+    __m512 const coeff_2_f32x16 = _mm512_set1_ps(-0.0159569028764963150024414f);
+    __m512 const coeff_1_f32x16 = _mm512_set1_ps(+0.00282363896258175373077393f);
     // Quadrant adjustments normalizing to absolute values of x and y
     __mmask16 const xs_negative_mask = _mm512_fpclass_ps_mask(xs_inputs, 0x40);
-    __m512 xs = _mm512_abs_ps(xs_inputs);
-    __m512 ys = _mm512_abs_ps(ys_inputs);
+    __m512 xs_f32x16 = _mm512_abs_ps(xs_inputs);
+    __m512 ys_f32x16 = _mm512_abs_ps(ys_inputs);
     // Ensure proper fraction where the numerator is smaller than the denominator
-    __mmask16 const swap_mask = _mm512_cmp_ps_mask(ys, xs, _CMP_GT_OS);
-    __m512 temps = xs;
-    xs = _mm512_mask_blend_ps(swap_mask, xs, ys);
-    ys = _mm512_mask_sub_ps(ys, swap_mask, _mm512_setzero_ps(), temps);
+    __mmask16 const swap_mask = _mm512_cmp_ps_mask(ys_f32x16, xs_f32x16, _CMP_GT_OS);
+    __m512 temps_f32x16 = xs_f32x16;
+    xs_f32x16 = _mm512_mask_blend_ps(swap_mask, xs_f32x16, ys_f32x16);
+    ys_f32x16 = _mm512_mask_sub_ps(ys_f32x16, swap_mask, _mm512_setzero_ps(), temps_f32x16);
-    // Compute ratio and ratio²
-    __m512 const ratio = _mm512_div_ps(ys, xs);
-    __m512 const ratio_squared = _mm512_mul_ps(ratio, ratio);
-    __m512 const ratio_cubed = _mm512_mul_ps(ratio, ratio_squared);
+    // Compute ratio_f32x16 and ratio²
+    __m512 const ratio_f32x16 = _mm512_div_ps(ys_f32x16, xs_f32x16);
+    __m512 const ratio_squared_f32x16 = _mm512_mul_ps(ratio_f32x16, ratio_f32x16);
+    __m512 const ratio_cubed_f32x16 = _mm512_mul_ps(ratio_f32x16, ratio_squared_f32x16);
     // Polynomial evaluation
-    __m512 polynomials = coeff_1;
-    polynomials = _mm512_fmadd_ps(polynomials, ratio_squared, coeff_2);
-    polynomials = _mm512_fmadd_ps(polynomials, ratio_squared, coeff_3);
-    polynomials = _mm512_fmadd_ps(polynomials, ratio_squared, coeff_4);
-    polynomials = _mm512_fmadd_ps(polynomials, ratio_squared, coeff_5);
-    polynomials = _mm512_fmadd_ps(polynomials, ratio_squared, coeff_6);
-    polynomials = _mm512_fmadd_ps(polynomials, ratio_squared, coeff_7);
-    polynomials = _mm512_fmadd_ps(polynomials, ratio_squared, coeff_8);
-    // Compute quadrant value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
+    __m512 polynomials_f32x16 = coeff_1_f32x16;
+    polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, ratio_squared_f32x16, coeff_2_f32x16);
+    polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, ratio_squared_f32x16, coeff_3_f32x16);
+    polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, ratio_squared_f32x16, coeff_4_f32x16);
+    polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, ratio_squared_f32x16, coeff_5_f32x16);
+    polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, ratio_squared_f32x16, coeff_6_f32x16);
+    polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, ratio_squared_f32x16, coeff_7_f32x16);
+    polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, ratio_squared_f32x16, coeff_8_f32x16);
+    // Compute quadrant_f32x16 value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
     //                        -2 for x<0 && !swap, -1 for x<0 && swap
-    __m512 results = _mm512_fmadd_ps(ratio_cubed, polynomials, ratio);
-    __m512 quadrant = _mm512_setzero_ps();
-    __m512 neg_two = _mm512_set1_ps(-2.0f);
-    quadrant = _mm512_mask_blend_ps(xs_negative_mask, quadrant, neg_two);
-    __m512 one = _mm512_set1_ps(1.0f);
-    __m512 quadrant_incremented = _mm512_add_ps(quadrant, one);
-    quadrant = _mm512_mask_blend_ps(swap_mask, quadrant, quadrant_incremented);
-    // Adjust for quadrant: result += quadrant * π/2
-    __m512 pi_half = _mm512_set1_ps(1.5707963267948966f);
-    results = _mm512_fmadd_ps(quadrant, pi_half, results);
+    __m512 results_f32x16 = _mm512_fmadd_ps(ratio_cubed_f32x16, polynomials_f32x16, ratio_f32x16);
+    __m512 quadrant_f32x16 = _mm512_setzero_ps();
+    __m512 neg_two_f32x16 = _mm512_set1_ps(-2.0f);
+    quadrant_f32x16 = _mm512_mask_blend_ps(xs_negative_mask, quadrant_f32x16, neg_two_f32x16);
+    __m512 one_f32x16 = _mm512_set1_ps(1.0f);
+    __m512 quadrant_incremented_f32x16 = _mm512_add_ps(quadrant_f32x16, one_f32x16);
+    quadrant_f32x16 = _mm512_mask_blend_ps(swap_mask, quadrant_f32x16, quadrant_incremented_f32x16);
+    // Adjust for quadrant_f32x16: result += quadrant_f32x16 * π/2
+    __m512 pi_half_f32x16 = _mm512_set1_ps(1.5707963267948966f);
+    results_f32x16 = _mm512_fmadd_ps(quadrant_f32x16, pi_half_f32x16, results_f32x16);
     // Transfer sign from x (XOR with sign bit of x_input)
-    __m512 xs_sign_bits = _mm512_and_ps(xs_inputs, _mm512_set1_ps(-0.0f));
-    results = _mm512_xor_ps(results, xs_sign_bits);
+    __m512 xs_sign_bits_f32x16 = _mm512_and_ps(xs_inputs, _mm512_set1_ps(-0.0f));
+    results_f32x16 = _mm512_xor_ps(results_f32x16, xs_sign_bits_f32x16);
     // Transfer sign from y (XOR with sign bit of y_input)
-    __m512 ys_sign_bits = _mm512_and_ps(ys_inputs, _mm512_set1_ps(-0.0f));
-    results = _mm512_xor_ps(results, ys_sign_bits);
+    __m512 ys_sign_bits_f32x16 = _mm512_and_ps(ys_inputs, _mm512_set1_ps(-0.0f));
+    results_f32x16 = _mm512_xor_ps(results_f32x16, ys_sign_bits_f32x16);
-    return results;
+    return results_f32x16;
 }
 NK_PUBLIC void nk_each_sin_f32_skylake(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
     nk_size_t i = 0;
     for (; i + 16 <= n; i += 16) {
-        __m512 angles = _mm512_loadu_ps(ins + i);
-        __m512 results = nk_sin_f32x16_skylake_(angles);
-        _mm512_storeu_ps(outs + i, results);
+        __m512 angles_f32x16 = _mm512_loadu_ps(ins + i);
+        __m512 results_f32x16 = nk_sin_f32x16_skylake_(angles_f32x16);
+        _mm512_storeu_ps(outs + i, results_f32x16);
     }
     if (i < n) {
         __mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, n - i);
-        __m512 angles = _mm512_maskz_loadu_ps(mask, ins + i);
-        __m512 results = nk_sin_f32x16_skylake_(angles);
-        _mm512_mask_storeu_ps(outs + i, mask, results);
+        __m512 angles_f32x16 = _mm512_maskz_loadu_ps(mask, ins + i);
+        __m512 results_f32x16 = nk_sin_f32x16_skylake_(angles_f32x16);
+        _mm512_mask_storeu_ps(outs + i, mask, results_f32x16);
     }
 }
 NK_PUBLIC void nk_each_cos_f32_skylake(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
     nk_size_t i = 0;
     for (; i + 16 <= n; i += 16) {
-        __m512 angles = _mm512_loadu_ps(ins + i);
-        __m512 results = nk_cos_f32x16_skylake_(angles);
-        _mm512_storeu_ps(outs + i, results);
+        __m512 angles_f32x16 = _mm512_loadu_ps(ins + i);
+        __m512 results_f32x16 = nk_cos_f32x16_skylake_(angles_f32x16);
+        _mm512_storeu_ps(outs + i, results_f32x16);
     }
     if (i < n) {
         __mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, n - i);
-        __m512 angles = _mm512_maskz_loadu_ps(mask, ins + i);
-        __m512 results = nk_cos_f32x16_skylake_(angles);
-        _mm512_mask_storeu_ps(outs + i, mask, results);
+        __m512 angles_f32x16 = _mm512_maskz_loadu_ps(mask, ins + i);
+        __m512 results_f32x16 = nk_cos_f32x16_skylake_(angles_f32x16);
+        _mm512_mask_storeu_ps(outs + i, mask, results_f32x16);
     }
 }
 NK_PUBLIC void nk_each_atan_f32_skylake(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
     nk_size_t i = 0;
     for (; i + 16 <= n; i += 16) {
-        __m512 angles = _mm512_loadu_ps(ins + i);
-        __m512 results = nk_atan_f32x16_skylake_(angles);
-        _mm512_storeu_ps(outs + i, results);
+        __m512 angles_f32x16 = _mm512_loadu_ps(ins + i);
+        __m512 results_f32x16 = nk_atan_f32x16_skylake_(angles_f32x16);
+        _mm512_storeu_ps(outs + i, results_f32x16);
     }
     if (i < n) {
         __mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, n - i);
-        __m512 angles = _mm512_maskz_loadu_ps(mask, ins + i);
-        __m512 results = nk_atan_f32x16_skylake_(angles);
-        _mm512_mask_storeu_ps(outs + i, mask, results);
+        __m512 angles_f32x16 = _mm512_maskz_loadu_ps(mask, ins + i);
+        __m512 results_f32x16 = nk_atan_f32x16_skylake_(angles_f32x16);
+        _mm512_mask_storeu_ps(outs + i, mask, results_f32x16);
     }
 }
 NK_INTERNAL __m512d nk_sin_f64x8_skylake_(__m512d const angles_radians) {
     // Constants for argument reduction
-    __m512d const pi_high = _mm512_set1_pd(3.141592653589793116);         // High-digits part of π
-    __m512d const pi_low = _mm512_set1_pd(1.2246467991473532072e-16);     // Low-digits part of π
-    __m512d const pi_reciprocal = _mm512_set1_pd(0.31830988618379067154); // 1/π
+    __m512d const pi_high_f64x8 = _mm512_set1_pd(3.141592653589793116);         // High-digits part of π
+    __m512d const pi_low_f64x8 = _mm512_set1_pd(1.2246467991473532072e-16);     // Low-digits part of π
+    __m512d const pi_reciprocal_f64x8 = _mm512_set1_pd(0.31830988618379067154); // 1/π
     // Polynomial coefficients for sine/cosine approximation (minimax polynomial)
-    __m512d const coeff_0 = _mm512_set1_pd(+0.00833333333333332974823815);
-    __m512d const coeff_1 = _mm512_set1_pd(-0.000198412698412696162806809);
-    __m512d const coeff_2 = _mm512_set1_pd(+2.75573192239198747630416e-06);
-    __m512d const coeff_3 = _mm512_set1_pd(-2.50521083763502045810755e-08);
-    __m512d const coeff_4 = _mm512_set1_pd(+1.60590430605664501629054e-10);
-    __m512d const coeff_5 = _mm512_set1_pd(-7.64712219118158833288484e-13);
-    __m512d const coeff_6 = _mm512_set1_pd(+2.81009972710863200091251e-15);
-    __m512d const coeff_7 = _mm512_set1_pd(-7.97255955009037868891952e-18);
-    __m512d const coeff_8 = _mm512_set1_pd(-0.166666666666666657414808);
-    // Compute (rounded_quotients) = round(angle / π)
-    __m512d const quotients = _mm512_mul_pd(angles_radians, pi_reciprocal);
-    __m512d const rounded_quotients = _mm512_roundscale_pd(quotients, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-    // Reduce the angle to: angle - (rounded_quotients * π_high + rounded_quotients * π_low)
-    __m512d angles = angles_radians;
-    angles = _mm512_fnmadd_pd(rounded_quotients, pi_high, angles);
-    angles = _mm512_fnmadd_pd(rounded_quotients, pi_low, angles);
-    // If rounded_quotients is odd (bit 0 set), negate the angle
+    __m512d const coeff_0_f64x8 = _mm512_set1_pd(+0.00833333333333332974823815);
+    __m512d const coeff_1_f64x8 = _mm512_set1_pd(-0.000198412698412696162806809);
+    __m512d const coeff_2_f64x8 = _mm512_set1_pd(+2.75573192239198747630416e-06);
+    __m512d const coeff_3_f64x8 = _mm512_set1_pd(-2.50521083763502045810755e-08);
+    __m512d const coeff_4_f64x8 = _mm512_set1_pd(+1.60590430605664501629054e-10);
+    __m512d const coeff_5_f64x8 = _mm512_set1_pd(-7.64712219118158833288484e-13);
+    __m512d const coeff_6_f64x8 = _mm512_set1_pd(+2.81009972710863200091251e-15);
+    __m512d const coeff_7_f64x8 = _mm512_set1_pd(-7.97255955009037868891952e-18);
+    __m512d const coeff_8_f64x8 = _mm512_set1_pd(-0.166666666666666657414808);
+    // Compute (rounded_quotients_f64x8) = round(angle / π)
+    __m512d const quotients_f64x8 = _mm512_mul_pd(angles_radians, pi_reciprocal_f64x8);
+    __m512d const rounded_quotients_f64x8 = _mm512_roundscale_pd(quotients_f64x8,
+                                                                 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+    // Reduce the angle to: angle - (rounded_quotients_f64x8 * π_high + rounded_quotients_f64x8 * π_low)
+    __m512d angles_f64x8 = angles_radians;
+    angles_f64x8 = _mm512_fnmadd_pd(rounded_quotients_f64x8, pi_high_f64x8, angles_f64x8);
+    angles_f64x8 = _mm512_fnmadd_pd(rounded_quotients_f64x8, pi_low_f64x8, angles_f64x8);
+    // If rounded_quotients_f64x8 is odd (bit 0 set), negate the angle
     // Use explicit rounding to match roundscale (MXCSR-independent)
     __mmask8 const sign_flip_mask = _mm256_test_epi32_mask(
-        _mm512_cvt_roundpd_epi32(rounded_quotients, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC),
+        _mm512_cvt_roundpd_epi32(rounded_quotients_f64x8, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC),
         _mm256_set1_epi32(1));
-    angles = _mm512_mask_sub_pd(angles, sign_flip_mask, _mm512_setzero_pd(), angles);
+    angles_f64x8 = _mm512_mask_sub_pd(angles_f64x8, sign_flip_mask, _mm512_setzero_pd(), angles_f64x8);
-    __m512d const angles_squared = _mm512_mul_pd(angles, angles);
-    __m512d const angles_cubed = _mm512_mul_pd(angles, angles_squared);
-    __m512d const angles_quadratic = _mm512_mul_pd(angles_squared, angles_squared);
-    __m512d const angles_octic = _mm512_mul_pd(angles_quadratic, angles_quadratic);
+    __m512d const angles_squared_f64x8 = _mm512_mul_pd(angles_f64x8, angles_f64x8);
+    __m512d const angles_cubed_f64x8 = _mm512_mul_pd(angles_f64x8, angles_squared_f64x8);
+    __m512d const angles_quadratic_f64x8 = _mm512_mul_pd(angles_squared_f64x8, angles_squared_f64x8);
+    __m512d const angles_octic_f64x8 = _mm512_mul_pd(angles_quadratic_f64x8, angles_quadratic_f64x8);
     // Compute higher-degree polynomial terms
-    __m512d const poly_67 = _mm512_fmadd_pd(angles_squared, coeff_7, coeff_6);
-    __m512d const poly_45 = _mm512_fmadd_pd(angles_squared, coeff_5, coeff_4);
-    __m512d const poly_4567 = _mm512_fmadd_pd(angles_quadratic, poly_67, poly_45);
+    __m512d const poly_67_f64x8 = _mm512_fmadd_pd(angles_squared_f64x8, coeff_7_f64x8, coeff_6_f64x8);
+    __m512d const poly_45_f64x8 = _mm512_fmadd_pd(angles_squared_f64x8, coeff_5_f64x8, coeff_4_f64x8);
+    __m512d const poly_4567_f64x8 = _mm512_fmadd_pd(angles_quadratic_f64x8, poly_67_f64x8, poly_45_f64x8);
     // Compute lower-degree polynomial terms
-    __m512d const poly_23 = _mm512_fmadd_pd(angles_squared, coeff_3, coeff_2);
-    __m512d const poly_01 = _mm512_fmadd_pd(angles_squared, coeff_1, coeff_0);
-    __m512d const poly_0123 = _mm512_fmadd_pd(angles_quadratic, poly_23, poly_01);
+    __m512d const poly_23_f64x8 = _mm512_fmadd_pd(angles_squared_f64x8, coeff_3_f64x8, coeff_2_f64x8);
+    __m512d const poly_01_f64x8 = _mm512_fmadd_pd(angles_squared_f64x8, coeff_1_f64x8, coeff_0_f64x8);
+    __m512d const poly_0123_f64x8 = _mm512_fmadd_pd(angles_quadratic_f64x8, poly_23_f64x8, poly_01_f64x8);
     // Combine polynomial terms
-    __m512d results = _mm512_fmadd_pd(angles_octic, poly_4567, poly_0123);
-    results = _mm512_fmadd_pd(results, angles_squared, coeff_8);
-    results = _mm512_fmadd_pd(results, angles_cubed, angles);
+    __m512d results_f64x8 = _mm512_fmadd_pd(angles_octic_f64x8, poly_4567_f64x8, poly_0123_f64x8);
+    results_f64x8 = _mm512_fmadd_pd(results_f64x8, angles_squared_f64x8, coeff_8_f64x8);
+    results_f64x8 = _mm512_fmadd_pd(results_f64x8, angles_cubed_f64x8, angles_f64x8);
     // Handle the special case of negative zero input
     __mmask8 const non_zero_mask = _mm512_cmpneq_pd_mask(angles_radians, _mm512_setzero_pd());
-    results = _mm512_maskz_mov_pd(non_zero_mask, results);
-    return results;
+    results_f64x8 = _mm512_maskz_mov_pd(non_zero_mask, results_f64x8);
+    return results_f64x8;
 }
 NK_INTERNAL __m512d nk_cos_f64x8_skylake_(__m512d const angles_radians) {
     // Constants for argument reduction
-    __m512d const pi_high_half = _mm512_set1_pd(3.141592653589793116 * 0.5);     // High-digits part of π
-    __m512d const pi_low_half = _mm512_set1_pd(1.2246467991473532072e-16 * 0.5); // Low-digits part of π
-    __m512d const pi_reciprocal = _mm512_set1_pd(0.31830988618379067154);        // 1/π
+    __m512d const pi_high_half_f64x8 = _mm512_set1_pd(3.141592653589793116 * 0.5);     // High-digits part of π
+    __m512d const pi_low_half_f64x8 = _mm512_set1_pd(1.2246467991473532072e-16 * 0.5); // Low-digits part of π
+    __m512d const pi_reciprocal_f64x8 = _mm512_set1_pd(0.31830988618379067154);        // 1/π
     // Polynomial coefficients for sine/cosine approximation (minimax polynomial)
-    __m512d const coeff_0 = _mm512_set1_pd(+0.00833333333333332974823815);
-    __m512d const coeff_1 = _mm512_set1_pd(-0.000198412698412696162806809);
-    __m512d const coeff_2 = _mm512_set1_pd(+2.75573192239198747630416e-06);
-    __m512d const coeff_3 = _mm512_set1_pd(-2.50521083763502045810755e-08);
-    __m512d const coeff_4 = _mm512_set1_pd(+1.60590430605664501629054e-10);
-    __m512d const coeff_5 = _mm512_set1_pd(-7.64712219118158833288484e-13);
-    __m512d const coeff_6 = _mm512_set1_pd(+2.81009972710863200091251e-15);
-    __m512d const coeff_7 = _mm512_set1_pd(-7.97255955009037868891952e-18);
-    __m512d const coeff_8 = _mm512_set1_pd(-0.166666666666666657414808);
-    // Compute (rounded_quotients) = 2 * round(angle / π - 0.5) + 1
-    // Use fmsub: a*b - c = angles * (1/π) - 0.5
-    __m512d const quotients = _mm512_fmsub_pd(angles_radians, pi_reciprocal, _mm512_set1_pd(0.5));
-    __m512d const rounded_quotients = _mm512_fmadd_pd(                                  //
-        _mm512_set1_pd(2),                                                              //
-        _mm512_roundscale_pd(quotients, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), //
+    __m512d const coeff_0_f64x8 = _mm512_set1_pd(+0.00833333333333332974823815);
+    __m512d const coeff_1_f64x8 = _mm512_set1_pd(-0.000198412698412696162806809);
+    __m512d const coeff_2_f64x8 = _mm512_set1_pd(+2.75573192239198747630416e-06);
+    __m512d const coeff_3_f64x8 = _mm512_set1_pd(-2.50521083763502045810755e-08);
+    __m512d const coeff_4_f64x8 = _mm512_set1_pd(+1.60590430605664501629054e-10);
+    __m512d const coeff_5_f64x8 = _mm512_set1_pd(-7.64712219118158833288484e-13);
+    __m512d const coeff_6_f64x8 = _mm512_set1_pd(+2.81009972710863200091251e-15);
+    __m512d const coeff_7_f64x8 = _mm512_set1_pd(-7.97255955009037868891952e-18);
+    __m512d const coeff_8_f64x8 = _mm512_set1_pd(-0.166666666666666657414808);
+    // Compute (rounded_quotients_f64x8) = 2 * round(angle / π - 0.5) + 1
+    // Use fmsub: a*b - c = angles_f64x8 * (1/π) - 0.5
+    __m512d const quotients_f64x8 = _mm512_fmsub_pd(angles_radians, pi_reciprocal_f64x8, _mm512_set1_pd(0.5));
+    __m512d const rounded_quotients_f64x8 = _mm512_fmadd_pd(                                  //
+        _mm512_set1_pd(2),                                                                    //
+        _mm512_roundscale_pd(quotients_f64x8, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), //
         _mm512_set1_pd(1));
-    // Reduce the angle to: angle - (rounded_quotients * π_high + rounded_quotients * π_low)
-    __m512d angles = angles_radians;
-    angles = _mm512_fnmadd_pd(rounded_quotients, pi_high_half, angles);
-    angles = _mm512_fnmadd_pd(rounded_quotients, pi_low_half, angles);
+    // Reduce the angle to: angle - (rounded_quotients_f64x8 * π_high + rounded_quotients_f64x8 * π_low)
+    __m512d angles_f64x8 = angles_radians;
+    angles_f64x8 = _mm512_fnmadd_pd(rounded_quotients_f64x8, pi_high_half_f64x8, angles_f64x8);
+    angles_f64x8 = _mm512_fnmadd_pd(rounded_quotients_f64x8, pi_low_half_f64x8, angles_f64x8);
     // Use explicit rounding to match roundscale (MXCSR-independent)
     __mmask8 const sign_flip_mask = _mm256_testn_epi32_mask(
-        _mm512_cvt_roundpd_epi32(rounded_quotients, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC),
+        _mm512_cvt_roundpd_epi32(rounded_quotients_f64x8, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC),
         _mm256_set1_epi32(2));
-    angles = _mm512_mask_sub_pd(angles, sign_flip_mask, _mm512_setzero_pd(), angles);
-    __m512d const angles_squared = _mm512_mul_pd(angles, angles);
-    __m512d const angles_cubed = _mm512_mul_pd(angles, angles_squared);
-    __m512d const angles_quadratic = _mm512_mul_pd(angles_squared, angles_squared);
-    __m512d const angles_octic = _mm512_mul_pd(angles_quadratic, angles_quadratic);
+    angles_f64x8 = _mm512_mask_sub_pd(angles_f64x8, sign_flip_mask, _mm512_setzero_pd(), angles_f64x8);
+    __m512d const angles_squared_f64x8 = _mm512_mul_pd(angles_f64x8, angles_f64x8);
+    __m512d const angles_cubed_f64x8 = _mm512_mul_pd(angles_f64x8, angles_squared_f64x8);
+    __m512d const angles_quadratic_f64x8 = _mm512_mul_pd(angles_squared_f64x8, angles_squared_f64x8);
+    __m512d const angles_octic_f64x8 = _mm512_mul_pd(angles_quadratic_f64x8, angles_quadratic_f64x8);
     // Compute higher-degree polynomial terms
-    __m512d const poly_67 = _mm512_fmadd_pd(angles_squared, coeff_7, coeff_6);
-    __m512d const poly_45 = _mm512_fmadd_pd(angles_squared, coeff_5, coeff_4);
-    __m512d const poly_4567 = _mm512_fmadd_pd(angles_quadratic, poly_67, poly_45);
+    __m512d const poly_67_f64x8 = _mm512_fmadd_pd(angles_squared_f64x8, coeff_7_f64x8, coeff_6_f64x8);
+    __m512d const poly_45_f64x8 = _mm512_fmadd_pd(angles_squared_f64x8, coeff_5_f64x8, coeff_4_f64x8);
+    __m512d const poly_4567_f64x8 = _mm512_fmadd_pd(angles_quadratic_f64x8, poly_67_f64x8, poly_45_f64x8);
     // Compute lower-degree polynomial terms
-    __m512d const poly_23 = _mm512_fmadd_pd(angles_squared, coeff_3, coeff_2);
-    __m512d const poly_01 = _mm512_fmadd_pd(angles_squared, coeff_1, coeff_0);
-    __m512d const poly_0123 = _mm512_fmadd_pd(angles_quadratic, poly_23, poly_01);
+    __m512d const poly_23_f64x8 = _mm512_fmadd_pd(angles_squared_f64x8, coeff_3_f64x8, coeff_2_f64x8);
+    __m512d const poly_01_f64x8 = _mm512_fmadd_pd(angles_squared_f64x8, coeff_1_f64x8, coeff_0_f64x8);
+    __m512d const poly_0123_f64x8 = _mm512_fmadd_pd(angles_quadratic_f64x8, poly_23_f64x8, poly_01_f64x8);
     // Combine polynomial terms
-    __m512d results = _mm512_fmadd_pd(angles_octic, poly_4567, poly_0123);
-    results = _mm512_fmadd_pd(results, angles_squared, coeff_8);
-    results = _mm512_fmadd_pd(results, angles_cubed, angles);
-    return results;
+    __m512d results_f64x8 = _mm512_fmadd_pd(angles_octic_f64x8, poly_4567_f64x8, poly_0123_f64x8);
+    results_f64x8 = _mm512_fmadd_pd(results_f64x8, angles_squared_f64x8, coeff_8_f64x8);
+    results_f64x8 = _mm512_fmadd_pd(results_f64x8, angles_cubed_f64x8, angles_f64x8);
+    return results_f64x8;
 }
 NK_INTERNAL __m512d nk_atan_f64x8_skylake_(__m512d const inputs) {
     // Polynomial coefficients for atan approximation
-    __m512d const coeff_19 = _mm512_set1_pd(-1.88796008463073496563746e-05);
-    __m512d const coeff_18 = _mm512_set1_pd(+0.000209850076645816976906797);
-    __m512d const coeff_17 = _mm512_set1_pd(-0.00110611831486672482563471);
-    __m512d const coeff_16 = _mm512_set1_pd(+0.00370026744188713119232403);
-    __m512d const coeff_15 = _mm512_set1_pd(-0.00889896195887655491740809);
-    __m512d const coeff_14 = _mm512_set1_pd(+0.016599329773529201970117);
-    __m512d const coeff_13 = _mm512_set1_pd(-0.0254517624932312641616861);
-    __m512d const coeff_12 = _mm512_set1_pd(+0.0337852580001353069993897);
-    __m512d const coeff_11 = _mm512_set1_pd(-0.0407629191276836500001934);
-    __m512d const coeff_10 = _mm512_set1_pd(+0.0466667150077840625632675);
-    __m512d const coeff_9 = _mm512_set1_pd(-0.0523674852303482457616113);
-    __m512d const coeff_8 = _mm512_set1_pd(+0.0587666392926673580854313);
-    __m512d const coeff_7 = _mm512_set1_pd(-0.0666573579361080525984562);
-    __m512d const coeff_6 = _mm512_set1_pd(+0.0769219538311769618355029);
-    __m512d const coeff_5 = _mm512_set1_pd(-0.090908995008245008229153);
-    __m512d const coeff_4 = _mm512_set1_pd(+0.111111105648261418443745);
-    __m512d const coeff_3 = _mm512_set1_pd(-0.14285714266771329383765);
-    __m512d const coeff_2 = _mm512_set1_pd(+0.199999999996591265594148);
-    __m512d const coeff_1 = _mm512_set1_pd(-0.333333333333311110369124);
+    __m512d const coeff_19_f64x8 = _mm512_set1_pd(-1.88796008463073496563746e-05);
+    __m512d const coeff_18_f64x8 = _mm512_set1_pd(+0.000209850076645816976906797);
+    __m512d const coeff_17_f64x8 = _mm512_set1_pd(-0.00110611831486672482563471);
+    __m512d const coeff_16_f64x8 = _mm512_set1_pd(+0.00370026744188713119232403);
+    __m512d const coeff_15_f64x8 = _mm512_set1_pd(-0.00889896195887655491740809);
+    __m512d const coeff_14_f64x8 = _mm512_set1_pd(+0.016599329773529201970117);
+    __m512d const coeff_13_f64x8 = _mm512_set1_pd(-0.0254517624932312641616861);
+    __m512d const coeff_12_f64x8 = _mm512_set1_pd(+0.0337852580001353069993897);
+    __m512d const coeff_11_f64x8 = _mm512_set1_pd(-0.0407629191276836500001934);
+    __m512d const coeff_10_f64x8 = _mm512_set1_pd(+0.0466667150077840625632675);
+    __m512d const coeff_9_f64x8 = _mm512_set1_pd(-0.0523674852303482457616113);
+    __m512d const coeff_8_f64x8 = _mm512_set1_pd(+0.0587666392926673580854313);
+    __m512d const coeff_7_f64x8 = _mm512_set1_pd(-0.0666573579361080525984562);
+    __m512d const coeff_6_f64x8 = _mm512_set1_pd(+0.0769219538311769618355029);
+    __m512d const coeff_5_f64x8 = _mm512_set1_pd(-0.090908995008245008229153);
+    __m512d const coeff_4_f64x8 = _mm512_set1_pd(+0.111111105648261418443745);
+    __m512d const coeff_3_f64x8 = _mm512_set1_pd(-0.14285714266771329383765);
+    __m512d const coeff_2_f64x8 = _mm512_set1_pd(+0.199999999996591265594148);
+    __m512d const coeff_1_f64x8 = _mm512_set1_pd(-0.333333333333311110369124);
     // Quadrant adjustments
     __mmask8 negative_mask = _mm512_cmp_pd_mask(inputs, _mm512_setzero_pd(), _CMP_LT_OS);
-    __m512d values = _mm512_abs_pd(inputs);
-    __mmask8 reciprocal_mask = _mm512_cmp_pd_mask(values, _mm512_set1_pd(1.0), _CMP_GT_OS);
-    values = _mm512_mask_div_pd(values, reciprocal_mask, _mm512_set1_pd(1.0), values);
-    __m512d const values_squared = _mm512_mul_pd(values, values);
-    __m512d const values_cubed = _mm512_mul_pd(values, values_squared);
+    __m512d values_f64x8 = _mm512_abs_pd(inputs);
+    __mmask8 reciprocal_mask = _mm512_cmp_pd_mask(values_f64x8, _mm512_set1_pd(1.0), _CMP_GT_OS);
+    values_f64x8 = _mm512_mask_div_pd(values_f64x8, reciprocal_mask, _mm512_set1_pd(1.0), values_f64x8);
+    __m512d const values_squared_f64x8 = _mm512_mul_pd(values_f64x8, values_f64x8);
+    __m512d const values_cubed_f64x8 = _mm512_mul_pd(values_f64x8, values_squared_f64x8);
     // Polynomial evaluation (argument reduction and approximation)
-    __m512d polynomials = coeff_19;
-    polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_18);
-    polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_17);
-    polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_16);
-    polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_15);
-    polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_14);
-    polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_13);
-    polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_12);
-    polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_11);
-    polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_10);
-    polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_9);
-    polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_8);
-    polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_7);
-    polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_6);
-    polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_5);
-    polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_4);
-    polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_3);
-    polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_2);
-    polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_1);
+    __m512d polynomials_f64x8 = coeff_19_f64x8;
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_18_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_17_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_16_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_15_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_14_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_13_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_12_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_11_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_10_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_9_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_8_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_7_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_6_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_5_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_4_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_3_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_2_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_1_f64x8);
     // Compute atan approximation
-    __m512d result = _mm512_fmadd_pd(values_cubed, polynomials, values);
-    result = _mm512_mask_sub_pd(result, reciprocal_mask, _mm512_set1_pd(1.5707963267948966), result);
-    result = _mm512_mask_sub_pd(result, negative_mask, _mm512_setzero_pd(), result);
-    return result;
+    __m512d result_f64x8 = _mm512_fmadd_pd(values_cubed_f64x8, polynomials_f64x8, values_f64x8);
+    result_f64x8 = _mm512_mask_sub_pd(result_f64x8, reciprocal_mask, _mm512_set1_pd(1.5707963267948966), result_f64x8);
+    result_f64x8 = _mm512_mask_sub_pd(result_f64x8, negative_mask, _mm512_setzero_pd(), result_f64x8);
+    return result_f64x8;
 }
 /**
@@ -438,126 +442,126 @@ NK_INTERNAL __m512d nk_atan_f64x8_skylake_(__m512d const inputs) {
  */
 NK_INTERNAL __m512d nk_atan2_f64x8_skylake_(__m512d const ys_inputs, __m512d const xs_inputs) {
     // Polynomial coefficients for atan approximation (higher precision than f32)
-    __m512d const coeff_19 = _mm512_set1_pd(-1.88796008463073496563746e-05);
-    __m512d const coeff_18 = _mm512_set1_pd(+0.000209850076645816976906797);
-    __m512d const coeff_17 = _mm512_set1_pd(-0.00110611831486672482563471);
-    __m512d const coeff_16 = _mm512_set1_pd(+0.00370026744188713119232403);
-    __m512d const coeff_15 = _mm512_set1_pd(-0.00889896195887655491740809);
-    __m512d const coeff_14 = _mm512_set1_pd(+0.016599329773529201970117);
-    __m512d const coeff_13 = _mm512_set1_pd(-0.0254517624932312641616861);
-    __m512d const coeff_12 = _mm512_set1_pd(+0.0337852580001353069993897);
-    __m512d const coeff_11 = _mm512_set1_pd(-0.0407629191276836500001934);
-    __m512d const coeff_10 = _mm512_set1_pd(+0.0466667150077840625632675);
-    __m512d const coeff_9 = _mm512_set1_pd(-0.0523674852303482457616113);
-    __m512d const coeff_8 = _mm512_set1_pd(+0.0587666392926673580854313);
-    __m512d const coeff_7 = _mm512_set1_pd(-0.0666573579361080525984562);
-    __m512d const coeff_6 = _mm512_set1_pd(+0.0769219538311769618355029);
-    __m512d const coeff_5 = _mm512_set1_pd(-0.090908995008245008229153);
-    __m512d const coeff_4 = _mm512_set1_pd(+0.111111105648261418443745);
-    __m512d const coeff_3 = _mm512_set1_pd(-0.14285714266771329383765);
-    __m512d const coeff_2 = _mm512_set1_pd(+0.199999999996591265594148);
-    __m512d const coeff_1 = _mm512_set1_pd(-0.333333333333311110369124);
+    __m512d const coeff_19_f64x8 = _mm512_set1_pd(-1.88796008463073496563746e-05);
+    __m512d const coeff_18_f64x8 = _mm512_set1_pd(+0.000209850076645816976906797);
+    __m512d const coeff_17_f64x8 = _mm512_set1_pd(-0.00110611831486672482563471);
+    __m512d const coeff_16_f64x8 = _mm512_set1_pd(+0.00370026744188713119232403);
+    __m512d const coeff_15_f64x8 = _mm512_set1_pd(-0.00889896195887655491740809);
+    __m512d const coeff_14_f64x8 = _mm512_set1_pd(+0.016599329773529201970117);
+    __m512d const coeff_13_f64x8 = _mm512_set1_pd(-0.0254517624932312641616861);
+    __m512d const coeff_12_f64x8 = _mm512_set1_pd(+0.0337852580001353069993897);
+    __m512d const coeff_11_f64x8 = _mm512_set1_pd(-0.0407629191276836500001934);
+    __m512d const coeff_10_f64x8 = _mm512_set1_pd(+0.0466667150077840625632675);
+    __m512d const coeff_9_f64x8 = _mm512_set1_pd(-0.0523674852303482457616113);
+    __m512d const coeff_8_f64x8 = _mm512_set1_pd(+0.0587666392926673580854313);
+    __m512d const coeff_7_f64x8 = _mm512_set1_pd(-0.0666573579361080525984562);
+    __m512d const coeff_6_f64x8 = _mm512_set1_pd(+0.0769219538311769618355029);
+    __m512d const coeff_5_f64x8 = _mm512_set1_pd(-0.090908995008245008229153);
+    __m512d const coeff_4_f64x8 = _mm512_set1_pd(+0.111111105648261418443745);
+    __m512d const coeff_3_f64x8 = _mm512_set1_pd(-0.14285714266771329383765);
+    __m512d const coeff_2_f64x8 = _mm512_set1_pd(+0.199999999996591265594148);
+    __m512d const coeff_1_f64x8 = _mm512_set1_pd(-0.333333333333311110369124);
     // Quadrant adjustments normalizing to absolute values of x and y
     __mmask8 const xs_negative_mask = _mm512_cmp_pd_mask(xs_inputs, _mm512_setzero_pd(), _CMP_LT_OS);
-    __m512d xs = _mm512_abs_pd(xs_inputs);
-    __m512d ys = _mm512_abs_pd(ys_inputs);
+    __m512d xs_f64x8 = _mm512_abs_pd(xs_inputs);
+    __m512d ys_f64x8 = _mm512_abs_pd(ys_inputs);
     // Ensure proper fraction where the numerator is smaller than the denominator
-    __mmask8 const swap_mask = _mm512_cmp_pd_mask(ys, xs, _CMP_GT_OS);
-    __m512d temps = xs;
-    xs = _mm512_mask_blend_pd(swap_mask, xs, ys);
-    ys = _mm512_mask_sub_pd(ys, swap_mask, _mm512_setzero_pd(), temps);
+    __mmask8 const swap_mask = _mm512_cmp_pd_mask(ys_f64x8, xs_f64x8, _CMP_GT_OS);
+    __m512d temps_f64x8 = xs_f64x8;
+    xs_f64x8 = _mm512_mask_blend_pd(swap_mask, xs_f64x8, ys_f64x8);
+    ys_f64x8 = _mm512_mask_sub_pd(ys_f64x8, swap_mask, _mm512_setzero_pd(), temps_f64x8);
-    // Compute ratio and ratio²
-    __m512d const ratio = _mm512_div_pd(ys, xs);
-    __m512d const ratio_squared = _mm512_mul_pd(ratio, ratio);
-    __m512d const ratio_cubed = _mm512_mul_pd(ratio, ratio_squared);
+    // Compute ratio, ratio², and ratio³
+    __m512d const ratio_f64x8 = _mm512_div_pd(ys_f64x8, xs_f64x8);
+    __m512d const ratio_squared_f64x8 = _mm512_mul_pd(ratio_f64x8, ratio_f64x8);
+    __m512d const ratio_cubed_f64x8 = _mm512_mul_pd(ratio_f64x8, ratio_squared_f64x8);
     // Polynomial evaluation
-    __m512d polynomials = coeff_19;
-    polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_18);
-    polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_17);
-    polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_16);
-    polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_15);
-    polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_14);
-    polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_13);
-    polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_12);
-    polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_11);
-    polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_10);
-    polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_9);
-    polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_8);
-    polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_7);
-    polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_6);
-    polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_5);
-    polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_4);
-    polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_3);
-    polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_2);
-    polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_1);
-    // Compute the result with quadrant adjustments
-    __m512d results = _mm512_fmadd_pd(ratio_cubed, polynomials, ratio);
-    // Compute quadrant value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
+    __m512d polynomials_f64x8 = coeff_19_f64x8;
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_18_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_17_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_16_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_15_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_14_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_13_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_12_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_11_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_10_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_9_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_8_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_7_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_6_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_5_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_4_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_3_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_2_f64x8);
+    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_1_f64x8);
+    // Compute the result with quadrant adjustments
+    __m512d results_f64x8 = _mm512_fmadd_pd(ratio_cubed_f64x8, polynomials_f64x8, ratio_f64x8);
+    // Compute quadrant value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
     //                        -2 for x<0 && !swap, -1 for x<0 && swap
-    __m512d quadrant = _mm512_setzero_pd();
-    quadrant = _mm512_mask_blend_pd(xs_negative_mask, quadrant, _mm512_set1_pd(-2.0));
-    __m512d quadrant_incremented = _mm512_add_pd(quadrant, _mm512_set1_pd(1.0));
-    quadrant = _mm512_mask_blend_pd(swap_mask, quadrant, quadrant_incremented);
+    __m512d quadrant_f64x8 = _mm512_setzero_pd();
+    quadrant_f64x8 = _mm512_mask_blend_pd(xs_negative_mask, quadrant_f64x8, _mm512_set1_pd(-2.0));
+    __m512d quadrant_incremented_f64x8 = _mm512_add_pd(quadrant_f64x8, _mm512_set1_pd(1.0));
+    quadrant_f64x8 = _mm512_mask_blend_pd(swap_mask, quadrant_f64x8, quadrant_incremented_f64x8);
-    // Adjust for quadrant: result += quadrant * π/2
-    results = _mm512_fmadd_pd(quadrant, _mm512_set1_pd(1.5707963267948966), results);
+    // Adjust for quadrant: result += quadrant * π/2
+    results_f64x8 = _mm512_fmadd_pd(quadrant_f64x8, _mm512_set1_pd(1.5707963267948966), results_f64x8);
     // Transfer sign from x (XOR with sign bit of x_input)
-    __m512d xs_sign = _mm512_and_pd(xs_inputs, _mm512_set1_pd(-0.0));
-    results = _mm512_xor_pd(results, xs_sign);
+    __m512d xs_sign_f64x8 = _mm512_and_pd(xs_inputs, _mm512_set1_pd(-0.0));
+    results_f64x8 = _mm512_xor_pd(results_f64x8, xs_sign_f64x8);
     // Transfer sign from y (XOR with sign bit of y_input)
-    __m512d ys_sign = _mm512_and_pd(ys_inputs, _mm512_set1_pd(-0.0));
-    results = _mm512_xor_pd(results, ys_sign);
+    __m512d ys_sign_f64x8 = _mm512_and_pd(ys_inputs, _mm512_set1_pd(-0.0));
+    results_f64x8 = _mm512_xor_pd(results_f64x8, ys_sign_f64x8);
-    return results;
+    return results_f64x8;
 }
 NK_PUBLIC void nk_each_sin_f64_skylake(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
     nk_size_t i = 0;
     for (; i + 8 <= n; i += 8) {
-        __m512d angles = _mm512_loadu_pd(ins + i);
-        __m512d results = nk_sin_f64x8_skylake_(angles);
-        _mm512_storeu_pd(outs + i, results);
+        __m512d angles_f64x8 = _mm512_loadu_pd(ins + i);
+        __m512d results_f64x8 = nk_sin_f64x8_skylake_(angles_f64x8);
+        _mm512_storeu_pd(outs + i, results_f64x8);
     }
     if (i < n) {
         __mmask8 mask = (__mmask8)_bzhi_u32(0xFFFF, n - i);
-        __m512d angles = _mm512_maskz_loadu_pd(mask, ins + i);
-        __m512d results = nk_sin_f64x8_skylake_(angles);
-        _mm512_mask_storeu_pd(outs + i, mask, results);
+        __m512d angles_f64x8 = _mm512_maskz_loadu_pd(mask, ins + i);
+        __m512d results_f64x8 = nk_sin_f64x8_skylake_(angles_f64x8);
+        _mm512_mask_storeu_pd(outs + i, mask, results_f64x8);
     }
 }
 NK_PUBLIC void nk_each_cos_f64_skylake(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
     nk_size_t i = 0;
     for (; i + 8 <= n; i += 8) {
-        __m512d angles = _mm512_loadu_pd(ins + i);
-        __m512d results = nk_cos_f64x8_skylake_(angles);
-        _mm512_storeu_pd(outs + i, results);
+        __m512d angles_f64x8 = _mm512_loadu_pd(ins + i);
+        __m512d results_f64x8 = nk_cos_f64x8_skylake_(angles_f64x8);
+        _mm512_storeu_pd(outs + i, results_f64x8);
     }
     if (i < n) {
         __mmask8 mask = (__mmask8)_bzhi_u32(0xFFFF, n - i);
-        __m512d angles = _mm512_maskz_loadu_pd(mask, ins + i);
-        __m512d results = nk_cos_f64x8_skylake_(angles);
-        _mm512_mask_storeu_pd(outs + i, mask, results);
+        __m512d angles_f64x8 = _mm512_maskz_loadu_pd(mask, ins + i);
+        __m512d results_f64x8 = nk_cos_f64x8_skylake_(angles_f64x8);
+        _mm512_mask_storeu_pd(outs + i, mask, results_f64x8);
     }
 }
 NK_PUBLIC void nk_each_atan_f64_skylake(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
     nk_size_t i = 0;
     for (; i + 8 <= n; i += 8) {
-        __m512d angles = _mm512_loadu_pd(ins + i);
-        __m512d results = nk_atan_f64x8_skylake_(angles);
-        _mm512_storeu_pd(outs + i, results);
+        __m512d angles_f64x8 = _mm512_loadu_pd(ins + i);
+        __m512d results_f64x8 = nk_atan_f64x8_skylake_(angles_f64x8);
+        _mm512_storeu_pd(outs + i, results_f64x8);
     }
     if (i < n) {
         __mmask8 mask = (__mmask8)_bzhi_u32(0xFFFF, n - i);
-        __m512d angles = _mm512_maskz_loadu_pd(mask, ins + i);
-        __m512d results = nk_atan_f64x8_skylake_(angles);
-        _mm512_mask_storeu_pd(outs + i, mask, results);
+        __m512d angles_f64x8 = _mm512_maskz_loadu_pd(mask, ins + i);
+        __m512d results_f64x8 = nk_atan_f64x8_skylake_(angles_f64x8);
+        _mm512_mask_storeu_pd(outs + i, mask, results_f64x8);
     }
 }
@@ -570,8 +574,8 @@ NK_PUBLIC void nk_each_atan_f64_skylake(nk_f64_t const *ins, nk_size_t n, nk_f64
 NK_INTERNAL __m256i nk_sin_f16x16_skylake_(__m256i angles_f16x16) {
     __m512 angles_f32x16 = _mm512_cvtph_ps(angles_f16x16);
     // Cody-Waite range reduction constants
-    __m512 pi_hi_f32x16 = _mm512_set1_ps(3.1415927f);
-    __m512 pi_lo_f32x16 = _mm512_set1_ps(-8.742278e-8f);
+    __m512 pi_high_f32x16 = _mm512_set1_ps(3.1415927f);
+    __m512 pi_low_f32x16 = _mm512_set1_ps(-8.742278e-8f);
     __m512 pi_recip_f32x16 = _mm512_set1_ps(0.31830988618f);
     __m512 c3_f32x16 = _mm512_set1_ps(-1.6666666641e-1f);
     __m512 c5_f32x16 = _mm512_set1_ps(8.3333293855e-3f);
@@ -581,8 +585,8 @@ NK_INTERNAL __m256i nk_sin_f16x16_skylake_(__m256i angles_f16x16) {
     // Use explicit rounding to match roundscale (MXCSR-independent)
     __m512i multiple_i32x16 = _mm512_cvt_roundps_epi32(rounded_f32x16, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-    angles_f32x16 = _mm512_fnmadd_ps(rounded_f32x16, pi_hi_f32x16, angles_f32x16);
-    angles_f32x16 = _mm512_fnmadd_ps(rounded_f32x16, pi_lo_f32x16, angles_f32x16);
+    angles_f32x16 = _mm512_fnmadd_ps(rounded_f32x16, pi_high_f32x16, angles_f32x16);
+    angles_f32x16 = _mm512_fnmadd_ps(rounded_f32x16, pi_low_f32x16, angles_f32x16);
     __m512 x2_f32x16 = _mm512_mul_ps(angles_f32x16, angles_f32x16);
     __m512 poly_f32x16 = _mm512_fmadd_ps(c5_f32x16, x2_f32x16, c3_f32x16);
@@ -601,8 +605,8 @@ NK_INTERNAL __m256i nk_sin_f16x16_skylake_(__m256i angles_f16x16) {
  */
 NK_INTERNAL __m256i nk_cos_f16x16_skylake_(__m256i angles_f16x16) {
     __m512 angles_f32x16 = _mm512_cvtph_ps(angles_f16x16);
-    __m512 pi_hi_f32x16 = _mm512_set1_ps(3.1415927f);
-    __m512 pi_lo_f32x16 = _mm512_set1_ps(-8.742278e-8f);
+    __m512 pi_high_f32x16 = _mm512_set1_ps(3.1415927f);
+    __m512 pi_low_f32x16 = _mm512_set1_ps(-8.742278e-8f);
     __m512 pi_half_f32x16 = _mm512_set1_ps(1.5707963268f);
     __m512 pi_recip_f32x16 = _mm512_set1_ps(0.31830988618f);
     __m512 half_f32x16 = _mm512_set1_ps(0.5f);
@@ -614,9 +618,9 @@ NK_INTERNAL __m256i nk_cos_f16x16_skylake_(__m256i angles_f16x16) {
     // Use explicit rounding to match roundscale (MXCSR-independent)
     __m512i multiple_i32x16 = _mm512_cvt_roundps_epi32(rounded_f32x16, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-    __m512 shift_f32x16 = _mm512_fmadd_ps(rounded_f32x16, pi_hi_f32x16, pi_half_f32x16);
+    __m512 shift_f32x16 = _mm512_fmadd_ps(rounded_f32x16, pi_high_f32x16, pi_half_f32x16);
     angles_f32x16 = _mm512_sub_ps(angles_f32x16, shift_f32x16);
-    angles_f32x16 = _mm512_fnmadd_ps(rounded_f32x16, pi_lo_f32x16, angles_f32x16);
+    angles_f32x16 = _mm512_fnmadd_ps(rounded_f32x16, pi_low_f32x16, angles_f32x16);
     __m512 x2_f32x16 = _mm512_mul_ps(angles_f32x16, angles_f32x16);
     __m512 poly_f32x16 = _mm512_fmadd_ps(c5_f32x16, x2_f32x16, c3_f32x16);