npm - numkong - Versions diffs - 7.0.0 → 7.4.2 - Mend

numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (315)

package/README.md +197 -124
package/binding.gyp +34 -484
package/c/dispatch_bf16.c +59 -1
package/c/dispatch_e2m3.c +41 -8
package/c/dispatch_e3m2.c +49 -8
package/c/dispatch_e4m3.c +51 -9
package/c/dispatch_e5m2.c +45 -1
package/c/dispatch_f16.c +79 -26
package/c/dispatch_f16c.c +5 -5
package/c/dispatch_f32.c +56 -0
package/c/dispatch_f64.c +52 -0
package/c/dispatch_i4.c +3 -0
package/c/dispatch_i8.c +62 -3
package/c/dispatch_other.c +18 -0
package/c/dispatch_u1.c +54 -9
package/c/dispatch_u4.c +3 -0
package/c/dispatch_u8.c +64 -3
package/c/numkong.c +3 -0
package/include/README.md +79 -9
package/include/numkong/attention/sapphireamx.h +278 -276
package/include/numkong/attention/sme.h +983 -977
package/include/numkong/attention.h +1 -1
package/include/numkong/capabilities.h +289 -94
package/include/numkong/cast/README.md +40 -40
package/include/numkong/cast/diamond.h +64 -0
package/include/numkong/cast/haswell.h +42 -194
package/include/numkong/cast/icelake.h +42 -37
package/include/numkong/cast/loongsonasx.h +252 -0
package/include/numkong/cast/neon.h +216 -249
package/include/numkong/cast/powervsx.h +449 -0
package/include/numkong/cast/rvv.h +223 -274
package/include/numkong/cast/sapphire.h +18 -18
package/include/numkong/cast/serial.h +1018 -944
package/include/numkong/cast/skylake.h +82 -23
package/include/numkong/cast/v128relaxed.h +462 -105
package/include/numkong/cast.h +24 -0
package/include/numkong/cast.hpp +44 -0
package/include/numkong/curved/README.md +17 -17
package/include/numkong/curved/neon.h +131 -7
package/include/numkong/curved/neonbfdot.h +6 -7
package/include/numkong/curved/rvv.h +26 -26
package/include/numkong/curved/smef64.h +186 -182
package/include/numkong/curved.h +14 -18
package/include/numkong/dot/README.md +154 -137
package/include/numkong/dot/alder.h +43 -43
package/include/numkong/dot/diamond.h +158 -0
package/include/numkong/dot/genoa.h +4 -30
package/include/numkong/dot/haswell.h +215 -180
package/include/numkong/dot/icelake.h +190 -76
package/include/numkong/dot/loongsonasx.h +671 -0
package/include/numkong/dot/neon.h +124 -73
package/include/numkong/dot/neonbfdot.h +11 -12
package/include/numkong/dot/neonfhm.h +44 -46
package/include/numkong/dot/neonfp8.h +323 -0
package/include/numkong/dot/neonsdot.h +190 -76
package/include/numkong/dot/powervsx.h +752 -0
package/include/numkong/dot/rvv.h +92 -84
package/include/numkong/dot/rvvbf16.h +12 -12
package/include/numkong/dot/rvvhalf.h +12 -12
package/include/numkong/dot/sapphire.h +4 -4
package/include/numkong/dot/serial.h +66 -30
package/include/numkong/dot/sierra.h +31 -31
package/include/numkong/dot/skylake.h +142 -110
package/include/numkong/dot/sve.h +217 -177
package/include/numkong/dot/svebfdot.h +10 -10
package/include/numkong/dot/svehalf.h +85 -41
package/include/numkong/dot/svesdot.h +89 -0
package/include/numkong/dot/v128relaxed.h +124 -89
package/include/numkong/dot.h +114 -48
package/include/numkong/dots/README.md +203 -203
package/include/numkong/dots/alder.h +12 -9
package/include/numkong/dots/diamond.h +86 -0
package/include/numkong/dots/genoa.h +10 -4
package/include/numkong/dots/haswell.h +63 -48
package/include/numkong/dots/icelake.h +27 -18
package/include/numkong/dots/loongsonasx.h +176 -0
package/include/numkong/dots/neon.h +14 -11
package/include/numkong/dots/neonbfdot.h +4 -3
package/include/numkong/dots/neonfhm.h +11 -9
package/include/numkong/dots/neonfp8.h +99 -0
package/include/numkong/dots/neonsdot.h +48 -12
package/include/numkong/dots/powervsx.h +194 -0
package/include/numkong/dots/rvv.h +451 -344
package/include/numkong/dots/sapphireamx.h +1028 -984
package/include/numkong/dots/serial.h +213 -197
package/include/numkong/dots/sierra.h +10 -7
package/include/numkong/dots/skylake.h +47 -36
package/include/numkong/dots/sme.h +2001 -2364
package/include/numkong/dots/smebi32.h +175 -162
package/include/numkong/dots/smef64.h +328 -323
package/include/numkong/dots/v128relaxed.h +64 -41
package/include/numkong/dots.h +573 -293
package/include/numkong/dots.hpp +45 -43
package/include/numkong/each/README.md +133 -137
package/include/numkong/each/haswell.h +6 -6
package/include/numkong/each/icelake.h +7 -7
package/include/numkong/each/neon.h +76 -42
package/include/numkong/each/neonbfdot.h +11 -12
package/include/numkong/each/neonhalf.h +24 -116
package/include/numkong/each/rvv.h +28 -28
package/include/numkong/each/sapphire.h +27 -161
package/include/numkong/each/serial.h +6 -6
package/include/numkong/each/skylake.h +7 -7
package/include/numkong/each/v128relaxed.h +562 -0
package/include/numkong/each.h +148 -62
package/include/numkong/each.hpp +2 -2
package/include/numkong/geospatial/README.md +18 -18
package/include/numkong/geospatial/haswell.h +365 -325
package/include/numkong/geospatial/neon.h +350 -306
package/include/numkong/geospatial/rvv.h +4 -4
package/include/numkong/geospatial/skylake.h +376 -340
package/include/numkong/geospatial/v128relaxed.h +366 -327
package/include/numkong/geospatial.h +17 -17
package/include/numkong/matrix.hpp +4 -4
package/include/numkong/maxsim/README.md +14 -14
package/include/numkong/maxsim/alder.h +6 -6
package/include/numkong/maxsim/genoa.h +4 -4
package/include/numkong/maxsim/haswell.h +6 -6
package/include/numkong/maxsim/icelake.h +18 -18
package/include/numkong/maxsim/neonsdot.h +21 -21
package/include/numkong/maxsim/sapphireamx.h +14 -14
package/include/numkong/maxsim/serial.h +6 -6
package/include/numkong/maxsim/sme.h +221 -196
package/include/numkong/maxsim/v128relaxed.h +6 -6
package/include/numkong/mesh/README.md +62 -56
package/include/numkong/mesh/haswell.h +339 -464
package/include/numkong/mesh/neon.h +1100 -519
package/include/numkong/mesh/neonbfdot.h +36 -68
package/include/numkong/mesh/rvv.h +530 -435
package/include/numkong/mesh/serial.h +75 -91
package/include/numkong/mesh/skylake.h +1627 -302
package/include/numkong/mesh/v128relaxed.h +443 -330
package/include/numkong/mesh.h +63 -49
package/include/numkong/mesh.hpp +4 -4
package/include/numkong/numkong.h +3 -3
package/include/numkong/numkong.hpp +1 -0
package/include/numkong/probability/README.md +23 -19
package/include/numkong/probability/neon.h +82 -52
package/include/numkong/probability/rvv.h +28 -23
package/include/numkong/probability/serial.h +51 -39
package/include/numkong/probability.h +20 -23
package/include/numkong/random.h +1 -1
package/include/numkong/reduce/README.md +143 -138
package/include/numkong/reduce/alder.h +81 -77
package/include/numkong/reduce/haswell.h +222 -220
package/include/numkong/reduce/neon.h +629 -519
package/include/numkong/reduce/neonbfdot.h +7 -218
package/include/numkong/reduce/neonfhm.h +9 -381
package/include/numkong/reduce/neonsdot.h +9 -9
package/include/numkong/reduce/rvv.h +928 -802
package/include/numkong/reduce/serial.h +23 -27
package/include/numkong/reduce/sierra.h +20 -20
package/include/numkong/reduce/skylake.h +326 -324
package/include/numkong/reduce/v128relaxed.h +52 -52
package/include/numkong/reduce.h +4 -23
package/include/numkong/reduce.hpp +156 -11
package/include/numkong/scalar/README.md +6 -6
package/include/numkong/scalar/haswell.h +26 -17
package/include/numkong/scalar/loongsonasx.h +74 -0
package/include/numkong/scalar/neon.h +9 -9
package/include/numkong/scalar/powervsx.h +96 -0
package/include/numkong/scalar/rvv.h +2 -2
package/include/numkong/scalar/sapphire.h +21 -10
package/include/numkong/scalar/serial.h +21 -21
package/include/numkong/scalar.h +13 -0
package/include/numkong/set/README.md +28 -28
package/include/numkong/set/haswell.h +12 -12
package/include/numkong/set/icelake.h +14 -14
package/include/numkong/set/loongsonasx.h +181 -0
package/include/numkong/set/neon.h +17 -18
package/include/numkong/set/powervsx.h +326 -0
package/include/numkong/set/rvv.h +4 -4
package/include/numkong/set/serial.h +6 -6
package/include/numkong/set/sve.h +60 -59
package/include/numkong/set/v128relaxed.h +6 -6
package/include/numkong/set.h +21 -7
package/include/numkong/sets/README.md +26 -26
package/include/numkong/sets/loongsonasx.h +52 -0
package/include/numkong/sets/powervsx.h +65 -0
package/include/numkong/sets/smebi32.h +395 -364
package/include/numkong/sets.h +83 -40
package/include/numkong/sparse/README.md +4 -4
package/include/numkong/sparse/icelake.h +101 -101
package/include/numkong/sparse/serial.h +1 -1
package/include/numkong/sparse/sve2.h +137 -141
package/include/numkong/sparse/turin.h +12 -12
package/include/numkong/sparse.h +10 -10
package/include/numkong/spatial/README.md +230 -226
package/include/numkong/spatial/alder.h +113 -116
package/include/numkong/spatial/diamond.h +240 -0
package/include/numkong/spatial/genoa.h +0 -68
package/include/numkong/spatial/haswell.h +74 -55
package/include/numkong/spatial/icelake.h +539 -58
package/include/numkong/spatial/loongsonasx.h +483 -0
package/include/numkong/spatial/neon.h +125 -52
package/include/numkong/spatial/neonbfdot.h +8 -9
package/include/numkong/spatial/neonfp8.h +258 -0
package/include/numkong/spatial/neonsdot.h +180 -12
package/include/numkong/spatial/powervsx.h +738 -0
package/include/numkong/spatial/rvv.h +146 -139
package/include/numkong/spatial/rvvbf16.h +17 -12
package/include/numkong/spatial/rvvhalf.h +13 -10
package/include/numkong/spatial/serial.h +13 -12
package/include/numkong/spatial/sierra.h +232 -39
package/include/numkong/spatial/skylake.h +73 -74
package/include/numkong/spatial/sve.h +93 -72
package/include/numkong/spatial/svebfdot.h +29 -29
package/include/numkong/spatial/svehalf.h +52 -26
package/include/numkong/spatial/svesdot.h +142 -0
package/include/numkong/spatial/v128relaxed.h +293 -41
package/include/numkong/spatial.h +338 -82
package/include/numkong/spatials/README.md +194 -194
package/include/numkong/spatials/diamond.h +82 -0
package/include/numkong/spatials/haswell.h +2 -2
package/include/numkong/spatials/loongsonasx.h +153 -0
package/include/numkong/spatials/neonfp8.h +111 -0
package/include/numkong/spatials/neonsdot.h +34 -0
package/include/numkong/spatials/powervsx.h +153 -0
package/include/numkong/spatials/rvv.h +259 -243
package/include/numkong/spatials/sapphireamx.h +173 -173
package/include/numkong/spatials/serial.h +2 -2
package/include/numkong/spatials/skylake.h +2 -2
package/include/numkong/spatials/sme.h +590 -605
package/include/numkong/spatials/smef64.h +139 -130
package/include/numkong/spatials/v128relaxed.h +2 -2
package/include/numkong/spatials.h +820 -500
package/include/numkong/spatials.hpp +49 -48
package/include/numkong/tensor.hpp +406 -17
package/include/numkong/trigonometry/README.md +19 -19
package/include/numkong/trigonometry/haswell.h +402 -401
package/include/numkong/trigonometry/neon.h +386 -387
package/include/numkong/trigonometry/rvv.h +52 -51
package/include/numkong/trigonometry/serial.h +13 -13
package/include/numkong/trigonometry/skylake.h +373 -369
package/include/numkong/trigonometry/v128relaxed.h +375 -374
package/include/numkong/trigonometry.h +13 -13
package/include/numkong/trigonometry.hpp +2 -2
package/include/numkong/types.h +287 -49
package/include/numkong/types.hpp +436 -12
package/include/numkong/vector.hpp +82 -14
package/javascript/dist/cjs/numkong-wasm.js +6 -12
package/javascript/dist/cjs/numkong.d.ts +7 -1
package/javascript/dist/cjs/numkong.js +37 -11
package/javascript/dist/cjs/types.d.ts +9 -0
package/javascript/dist/cjs/types.js +96 -0
package/javascript/dist/esm/numkong-browser.d.ts +14 -0
package/javascript/dist/esm/numkong-browser.js +23 -0
package/javascript/dist/esm/numkong-wasm.js +6 -12
package/javascript/dist/esm/numkong.d.ts +7 -1
package/javascript/dist/esm/numkong.js +37 -11
package/javascript/dist/esm/types.d.ts +9 -0
package/javascript/dist/esm/types.js +96 -0
package/javascript/node-gyp-build.d.ts +4 -1
package/javascript/numkong-browser.ts +40 -0
package/javascript/numkong-wasm.ts +7 -13
package/javascript/numkong.c +5 -26
package/javascript/numkong.ts +36 -11
package/javascript/tsconfig-base.json +1 -0
package/javascript/tsconfig-cjs.json +6 -1
package/javascript/types.ts +110 -0
package/numkong.gypi +101 -0
package/package.json +34 -13
package/probes/arm_neon.c +8 -0
package/probes/arm_neon_bfdot.c +9 -0
package/probes/arm_neon_fhm.c +9 -0
package/probes/arm_neon_half.c +8 -0
package/probes/arm_neon_sdot.c +9 -0
package/probes/arm_neonfp8.c +9 -0
package/probes/arm_sme.c +16 -0
package/probes/arm_sme2.c +16 -0
package/probes/arm_sme2p1.c +16 -0
package/probes/arm_sme_bf16.c +16 -0
package/probes/arm_sme_bi32.c +16 -0
package/probes/arm_sme_f64.c +16 -0
package/probes/arm_sme_fa64.c +14 -0
package/probes/arm_sme_half.c +16 -0
package/probes/arm_sme_lut2.c +15 -0
package/probes/arm_sve.c +18 -0
package/probes/arm_sve2.c +20 -0
package/probes/arm_sve2p1.c +18 -0
package/probes/arm_sve_bfdot.c +20 -0
package/probes/arm_sve_half.c +18 -0
package/probes/arm_sve_sdot.c +21 -0
package/probes/loongarch_lasx.c +12 -0
package/probes/power_vsx.c +12 -0
package/probes/probe.js +127 -0
package/probes/riscv_rvv.c +14 -0
package/probes/riscv_rvv_bb.c +15 -0
package/probes/riscv_rvv_bf16.c +17 -0
package/probes/riscv_rvv_half.c +14 -0
package/probes/wasm_v128relaxed.c +11 -0
package/probes/x86_alder.c +17 -0
package/probes/x86_diamond.c +17 -0
package/probes/x86_genoa.c +17 -0
package/probes/x86_graniteamx.c +19 -0
package/probes/x86_haswell.c +11 -0
package/probes/x86_icelake.c +17 -0
package/probes/x86_sapphire.c +16 -0
package/probes/x86_sapphireamx.c +18 -0
package/probes/x86_sierra.c +17 -0
package/probes/x86_skylake.c +15 -0
package/probes/x86_turin.c +17 -0
package/wasm/numkong-emscripten.js +2 -0
package/wasm/numkong.d.ts +14 -0
package/wasm/numkong.js +1124 -0
package/wasm/numkong.wasm +0 -0
package/include/numkong/curved/neonhalf.h +0 -212
package/include/numkong/dot/neonhalf.h +0 -198
package/include/numkong/dots/neonhalf.h +0 -57
package/include/numkong/mesh/neonhalf.h +0 -616
package/include/numkong/reduce/neonhalf.h +0 -157
package/include/numkong/spatial/neonhalf.h +0 -118
package/include/numkong/spatial/sapphire.h +0 -343
package/include/numkong/spatials/neonhalf.h +0 -58
package/javascript/README.md +0 -246

package/include/numkong/geospatial/skylake.h (changed)

@@ -8,13 +8,14 @@
  *
  *  @section geospatial_skylake_instructions Key AVX-512 Geospatial Instructions
  *
- *      Intrinsic               Instruction                     Ice         Genoa
- *      _mm512_sqrt_ps          VSQRTPS (ZMM, ZMM)              19c @ p05   15c @ p01
- *      _mm512_sqrt_pd          VSQRTPD (ZMM, ZMM)              23c @ p05   21c @ p01
- *      _mm256_div_ps           VDIVPS (YMM, YMM, YMM)          11c @ p0    11c @ p01
- *      _mm256_div_pd           VDIVPD (YMM, YMM, YMM)          13c @ p0    13c @ p01
- *      _mm256_fmadd_ps         VFMADD231PS (YMM, YMM, YMM)     4c @ p01    4c @ p01
- *      _mm256_fmadd_pd         VFMADD231PD (YMM, YMM, YMM)     4c @ p01    4c @ p01
+ *      Intrinsic           Instruction                  Icelake           Genoa
+ *      _mm512_sqrt_ps      VSQRTPS (ZMM, ZMM)           19cy @ p0+p0+p05  15cy @ p01
+ *      _mm512_sqrt_pd      VSQRTPD (ZMM, ZMM)           23cy @ p0+p0+p05  21cy @ p01
+ *      _mm256_div_ps       VDIVPS (YMM, YMM, YMM)       11cy @ p0         11cy @ p01
+ *      _mm256_div_pd       VDIVPD (YMM, YMM, YMM)       13cy @ p0         13cy @ p01
+ *      _mm256_fmadd_ps     VFMADD231PS (YMM, YMM, YMM)  4cy @ p01         4cy @ p01
+ *      _mm256_fmadd_pd     VFMADD231PD (YMM, YMM, YMM)  4cy @ p01         4cy @ p01
+ *      _mm512_cmp_ps_mask  VCMPPS (K, ZMM, ZMM, I8)     4cy @ p5          5cy @ p01
  */
 #ifndef NK_GEOSPATIAL_SKYLAKE_H
 #define NK_GEOSPATIAL_SKYLAKE_H
@@ -37,44 +38,48 @@ extern "C" {
 #pragma GCC target("avx2", "avx512f", "avx512vl", "avx512bw", "avx512dq", "f16c", "fma", "bmi", "bmi2")
 #endif
-NK_INTERNAL __m512d nk_haversine_f64x8_skylake_(       //
-    __m512d first_latitudes, __m512d first_longitudes, //
-    __m512d second_latitudes, __m512d second_longitudes) {
+NK_INTERNAL __m512d nk_haversine_f64x8_skylake_(                   //
+    __m512d first_latitudes_f64x8, __m512d first_longitudes_f64x8, //
+    __m512d second_latitudes_f64x8, __m512d second_longitudes_f64x8) {
-    __m512d const earth_radius = _mm512_set1_pd(NK_EARTH_MEDIATORIAL_RADIUS);
-    __m512d const half = _mm512_set1_pd(0.5);
-    __m512d const one = _mm512_set1_pd(1.0);
-    __m512d const two = _mm512_set1_pd(2.0);
+    __m512d const earth_radius_f64x8 = _mm512_set1_pd(NK_EARTH_MEDIATORIAL_RADIUS);
+    __m512d const half_f64x8 = _mm512_set1_pd(0.5);
+    __m512d const one_f64x8 = _mm512_set1_pd(1.0);
+    __m512d const two_f64x8 = _mm512_set1_pd(2.0);
-    __m512d latitude_delta = _mm512_sub_pd(second_latitudes, first_latitudes);
-    __m512d longitude_delta = _mm512_sub_pd(second_longitudes, first_longitudes);
+    __m512d latitude_delta_f64x8 = _mm512_sub_pd(second_latitudes_f64x8, first_latitudes_f64x8);
+    __m512d longitude_delta_f64x8 = _mm512_sub_pd(second_longitudes_f64x8, first_longitudes_f64x8);
     // Haversine terms: sin²(Δ/2)
-    __m512d latitude_delta_half = _mm512_mul_pd(latitude_delta, half);
-    __m512d longitude_delta_half = _mm512_mul_pd(longitude_delta, half);
-    __m512d sin_latitude_delta_half = nk_sin_f64x8_skylake_(latitude_delta_half);
-    __m512d sin_longitude_delta_half = nk_sin_f64x8_skylake_(longitude_delta_half);
-    __m512d sin_squared_latitude_delta_half = _mm512_mul_pd(sin_latitude_delta_half, sin_latitude_delta_half);
-    __m512d sin_squared_longitude_delta_half = _mm512_mul_pd(sin_longitude_delta_half, sin_longitude_delta_half);
+    __m512d latitude_delta_half_f64x8 = _mm512_mul_pd(latitude_delta_f64x8, half_f64x8);
+    __m512d longitude_delta_half_f64x8 = _mm512_mul_pd(longitude_delta_f64x8, half_f64x8);
+    __m512d sin_latitude_delta_half_f64x8 = nk_sin_f64x8_skylake_(latitude_delta_half_f64x8);
+    __m512d sin_longitude_delta_half_f64x8 = nk_sin_f64x8_skylake_(longitude_delta_half_f64x8);
+    __m512d sin_squared_latitude_delta_half_f64x8 = _mm512_mul_pd(sin_latitude_delta_half_f64x8,
+                                                                  sin_latitude_delta_half_f64x8);
+    __m512d sin_squared_longitude_delta_half_f64x8 = _mm512_mul_pd(sin_longitude_delta_half_f64x8,
+                                                                   sin_longitude_delta_half_f64x8);
     // Latitude cosine product
-    __m512d cos_first_latitude = nk_cos_f64x8_skylake_(first_latitudes);
-    __m512d cos_second_latitude = nk_cos_f64x8_skylake_(second_latitudes);
-    __m512d cos_latitude_product = _mm512_mul_pd(cos_first_latitude, cos_second_latitude);
+    __m512d cos_first_latitude_f64x8 = nk_cos_f64x8_skylake_(first_latitudes_f64x8);
+    __m512d cos_second_latitude_f64x8 = nk_cos_f64x8_skylake_(second_latitudes_f64x8);
+    __m512d cos_latitude_product_f64x8 = _mm512_mul_pd(cos_first_latitude_f64x8, cos_second_latitude_f64x8);
     // a = sin²(Δlat/2) + cos(lat1) × cos(lat2) × sin²(Δlon/2)
-    __m512d haversine_term = _mm512_add_pd(sin_squared_latitude_delta_half,
-                                           _mm512_mul_pd(cos_latitude_product, sin_squared_longitude_delta_half));
-    // Clamp haversine_term to [0, 1] to prevent NaN from sqrt of negative values
-    __m512d zero = _mm512_setzero_pd();
-    haversine_term = _mm512_max_pd(zero, _mm512_min_pd(one, haversine_term));
+    __m512d haversine_term_f64x8 = _mm512_add_pd(
+        sin_squared_latitude_delta_half_f64x8,
+        _mm512_mul_pd(cos_latitude_product_f64x8, sin_squared_longitude_delta_half_f64x8));
+    // Clamp haversine_term_f64x8 to [0, 1] to prevent NaN from sqrt of negative values
+    __m512d zero_f64x8 = _mm512_setzero_pd();
+    haversine_term_f64x8 = _mm512_max_pd(zero_f64x8, _mm512_min_pd(one_f64x8, haversine_term_f64x8));
     // Central angle: c = 2 × atan2(√a, √(1-a))
-    __m512d sqrt_haversine = _mm512_sqrt_pd(haversine_term);
-    __m512d sqrt_complement = _mm512_sqrt_pd(_mm512_sub_pd(one, haversine_term));
-    __m512d central_angle = _mm512_mul_pd(two, nk_atan2_f64x8_skylake_(sqrt_haversine, sqrt_complement));
+    __m512d sqrt_haversine_f64x8 = _mm512_sqrt_pd(haversine_term_f64x8);
+    __m512d sqrt_complement_f64x8 = _mm512_sqrt_pd(_mm512_sub_pd(one_f64x8, haversine_term_f64x8));
+    __m512d central_angle_f64x8 = _mm512_mul_pd(two_f64x8,
+                                                nk_atan2_f64x8_skylake_(sqrt_haversine_f64x8, sqrt_complement_f64x8));
-    return _mm512_mul_pd(earth_radius, central_angle);
+    return _mm512_mul_pd(earth_radius_f64x8, central_angle_f64x8);
 }
 NK_PUBLIC void nk_haversine_f64_skylake(            //
@@ -83,14 +88,14 @@ NK_PUBLIC void nk_haversine_f64_skylake(            //
     nk_size_t n, nk_f64_t *results) {
     while (n >= 8) {
-        __m512d first_latitudes = _mm512_loadu_pd(a_lats);
-        __m512d first_longitudes = _mm512_loadu_pd(a_lons);
-        __m512d second_latitudes = _mm512_loadu_pd(b_lats);
-        __m512d second_longitudes = _mm512_loadu_pd(b_lons);
+        __m512d first_latitudes_f64x8 = _mm512_loadu_pd(a_lats);
+        __m512d first_longitudes_f64x8 = _mm512_loadu_pd(a_lons);
+        __m512d second_latitudes_f64x8 = _mm512_loadu_pd(b_lats);
+        __m512d second_longitudes_f64x8 = _mm512_loadu_pd(b_lons);
-        __m512d distances = nk_haversine_f64x8_skylake_(first_latitudes, first_longitudes, second_latitudes,
-                                                        second_longitudes);
-        _mm512_storeu_pd(results, distances);
+        __m512d distances_f64x8 = nk_haversine_f64x8_skylake_(first_latitudes_f64x8, first_longitudes_f64x8,
+                                                              second_latitudes_f64x8, second_longitudes_f64x8);
+        _mm512_storeu_pd(results, distances_f64x8);
         a_lats += 8, a_lons += 8, b_lats += 8, b_lons += 8, results += 8, n -= 8;
     }
@@ -98,14 +103,14 @@ NK_PUBLIC void nk_haversine_f64_skylake(            //
     // Handle remaining elements with masked operations
     if (n > 0) {
         __mmask8 mask = (__mmask8)_bzhi_u32(0xFF, n);
-        __m512d first_latitudes = _mm512_maskz_loadu_pd(mask, a_lats);
-        __m512d first_longitudes = _mm512_maskz_loadu_pd(mask, a_lons);
-        __m512d second_latitudes = _mm512_maskz_loadu_pd(mask, b_lats);
-        __m512d second_longitudes = _mm512_maskz_loadu_pd(mask, b_lons);
-        __m512d distances = nk_haversine_f64x8_skylake_(first_latitudes, first_longitudes, second_latitudes,
-                                                        second_longitudes);
-        _mm512_mask_storeu_pd(results, mask, distances);
+        __m512d first_latitudes_f64x8 = _mm512_maskz_loadu_pd(mask, a_lats);
+        __m512d first_longitudes_f64x8 = _mm512_maskz_loadu_pd(mask, a_lons);
+        __m512d second_latitudes_f64x8 = _mm512_maskz_loadu_pd(mask, b_lats);
+        __m512d second_longitudes_f64x8 = _mm512_maskz_loadu_pd(mask, b_lons);
+        __m512d distances_f64x8 = nk_haversine_f64x8_skylake_(first_latitudes_f64x8, first_longitudes_f64x8,
+                                                              second_latitudes_f64x8, second_longitudes_f64x8);
+        _mm512_mask_storeu_pd(results, mask, distances_f64x8);
     }
 }
@@ -113,158 +118,171 @@ NK_PUBLIC void nk_haversine_f64_skylake(            //
  *  @brief  AVX-512 helper for Vincenty's geodesic distance on 8 f64 point pairs.
  *  @note   This is a true SIMD implementation using masked convergence tracking.
  */
-NK_INTERNAL __m512d nk_vincenty_f64x8_skylake_(        //
-    __m512d first_latitudes, __m512d first_longitudes, //
-    __m512d second_latitudes, __m512d second_longitudes) {
-    __m512d const equatorial_radius = _mm512_set1_pd(NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
-    __m512d const polar_radius = _mm512_set1_pd(NK_EARTH_ELLIPSOID_POLAR_RADIUS);
-    __m512d const flattening = _mm512_set1_pd(1.0 / NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
-    __m512d const convergence_threshold = _mm512_set1_pd(NK_VINCENTY_CONVERGENCE_THRESHOLD_F64);
-    __m512d const one = _mm512_set1_pd(1.0);
-    __m512d const two = _mm512_set1_pd(2.0);
-    __m512d const three = _mm512_set1_pd(3.0);
-    __m512d const four = _mm512_set1_pd(4.0);
-    __m512d const six = _mm512_set1_pd(6.0);
-    __m512d const sixteen = _mm512_set1_pd(16.0);
+NK_INTERNAL __m512d nk_vincenty_f64x8_skylake_(                    //
+    __m512d first_latitudes_f64x8, __m512d first_longitudes_f64x8, //
+    __m512d second_latitudes_f64x8, __m512d second_longitudes_f64x8) {
+    __m512d const equatorial_radius_f64x8 = _mm512_set1_pd(NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
+    __m512d const polar_radius_f64x8 = _mm512_set1_pd(NK_EARTH_ELLIPSOID_POLAR_RADIUS);
+    __m512d const flattening_f64x8 = _mm512_set1_pd(1.0 / NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
+    __m512d const convergence_threshold_f64x8 = _mm512_set1_pd(NK_VINCENTY_CONVERGENCE_THRESHOLD_F64);
+    __m512d const one_f64x8 = _mm512_set1_pd(1.0);
+    __m512d const two_f64x8 = _mm512_set1_pd(2.0);
+    __m512d const three_f64x8 = _mm512_set1_pd(3.0);
+    __m512d const four_f64x8 = _mm512_set1_pd(4.0);
+    __m512d const six_f64x8 = _mm512_set1_pd(6.0);
+    __m512d const sixteen_f64x8 = _mm512_set1_pd(16.0);
     // Longitude difference
-    __m512d longitude_difference = _mm512_sub_pd(second_longitudes, first_longitudes);
+    __m512d longitude_difference_f64x8 = _mm512_sub_pd(second_longitudes_f64x8, first_longitudes_f64x8);
     // Reduced latitudes: tan(U) = (1-f) * tan(lat)
-    __m512d one_minus_f = _mm512_sub_pd(one, flattening);
-    __m512d tan_first = _mm512_div_pd(nk_sin_f64x8_skylake_(first_latitudes), nk_cos_f64x8_skylake_(first_latitudes));
-    __m512d tan_second = _mm512_div_pd(nk_sin_f64x8_skylake_(second_latitudes),
-                                       nk_cos_f64x8_skylake_(second_latitudes));
-    __m512d tan_reduced_first = _mm512_mul_pd(one_minus_f, tan_first);
-    __m512d tan_reduced_second = _mm512_mul_pd(one_minus_f, tan_second);
+    __m512d one_minus_f_f64x8 = _mm512_sub_pd(one_f64x8, flattening_f64x8);
+    __m512d tan_first_f64x8 = _mm512_div_pd(nk_sin_f64x8_skylake_(first_latitudes_f64x8),
+                                            nk_cos_f64x8_skylake_(first_latitudes_f64x8));
+    __m512d tan_second_f64x8 = _mm512_div_pd(nk_sin_f64x8_skylake_(second_latitudes_f64x8),
+                                             nk_cos_f64x8_skylake_(second_latitudes_f64x8));
+    __m512d tan_reduced_first_f64x8 = _mm512_mul_pd(one_minus_f_f64x8, tan_first_f64x8);
+    __m512d tan_reduced_second_f64x8 = _mm512_mul_pd(one_minus_f_f64x8, tan_second_f64x8);
     // cos(U) = 1/√(1 + tan²(U)), sin(U) = tan(U) × cos(U)
-    __m512d cos_reduced_first = _mm512_div_pd(
-        one, _mm512_sqrt_pd(_mm512_fmadd_pd(tan_reduced_first, tan_reduced_first, one)));
-    __m512d sin_reduced_first = _mm512_mul_pd(tan_reduced_first, cos_reduced_first);
-    __m512d cos_reduced_second = _mm512_div_pd(
-        one, _mm512_sqrt_pd(_mm512_fmadd_pd(tan_reduced_second, tan_reduced_second, one)));
-    __m512d sin_reduced_second = _mm512_mul_pd(tan_reduced_second, cos_reduced_second);
-    // Initialize lambda and tracking variables
-    __m512d lambda = longitude_difference;
-    __m512d sin_angular_distance, cos_angular_distance, angular_distance;
-    __m512d sin_azimuth, cos_squared_azimuth, cos_double_angular_midpoint;
+    __m512d cos_reduced_first_f64x8 = _mm512_div_pd(
+        one_f64x8, _mm512_sqrt_pd(_mm512_fmadd_pd(tan_reduced_first_f64x8, tan_reduced_first_f64x8, one_f64x8)));
+    __m512d sin_reduced_first_f64x8 = _mm512_mul_pd(tan_reduced_first_f64x8, cos_reduced_first_f64x8);
+    __m512d cos_reduced_second_f64x8 = _mm512_div_pd(
+        one_f64x8, _mm512_sqrt_pd(_mm512_fmadd_pd(tan_reduced_second_f64x8, tan_reduced_second_f64x8, one_f64x8)));
+    __m512d sin_reduced_second_f64x8 = _mm512_mul_pd(tan_reduced_second_f64x8, cos_reduced_second_f64x8);
+    // Initialize lambda_f64x8 and tracking variables
+    __m512d lambda_f64x8 = longitude_difference_f64x8;
+    __m512d sin_angular_distance_f64x8, cos_angular_distance_f64x8, angular_distance_f64x8;
+    __m512d sin_azimuth_f64x8, cos_squared_azimuth_f64x8, cos_double_angular_midpoint_f64x8;
     // Track convergence and coincident points
     __mmask8 converged_mask = 0;
     __mmask8 coincident_mask = 0;
     for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS && converged_mask != 0xFF; ++iteration) {
-        __m512d sin_lambda = nk_sin_f64x8_skylake_(lambda);
-        __m512d cos_lambda = nk_cos_f64x8_skylake_(lambda);
-        // sin²(angular_distance) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
-        __m512d cross_term = _mm512_mul_pd(cos_reduced_second, sin_lambda);
-        __m512d mixed_term = _mm512_sub_pd(
-            _mm512_mul_pd(cos_reduced_first, sin_reduced_second),
-            _mm512_mul_pd(_mm512_mul_pd(sin_reduced_first, cos_reduced_second), cos_lambda));
-        __m512d sin_angular_dist_sq = _mm512_fmadd_pd(cross_term, cross_term, _mm512_mul_pd(mixed_term, mixed_term));
-        sin_angular_distance = _mm512_sqrt_pd(sin_angular_dist_sq);
-        // Check for coincident points (sin_angular_distance ≈ 0)
-        coincident_mask = _mm512_cmp_pd_mask(sin_angular_distance, _mm512_set1_pd(1e-15), _CMP_LT_OS);
-        // cos(angular_distance) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
-        cos_angular_distance = _mm512_fmadd_pd(_mm512_mul_pd(cos_reduced_first, cos_reduced_second), cos_lambda,
-                                               _mm512_mul_pd(sin_reduced_first, sin_reduced_second));
-        // angular_distance = atan2(sin, cos)
-        angular_distance = nk_atan2_f64x8_skylake_(sin_angular_distance, cos_angular_distance);
-        // sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance)
+        __m512d sin_lambda_f64x8 = nk_sin_f64x8_skylake_(lambda_f64x8);
+        __m512d cos_lambda_f64x8 = nk_cos_f64x8_skylake_(lambda_f64x8);
+        // sin²(angular_distance_f64x8) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
+        __m512d cross_term_f64x8 = _mm512_mul_pd(cos_reduced_second_f64x8, sin_lambda_f64x8);
+        __m512d mixed_term_f64x8 = _mm512_sub_pd(
+            _mm512_mul_pd(cos_reduced_first_f64x8, sin_reduced_second_f64x8),
+            _mm512_mul_pd(_mm512_mul_pd(sin_reduced_first_f64x8, cos_reduced_second_f64x8), cos_lambda_f64x8));
+        __m512d sin_angular_dist_sq_f64x8 = _mm512_fmadd_pd(cross_term_f64x8, cross_term_f64x8,
+                                                            _mm512_mul_pd(mixed_term_f64x8, mixed_term_f64x8));
+        sin_angular_distance_f64x8 = _mm512_sqrt_pd(sin_angular_dist_sq_f64x8);
+        // Check for coincident points (sin_angular_distance_f64x8 ≈ 0)
+        coincident_mask = _mm512_cmp_pd_mask(sin_angular_distance_f64x8, _mm512_set1_pd(1e-15), _CMP_LT_OS);
+        // cos(angular_distance_f64x8) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
+        cos_angular_distance_f64x8 = _mm512_fmadd_pd(_mm512_mul_pd(cos_reduced_first_f64x8, cos_reduced_second_f64x8),
+                                                     cos_lambda_f64x8,
+                                                     _mm512_mul_pd(sin_reduced_first_f64x8, sin_reduced_second_f64x8));
+        // angular_distance_f64x8 = atan2(sin, cos)
+        angular_distance_f64x8 = nk_atan2_f64x8_skylake_(sin_angular_distance_f64x8, cos_angular_distance_f64x8);
+        // sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance_f64x8)
         // Use masked divide: zero result for coincident lanes, avoids division by zero
-        sin_azimuth = _mm512_maskz_div_pd(
+        sin_azimuth_f64x8 = _mm512_maskz_div_pd(
             _knot_mask8(coincident_mask),
-            _mm512_mul_pd(_mm512_mul_pd(cos_reduced_first, cos_reduced_second), sin_lambda), sin_angular_distance);
-        cos_squared_azimuth = _mm512_sub_pd(one, _mm512_mul_pd(sin_azimuth, sin_azimuth));
+            _mm512_mul_pd(_mm512_mul_pd(cos_reduced_first_f64x8, cos_reduced_second_f64x8), sin_lambda_f64x8),
+            sin_angular_distance_f64x8);
+        cos_squared_azimuth_f64x8 = _mm512_sub_pd(one_f64x8, _mm512_mul_pd(sin_azimuth_f64x8, sin_azimuth_f64x8));
         // Handle equatorial case: cos²α = 0
-        __mmask8 equatorial_mask = _mm512_cmp_pd_mask(cos_squared_azimuth, _mm512_set1_pd(1e-15), _CMP_LT_OS);
+        __mmask8 equatorial_mask = _mm512_cmp_pd_mask(cos_squared_azimuth_f64x8, _mm512_set1_pd(1e-15), _CMP_LT_OS);
         // cos(2σₘ) = cos(σ) - 2 × sin(U₁) × sin(U₂) / cos²(α)
-        // Use masked divide: for equatorial lanes, quotient = cos_angular_distance (passthrough),
+        // Use masked divide: for equatorial lanes, quotient_f64x8 = cos_angular_distance_f64x8 (passthrough),
         // so subtraction yields zero. Avoids division by zero.
-        __m512d sin_product = _mm512_mul_pd(sin_reduced_first, sin_reduced_second);
-        __m512d quotient = _mm512_mask_div_pd(cos_angular_distance, _knot_mask8(equatorial_mask),
-                                              _mm512_mul_pd(two, sin_product), cos_squared_azimuth);
-        cos_double_angular_midpoint = _mm512_sub_pd(cos_angular_distance, quotient);
+        __m512d sin_product_f64x8 = _mm512_mul_pd(sin_reduced_first_f64x8, sin_reduced_second_f64x8);
+        __m512d quotient_f64x8 = _mm512_mask_div_pd(cos_angular_distance_f64x8, _knot_mask8(equatorial_mask),
+                                                    _mm512_mul_pd(two_f64x8, sin_product_f64x8),
+                                                    cos_squared_azimuth_f64x8);
+        cos_double_angular_midpoint_f64x8 = _mm512_sub_pd(cos_angular_distance_f64x8, quotient_f64x8);
         // C = f/16 * cos²α * (4 + f*(4 - 3*cos²α))
-        __m512d correction_factor = _mm512_mul_pd(
-            _mm512_div_pd(flattening, sixteen),
-            _mm512_mul_pd(cos_squared_azimuth,
-                          _mm512_fmadd_pd(flattening, _mm512_fnmadd_pd(three, cos_squared_azimuth, four), four)));
+        __m512d correction_factor_f64x8 = _mm512_mul_pd(
+            _mm512_div_pd(flattening_f64x8, sixteen_f64x8),
+            _mm512_mul_pd(
+                cos_squared_azimuth_f64x8,
+                _mm512_fmadd_pd(flattening_f64x8, _mm512_fnmadd_pd(three_f64x8, cos_squared_azimuth_f64x8, four_f64x8),
+                                four_f64x8)));
         // λ' = L + (1-C) × f × sin(α) × (σ + C × sin(σ) × (cos(2σₘ) + C × cos(σ) × (-1 + 2 × cos²(2σₘ))))
-        __m512d cos_2sm_sq = _mm512_mul_pd(cos_double_angular_midpoint, cos_double_angular_midpoint);
-        // innermost = -1 + 2 × cos²(2σₘ)
-        __m512d innermost = _mm512_fmadd_pd(two, cos_2sm_sq, _mm512_set1_pd(-1.0));
-        // middle = cos(2σₘ) + C × cos(σ) × innermost
-        __m512d middle = _mm512_fmadd_pd(_mm512_mul_pd(correction_factor, cos_angular_distance), innermost,
-                                         cos_double_angular_midpoint);
-        // inner = C × sin(σ) × middle
-        __m512d inner = _mm512_mul_pd(_mm512_mul_pd(correction_factor, sin_angular_distance), middle);
-        // λ' = L + (1-C) * f * sin_α * (σ + inner)
-        __m512d lambda_new = _mm512_fmadd_pd(
-            _mm512_mul_pd(_mm512_mul_pd(_mm512_sub_pd(one, correction_factor), flattening), sin_azimuth),
-            _mm512_add_pd(angular_distance, inner), longitude_difference);
+        __m512d cos_2sm_sq_f64x8 = _mm512_mul_pd(cos_double_angular_midpoint_f64x8, cos_double_angular_midpoint_f64x8);
+        // innermost_f64x8 = -1 + 2 × cos²(2σₘ)
+        __m512d innermost_f64x8 = _mm512_fmadd_pd(two_f64x8, cos_2sm_sq_f64x8, _mm512_set1_pd(-1.0));
+        // middle_f64x8 = cos(2σₘ) + C × cos(σ) × innermost_f64x8
+        __m512d middle_f64x8 = _mm512_fmadd_pd(_mm512_mul_pd(correction_factor_f64x8, cos_angular_distance_f64x8),
+                                               innermost_f64x8, cos_double_angular_midpoint_f64x8);
+        // inner_f64x8 = C × sin(σ) × middle_f64x8
+        __m512d inner_f64x8 = _mm512_mul_pd(_mm512_mul_pd(correction_factor_f64x8, sin_angular_distance_f64x8),
+                                            middle_f64x8);
+        // λ' = L + (1-C) * f * sin_α * (σ + inner_f64x8)
+        __m512d lambda_new_f64x8 = _mm512_fmadd_pd(
+            _mm512_mul_pd(_mm512_mul_pd(_mm512_sub_pd(one_f64x8, correction_factor_f64x8), flattening_f64x8),
+                          sin_azimuth_f64x8),
+            _mm512_add_pd(angular_distance_f64x8, inner_f64x8), longitude_difference_f64x8);
         // Check convergence: |λ - λ'| < threshold
-        __m512d lambda_diff = _mm512_abs_pd(_mm512_sub_pd(lambda_new, lambda));
-        converged_mask = _mm512_cmp_pd_mask(lambda_diff, convergence_threshold, _CMP_LT_OS);
+        __m512d lambda_diff_f64x8 = _mm512_abs_pd(_mm512_sub_pd(lambda_new_f64x8, lambda_f64x8));
+        converged_mask = _mm512_cmp_pd_mask(lambda_diff_f64x8, convergence_threshold_f64x8, _CMP_LT_OS);
-        lambda = lambda_new;
+        lambda_f64x8 = lambda_new_f64x8;
     }
     // Final distance calculation
     // u² = cos²α * (a² - b²) / b²
-    __m512d a_sq = _mm512_mul_pd(equatorial_radius, equatorial_radius);
-    __m512d b_sq = _mm512_mul_pd(polar_radius, polar_radius);
-    __m512d u_squared = _mm512_div_pd(_mm512_mul_pd(cos_squared_azimuth, _mm512_sub_pd(a_sq, b_sq)), b_sq);
+    __m512d a_sq_f64x8 = _mm512_mul_pd(equatorial_radius_f64x8, equatorial_radius_f64x8);
+    __m512d b_sq_f64x8 = _mm512_mul_pd(polar_radius_f64x8, polar_radius_f64x8);
+    __m512d u_squared_f64x8 = _mm512_div_pd(
+        _mm512_mul_pd(cos_squared_azimuth_f64x8, _mm512_sub_pd(a_sq_f64x8, b_sq_f64x8)), b_sq_f64x8);
     // A = 1 + u²/16384 * (4096 + u²*(-768 + u²*(320 - 175*u²)))
-    __m512d series_a = _mm512_fmadd_pd(u_squared, _mm512_set1_pd(-175.0), _mm512_set1_pd(320.0));
-    series_a = _mm512_fmadd_pd(u_squared, series_a, _mm512_set1_pd(-768.0));
-    series_a = _mm512_fmadd_pd(u_squared, series_a, _mm512_set1_pd(4096.0));
-    series_a = _mm512_fmadd_pd(_mm512_div_pd(u_squared, _mm512_set1_pd(16384.0)), series_a, one);
+    __m512d series_a_f64x8 = _mm512_fmadd_pd(u_squared_f64x8, _mm512_set1_pd(-175.0), _mm512_set1_pd(320.0));
+    series_a_f64x8 = _mm512_fmadd_pd(u_squared_f64x8, series_a_f64x8, _mm512_set1_pd(-768.0));
+    series_a_f64x8 = _mm512_fmadd_pd(u_squared_f64x8, series_a_f64x8, _mm512_set1_pd(4096.0));
+    series_a_f64x8 = _mm512_fmadd_pd(_mm512_div_pd(u_squared_f64x8, _mm512_set1_pd(16384.0)), series_a_f64x8,
+                                     one_f64x8);
     // B = u²/1024 * (256 + u²*(-128 + u²*(74 - 47*u²)))
-    __m512d series_b = _mm512_fmadd_pd(u_squared, _mm512_set1_pd(-47.0), _mm512_set1_pd(74.0));
-    series_b = _mm512_fmadd_pd(u_squared, series_b, _mm512_set1_pd(-128.0));
-    series_b = _mm512_fmadd_pd(u_squared, series_b, _mm512_set1_pd(256.0));
-    series_b = _mm512_mul_pd(_mm512_div_pd(u_squared, _mm512_set1_pd(1024.0)), series_b);
+    __m512d series_b_f64x8 = _mm512_fmadd_pd(u_squared_f64x8, _mm512_set1_pd(-47.0), _mm512_set1_pd(74.0));
+    series_b_f64x8 = _mm512_fmadd_pd(u_squared_f64x8, series_b_f64x8, _mm512_set1_pd(-128.0));
+    series_b_f64x8 = _mm512_fmadd_pd(u_squared_f64x8, series_b_f64x8, _mm512_set1_pd(256.0));
+    series_b_f64x8 = _mm512_mul_pd(_mm512_div_pd(u_squared_f64x8, _mm512_set1_pd(1024.0)), series_b_f64x8);
     // Δσ = B × sin(σ) × (cos(2σₘ) +
     //      B/4 × (cos(σ) × (-1 + 2 × cos²(2σₘ)) - B/6 × cos(2σₘ) × (-3 + 4 × sin²(σ)) × (-3 + 4 × cos²(2σₘ))))
-    __m512d cos_2sm_sq = _mm512_mul_pd(cos_double_angular_midpoint, cos_double_angular_midpoint);
-    __m512d sin_sq = _mm512_mul_pd(sin_angular_distance, sin_angular_distance);
-    __m512d term1 = _mm512_fmadd_pd(two, cos_2sm_sq, _mm512_set1_pd(-1.0));
-    term1 = _mm512_mul_pd(cos_angular_distance, term1);
-    __m512d term2 = _mm512_fmadd_pd(four, sin_sq, _mm512_set1_pd(-3.0));
-    __m512d term3 = _mm512_fmadd_pd(four, cos_2sm_sq, _mm512_set1_pd(-3.0));
-    term2 = _mm512_mul_pd(_mm512_mul_pd(_mm512_div_pd(series_b, six), cos_double_angular_midpoint),
-                          _mm512_mul_pd(term2, term3));
-    __m512d delta_sigma = _mm512_mul_pd(
-        series_b, _mm512_mul_pd(sin_angular_distance, _mm512_add_pd(cos_double_angular_midpoint,
-                                                                    _mm512_mul_pd(_mm512_div_pd(series_b, four),
-                                                                                  _mm512_sub_pd(term1, term2)))));
+    __m512d cos_2sm_sq_f64x8 = _mm512_mul_pd(cos_double_angular_midpoint_f64x8, cos_double_angular_midpoint_f64x8);
+    __m512d sin_sq_f64x8 = _mm512_mul_pd(sin_angular_distance_f64x8, sin_angular_distance_f64x8);
+    __m512d term1_f64x8 = _mm512_fmadd_pd(two_f64x8, cos_2sm_sq_f64x8, _mm512_set1_pd(-1.0));
+    term1_f64x8 = _mm512_mul_pd(cos_angular_distance_f64x8, term1_f64x8);
+    __m512d term2_f64x8 = _mm512_fmadd_pd(four_f64x8, sin_sq_f64x8, _mm512_set1_pd(-3.0));
+    __m512d term3_f64x8 = _mm512_fmadd_pd(four_f64x8, cos_2sm_sq_f64x8, _mm512_set1_pd(-3.0));
+    term2_f64x8 = _mm512_mul_pd(
+        _mm512_mul_pd(_mm512_div_pd(series_b_f64x8, six_f64x8), cos_double_angular_midpoint_f64x8),
+        _mm512_mul_pd(term2_f64x8, term3_f64x8));
+    __m512d delta_sigma_f64x8 = _mm512_mul_pd(
+        series_b_f64x8, _mm512_mul_pd(sin_angular_distance_f64x8,
+                                      _mm512_add_pd(cos_double_angular_midpoint_f64x8,
+                                                    _mm512_mul_pd(_mm512_div_pd(series_b_f64x8, four_f64x8),
+                                                                  _mm512_sub_pd(term1_f64x8, term2_f64x8)))));
     // s = b * A * (σ - Δσ)
-    __m512d distances = _mm512_mul_pd(_mm512_mul_pd(polar_radius, series_a),
-                                      _mm512_sub_pd(angular_distance, delta_sigma));
+    __m512d distances_f64x8 = _mm512_mul_pd(_mm512_mul_pd(polar_radius_f64x8, series_a_f64x8),
+                                            _mm512_sub_pd(angular_distance_f64x8, delta_sigma_f64x8));
     // Set coincident points to zero
-    distances = _mm512_mask_blend_pd(coincident_mask, distances, _mm512_setzero_pd());
+    distances_f64x8 = _mm512_mask_blend_pd(coincident_mask, distances_f64x8, _mm512_setzero_pd());
-    return distances;
+    return distances_f64x8;
 }
 NK_PUBLIC void nk_vincenty_f64_skylake(             //
@@ -273,14 +291,14 @@ NK_PUBLIC void nk_vincenty_f64_skylake(             //
     nk_size_t n, nk_f64_t *results) {
     while (n >= 8) {
-        __m512d first_latitudes = _mm512_loadu_pd(a_lats);
-        __m512d first_longitudes = _mm512_loadu_pd(a_lons);
-        __m512d second_latitudes = _mm512_loadu_pd(b_lats);
-        __m512d second_longitudes = _mm512_loadu_pd(b_lons);
+        __m512d first_latitudes_f64x8 = _mm512_loadu_pd(a_lats);
+        __m512d first_longitudes_f64x8 = _mm512_loadu_pd(a_lons);
+        __m512d second_latitudes_f64x8 = _mm512_loadu_pd(b_lats);
+        __m512d second_longitudes_f64x8 = _mm512_loadu_pd(b_lons);
-        __m512d distances = nk_vincenty_f64x8_skylake_(first_latitudes, first_longitudes, second_latitudes,
-                                                       second_longitudes);
-        _mm512_storeu_pd(results, distances);
+        __m512d distances_f64x8 = nk_vincenty_f64x8_skylake_(first_latitudes_f64x8, first_longitudes_f64x8,
+                                                             second_latitudes_f64x8, second_longitudes_f64x8);
+        _mm512_storeu_pd(results, distances_f64x8);
         a_lats += 8, a_lons += 8, b_lats += 8, b_lons += 8, results += 8, n -= 8;
     }
@@ -288,56 +306,60 @@ NK_PUBLIC void nk_vincenty_f64_skylake(             //
     // Handle remaining elements with masked operations
     if (n > 0) {
         __mmask8 mask = (__mmask8)_bzhi_u32(0xFF, n);
-        __m512d first_latitudes = _mm512_maskz_loadu_pd(mask, a_lats);
-        __m512d first_longitudes = _mm512_maskz_loadu_pd(mask, a_lons);
-        __m512d second_latitudes = _mm512_maskz_loadu_pd(mask, b_lats);
-        __m512d second_longitudes = _mm512_maskz_loadu_pd(mask, b_lons);
-        __m512d distances = nk_vincenty_f64x8_skylake_(first_latitudes, first_longitudes, second_latitudes,
-                                                       second_longitudes);
-        _mm512_mask_storeu_pd(results, mask, distances);
+        __m512d first_latitudes_f64x8 = _mm512_maskz_loadu_pd(mask, a_lats);
+        __m512d first_longitudes_f64x8 = _mm512_maskz_loadu_pd(mask, a_lons);
+        __m512d second_latitudes_f64x8 = _mm512_maskz_loadu_pd(mask, b_lats);
+        __m512d second_longitudes_f64x8 = _mm512_maskz_loadu_pd(mask, b_lons);
+        __m512d distances_f64x8 = nk_vincenty_f64x8_skylake_(first_latitudes_f64x8, first_longitudes_f64x8,
+                                                             second_latitudes_f64x8, second_longitudes_f64x8);
+        _mm512_mask_storeu_pd(results, mask, distances_f64x8);
     }
 }
-NK_INTERNAL __m512 nk_haversine_f32x16_skylake_(     //
-    __m512 first_latitudes, __m512 first_longitudes, //
-    __m512 second_latitudes, __m512 second_longitudes) {
+NK_INTERNAL __m512 nk_haversine_f32x16_skylake_(                   //
+    __m512 first_latitudes_f32x16, __m512 first_longitudes_f32x16, //
+    __m512 second_latitudes_f32x16, __m512 second_longitudes_f32x16) {
-    __m512 const earth_radius = _mm512_set1_ps((float)NK_EARTH_MEDIATORIAL_RADIUS);
-    __m512 const half = _mm512_set1_ps(0.5f);
-    __m512 const one = _mm512_set1_ps(1.0f);
-    __m512 const two = _mm512_set1_ps(2.0f);
+    __m512 const earth_radius_f32x16 = _mm512_set1_ps((float)NK_EARTH_MEDIATORIAL_RADIUS);
+    __m512 const half_f32x16 = _mm512_set1_ps(0.5f);
+    __m512 const one_f32x16 = _mm512_set1_ps(1.0f);
+    __m512 const two_f32x16 = _mm512_set1_ps(2.0f);
-    __m512 latitude_delta = _mm512_sub_ps(second_latitudes, first_latitudes);
-    __m512 longitude_delta = _mm512_sub_ps(second_longitudes, first_longitudes);
+    __m512 latitude_delta_f32x16 = _mm512_sub_ps(second_latitudes_f32x16, first_latitudes_f32x16);
+    __m512 longitude_delta_f32x16 = _mm512_sub_ps(second_longitudes_f32x16, first_longitudes_f32x16);
     // Haversine terms: sin²(Δ/2)
-    __m512 latitude_delta_half = _mm512_mul_ps(latitude_delta, half);
-    __m512 longitude_delta_half = _mm512_mul_ps(longitude_delta, half);
-    __m512 sin_latitude_delta_half = nk_sin_f32x16_skylake_(latitude_delta_half);
-    __m512 sin_longitude_delta_half = nk_sin_f32x16_skylake_(longitude_delta_half);
-    __m512 sin_squared_latitude_delta_half = _mm512_mul_ps(sin_latitude_delta_half, sin_latitude_delta_half);
-    __m512 sin_squared_longitude_delta_half = _mm512_mul_ps(sin_longitude_delta_half, sin_longitude_delta_half);
+    __m512 latitude_delta_half_f32x16 = _mm512_mul_ps(latitude_delta_f32x16, half_f32x16);
+    __m512 longitude_delta_half_f32x16 = _mm512_mul_ps(longitude_delta_f32x16, half_f32x16);
+    __m512 sin_latitude_delta_half_f32x16 = nk_sin_f32x16_skylake_(latitude_delta_half_f32x16);
+    __m512 sin_longitude_delta_half_f32x16 = nk_sin_f32x16_skylake_(longitude_delta_half_f32x16);
+    __m512 sin_squared_latitude_delta_half_f32x16 = _mm512_mul_ps(sin_latitude_delta_half_f32x16,
+                                                                  sin_latitude_delta_half_f32x16);
+    __m512 sin_squared_longitude_delta_half_f32x16 = _mm512_mul_ps(sin_longitude_delta_half_f32x16,
+                                                                   sin_longitude_delta_half_f32x16);
     // Latitude cosine product
-    __m512 cos_first_latitude = nk_cos_f32x16_skylake_(first_latitudes);
-    __m512 cos_second_latitude = nk_cos_f32x16_skylake_(second_latitudes);
-    __m512 cos_latitude_product = _mm512_mul_ps(cos_first_latitude, cos_second_latitude);
+    __m512 cos_first_latitude_f32x16 = nk_cos_f32x16_skylake_(first_latitudes_f32x16);
+    __m512 cos_second_latitude_f32x16 = nk_cos_f32x16_skylake_(second_latitudes_f32x16);
+    __m512 cos_latitude_product_f32x16 = _mm512_mul_ps(cos_first_latitude_f32x16, cos_second_latitude_f32x16);
     // a = sin²(Δlat/2) + cos(lat1) × cos(lat2) × sin²(Δlon/2)
-    __m512 haversine_term = _mm512_add_ps(sin_squared_latitude_delta_half,
-                                          _mm512_mul_ps(cos_latitude_product, sin_squared_longitude_delta_half));
+    __m512 haversine_term_f32x16 = _mm512_add_ps(
+        sin_squared_latitude_delta_half_f32x16,
+        _mm512_mul_ps(cos_latitude_product_f32x16, sin_squared_longitude_delta_half_f32x16));
     // Clamp to [0, 1] to avoid NaN from sqrt of negative numbers (due to floating point errors)
-    __m512 zero = _mm512_setzero_ps();
-    haversine_term = _mm512_max_ps(zero, _mm512_min_ps(one, haversine_term));
+    __m512 zero_f32x16 = _mm512_setzero_ps();
+    haversine_term_f32x16 = _mm512_max_ps(zero_f32x16, _mm512_min_ps(one_f32x16, haversine_term_f32x16));
     // Central angle: c = 2 × atan2(√a, √(1-a))
-    __m512 sqrt_haversine = _mm512_sqrt_ps(haversine_term);
-    __m512 sqrt_complement = _mm512_sqrt_ps(_mm512_sub_ps(one, haversine_term));
-    __m512 central_angle = _mm512_mul_ps(two, nk_atan2_f32x16_skylake_(sqrt_haversine, sqrt_complement));
+    __m512 sqrt_haversine_f32x16 = _mm512_sqrt_ps(haversine_term_f32x16);
+    __m512 sqrt_complement_f32x16 = _mm512_sqrt_ps(_mm512_sub_ps(one_f32x16, haversine_term_f32x16));
+    __m512 central_angle_f32x16 = _mm512_mul_ps(
+        two_f32x16, nk_atan2_f32x16_skylake_(sqrt_haversine_f32x16, sqrt_complement_f32x16));
-    return _mm512_mul_ps(earth_radius, central_angle);
+    return _mm512_mul_ps(earth_radius_f32x16, central_angle_f32x16);
 }
 NK_PUBLIC void nk_haversine_f32_skylake(            //
@@ -346,14 +368,14 @@ NK_PUBLIC void nk_haversine_f32_skylake(            //
     nk_size_t n, nk_f32_t *results) {
     while (n >= 16) {
-        __m512 first_latitudes = _mm512_loadu_ps(a_lats);
-        __m512 first_longitudes = _mm512_loadu_ps(a_lons);
-        __m512 second_latitudes = _mm512_loadu_ps(b_lats);
-        __m512 second_longitudes = _mm512_loadu_ps(b_lons);
+        __m512 first_latitudes_f32x16 = _mm512_loadu_ps(a_lats);
+        __m512 first_longitudes_f32x16 = _mm512_loadu_ps(a_lons);
+        __m512 second_latitudes_f32x16 = _mm512_loadu_ps(b_lats);
+        __m512 second_longitudes_f32x16 = _mm512_loadu_ps(b_lons);
-        __m512 distances = nk_haversine_f32x16_skylake_(first_latitudes, first_longitudes, second_latitudes,
-                                                        second_longitudes);
-        _mm512_storeu_ps(results, distances);
+        __m512 distances_f32x16 = nk_haversine_f32x16_skylake_(first_latitudes_f32x16, first_longitudes_f32x16,
+                                                               second_latitudes_f32x16, second_longitudes_f32x16);
+        _mm512_storeu_ps(results, distances_f32x16);
         a_lats += 16, a_lons += 16, b_lats += 16, b_lons += 16, results += 16, n -= 16;
     }
@@ -361,14 +383,14 @@ NK_PUBLIC void nk_haversine_f32_skylake(            //
     // Handle remaining elements with masked operations
     if (n > 0) {
         __mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, n);
-        __m512 first_latitudes = _mm512_maskz_loadu_ps(mask, a_lats);
-        __m512 first_longitudes = _mm512_maskz_loadu_ps(mask, a_lons);
-        __m512 second_latitudes = _mm512_maskz_loadu_ps(mask, b_lats);
-        __m512 second_longitudes = _mm512_maskz_loadu_ps(mask, b_lons);
-        __m512 distances = nk_haversine_f32x16_skylake_(first_latitudes, first_longitudes, second_latitudes,
-                                                        second_longitudes);
-        _mm512_mask_storeu_ps(results, mask, distances);
+        __m512 first_latitudes_f32x16 = _mm512_maskz_loadu_ps(mask, a_lats);
+        __m512 first_longitudes_f32x16 = _mm512_maskz_loadu_ps(mask, a_lons);
+        __m512 second_latitudes_f32x16 = _mm512_maskz_loadu_ps(mask, b_lats);
+        __m512 second_longitudes_f32x16 = _mm512_maskz_loadu_ps(mask, b_lons);
+        __m512 distances_f32x16 = nk_haversine_f32x16_skylake_(first_latitudes_f32x16, first_longitudes_f32x16,
+                                                               second_latitudes_f32x16, second_longitudes_f32x16);
+        _mm512_mask_storeu_ps(results, mask, distances_f32x16);
     }
 }
@@ -376,158 +398,172 @@ NK_PUBLIC void nk_haversine_f32_skylake(            //
  *  @brief  AVX-512 helper for Vincenty's geodesic distance on 16 f32 point pairs.
  *  @note   This is a true SIMD implementation using masked convergence tracking.
  */
-NK_INTERNAL __m512 nk_vincenty_f32x16_skylake_(      //
-    __m512 first_latitudes, __m512 first_longitudes, //
-    __m512 second_latitudes, __m512 second_longitudes) {
-    __m512 const equatorial_radius = _mm512_set1_ps((float)NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
-    __m512 const polar_radius = _mm512_set1_ps((float)NK_EARTH_ELLIPSOID_POLAR_RADIUS);
-    __m512 const flattening = _mm512_set1_ps(1.0f / (float)NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
-    __m512 const convergence_threshold = _mm512_set1_ps(NK_VINCENTY_CONVERGENCE_THRESHOLD_F32);
-    __m512 const one = _mm512_set1_ps(1.0f);
-    __m512 const two = _mm512_set1_ps(2.0f);
-    __m512 const three = _mm512_set1_ps(3.0f);
-    __m512 const four = _mm512_set1_ps(4.0f);
-    __m512 const six = _mm512_set1_ps(6.0f);
-    __m512 const sixteen = _mm512_set1_ps(16.0f);
+NK_INTERNAL __m512 nk_vincenty_f32x16_skylake_(                    //
+    __m512 first_latitudes_f32x16, __m512 first_longitudes_f32x16, //
+    __m512 second_latitudes_f32x16, __m512 second_longitudes_f32x16) {
+    __m512 const equatorial_radius_f32x16 = _mm512_set1_ps((float)NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
+    __m512 const polar_radius_f32x16 = _mm512_set1_ps((float)NK_EARTH_ELLIPSOID_POLAR_RADIUS);
+    __m512 const flattening_f32x16 = _mm512_set1_ps(1.0f / (float)NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
+    __m512 const convergence_threshold_f32x16 = _mm512_set1_ps(NK_VINCENTY_CONVERGENCE_THRESHOLD_F32);
+    __m512 const one_f32x16 = _mm512_set1_ps(1.0f);
+    __m512 const two_f32x16 = _mm512_set1_ps(2.0f);
+    __m512 const three_f32x16 = _mm512_set1_ps(3.0f);
+    __m512 const four_f32x16 = _mm512_set1_ps(4.0f);
+    __m512 const six_f32x16 = _mm512_set1_ps(6.0f);
+    __m512 const sixteen_f32x16 = _mm512_set1_ps(16.0f);
     // Longitude difference
-    __m512 longitude_difference = _mm512_sub_ps(second_longitudes, first_longitudes);
+    __m512 longitude_difference_f32x16 = _mm512_sub_ps(second_longitudes_f32x16, first_longitudes_f32x16);
     // Reduced latitudes: tan(U) = (1-f) * tan(lat)
-    __m512 one_minus_f = _mm512_sub_ps(one, flattening);
-    __m512 tan_first = _mm512_div_ps(nk_sin_f32x16_skylake_(first_latitudes), nk_cos_f32x16_skylake_(first_latitudes));
-    __m512 tan_second = _mm512_div_ps(nk_sin_f32x16_skylake_(second_latitudes),
-                                      nk_cos_f32x16_skylake_(second_latitudes));
-    __m512 tan_reduced_first = _mm512_mul_ps(one_minus_f, tan_first);
-    __m512 tan_reduced_second = _mm512_mul_ps(one_minus_f, tan_second);
+    __m512 one_minus_f_f32x16 = _mm512_sub_ps(one_f32x16, flattening_f32x16);
+    __m512 tan_first_f32x16 = _mm512_div_ps(nk_sin_f32x16_skylake_(first_latitudes_f32x16),
+                                            nk_cos_f32x16_skylake_(first_latitudes_f32x16));
+    __m512 tan_second_f32x16 = _mm512_div_ps(nk_sin_f32x16_skylake_(second_latitudes_f32x16),
+                                             nk_cos_f32x16_skylake_(second_latitudes_f32x16));
+    __m512 tan_reduced_first_f32x16 = _mm512_mul_ps(one_minus_f_f32x16, tan_first_f32x16);
+    __m512 tan_reduced_second_f32x16 = _mm512_mul_ps(one_minus_f_f32x16, tan_second_f32x16);
     // cos(U) = 1/√(1 + tan²(U)), sin(U) = tan(U) × cos(U)
-    __m512 cos_reduced_first = _mm512_div_ps(
-        one, _mm512_sqrt_ps(_mm512_fmadd_ps(tan_reduced_first, tan_reduced_first, one)));
-    __m512 sin_reduced_first = _mm512_mul_ps(tan_reduced_first, cos_reduced_first);
-    __m512 cos_reduced_second = _mm512_div_ps(
-        one, _mm512_sqrt_ps(_mm512_fmadd_ps(tan_reduced_second, tan_reduced_second, one)));
-    __m512 sin_reduced_second = _mm512_mul_ps(tan_reduced_second, cos_reduced_second);
-    // Initialize lambda and tracking variables
-    __m512 lambda = longitude_difference;
-    __m512 sin_angular_distance, cos_angular_distance, angular_distance;
-    __m512 sin_azimuth, cos_squared_azimuth, cos_double_angular_midpoint;
+    __m512 cos_reduced_first_f32x16 = _mm512_div_ps(
+        one_f32x16, _mm512_sqrt_ps(_mm512_fmadd_ps(tan_reduced_first_f32x16, tan_reduced_first_f32x16, one_f32x16)));
+    __m512 sin_reduced_first_f32x16 = _mm512_mul_ps(tan_reduced_first_f32x16, cos_reduced_first_f32x16);
+    __m512 cos_reduced_second_f32x16 = _mm512_div_ps(
+        one_f32x16, _mm512_sqrt_ps(_mm512_fmadd_ps(tan_reduced_second_f32x16, tan_reduced_second_f32x16, one_f32x16)));
+    __m512 sin_reduced_second_f32x16 = _mm512_mul_ps(tan_reduced_second_f32x16, cos_reduced_second_f32x16);
+    // Initialize lambda_f32x16 and tracking variables
+    __m512 lambda_f32x16 = longitude_difference_f32x16;
+    __m512 sin_angular_distance_f32x16, cos_angular_distance_f32x16, angular_distance_f32x16;
+    __m512 sin_azimuth_f32x16, cos_squared_azimuth_f32x16, cos_double_angular_midpoint_f32x16;
     // Track convergence and coincident points
     __mmask16 converged_mask = 0;
     __mmask16 coincident_mask = 0;
     for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS && converged_mask != 0xFFFF; ++iteration) {
-        __m512 sin_lambda = nk_sin_f32x16_skylake_(lambda);
-        __m512 cos_lambda = nk_cos_f32x16_skylake_(lambda);
-        // sin²(angular_distance) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
-        __m512 cross_term = _mm512_mul_ps(cos_reduced_second, sin_lambda);
-        __m512 mixed_term = _mm512_sub_ps(
-            _mm512_mul_ps(cos_reduced_first, sin_reduced_second),
-            _mm512_mul_ps(_mm512_mul_ps(sin_reduced_first, cos_reduced_second), cos_lambda));
-        __m512 sin_angular_dist_sq = _mm512_fmadd_ps(cross_term, cross_term, _mm512_mul_ps(mixed_term, mixed_term));
-        sin_angular_distance = _mm512_sqrt_ps(sin_angular_dist_sq);
-        // Check for coincident points (sin_angular_distance ≈ 0)
-        coincident_mask = _mm512_cmp_ps_mask(sin_angular_distance, _mm512_set1_ps(1e-7f), _CMP_LT_OS);
-        // cos(angular_distance) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
-        cos_angular_distance = _mm512_fmadd_ps(_mm512_mul_ps(cos_reduced_first, cos_reduced_second), cos_lambda,
-                                               _mm512_mul_ps(sin_reduced_first, sin_reduced_second));
-        // angular_distance = atan2(sin, cos)
-        angular_distance = nk_atan2_f32x16_skylake_(sin_angular_distance, cos_angular_distance);
-        // sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance)
+        __m512 sin_lambda_f32x16 = nk_sin_f32x16_skylake_(lambda_f32x16);
+        __m512 cos_lambda_f32x16 = nk_cos_f32x16_skylake_(lambda_f32x16);
+        // sin²(angular_distance_f32x16) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
+        __m512 cross_term_f32x16 = _mm512_mul_ps(cos_reduced_second_f32x16, sin_lambda_f32x16);
+        __m512 mixed_term_f32x16 = _mm512_sub_ps(
+            _mm512_mul_ps(cos_reduced_first_f32x16, sin_reduced_second_f32x16),
+            _mm512_mul_ps(_mm512_mul_ps(sin_reduced_first_f32x16, cos_reduced_second_f32x16), cos_lambda_f32x16));
+        __m512 sin_angular_dist_sq_f32x16 = _mm512_fmadd_ps(cross_term_f32x16, cross_term_f32x16,
+                                                            _mm512_mul_ps(mixed_term_f32x16, mixed_term_f32x16));
+        sin_angular_distance_f32x16 = _mm512_sqrt_ps(sin_angular_dist_sq_f32x16);
+        // Check for coincident points (sin_angular_distance_f32x16 ≈ 0)
+        coincident_mask = _mm512_cmp_ps_mask(sin_angular_distance_f32x16, _mm512_set1_ps(1e-7f), _CMP_LT_OS);
+        // cos(angular_distance_f32x16) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
+        cos_angular_distance_f32x16 = _mm512_fmadd_ps(
+            _mm512_mul_ps(cos_reduced_first_f32x16, cos_reduced_second_f32x16), cos_lambda_f32x16,
+            _mm512_mul_ps(sin_reduced_first_f32x16, sin_reduced_second_f32x16));
+        // angular_distance_f32x16 = atan2(sin, cos)
+        angular_distance_f32x16 = nk_atan2_f32x16_skylake_(sin_angular_distance_f32x16, cos_angular_distance_f32x16);
+        // sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance_f32x16)
         // Use masked divide: zero result for coincident lanes, avoids division by zero
-        sin_azimuth = _mm512_maskz_div_ps(
+        sin_azimuth_f32x16 = _mm512_maskz_div_ps(
             _knot_mask16(coincident_mask),
-            _mm512_mul_ps(_mm512_mul_ps(cos_reduced_first, cos_reduced_second), sin_lambda), sin_angular_distance);
-        cos_squared_azimuth = _mm512_sub_ps(one, _mm512_mul_ps(sin_azimuth, sin_azimuth));
+            _mm512_mul_ps(_mm512_mul_ps(cos_reduced_first_f32x16, cos_reduced_second_f32x16), sin_lambda_f32x16),
+            sin_angular_distance_f32x16);
+        cos_squared_azimuth_f32x16 = _mm512_sub_ps(one_f32x16, _mm512_mul_ps(sin_azimuth_f32x16, sin_azimuth_f32x16));
         // Handle equatorial case: cos²α = 0
-        __mmask16 equatorial_mask = _mm512_cmp_ps_mask(cos_squared_azimuth, _mm512_set1_ps(1e-7f), _CMP_LT_OS);
+        __mmask16 equatorial_mask = _mm512_cmp_ps_mask(cos_squared_azimuth_f32x16, _mm512_set1_ps(1e-7f), _CMP_LT_OS);
         // cos(2σₘ) = cos(σ) - 2 × sin(U₁) × sin(U₂) / cos²(α)
-        // Use masked divide: for equatorial lanes, quotient = cos_angular_distance (passthrough),
+        // Use masked divide: for equatorial lanes, quotient_f32x16 = cos_angular_distance_f32x16 (passthrough),
         // so subtraction yields zero. Avoids division by zero.
-        __m512 sin_product = _mm512_mul_ps(sin_reduced_first, sin_reduced_second);
-        __m512 quotient = _mm512_mask_div_ps(cos_angular_distance, _knot_mask16(equatorial_mask),
-                                             _mm512_mul_ps(two, sin_product), cos_squared_azimuth);
-        cos_double_angular_midpoint = _mm512_sub_ps(cos_angular_distance, quotient);
+        __m512 sin_product_f32x16 = _mm512_mul_ps(sin_reduced_first_f32x16, sin_reduced_second_f32x16);
+        __m512 quotient_f32x16 = _mm512_mask_div_ps(cos_angular_distance_f32x16, _knot_mask16(equatorial_mask),
+                                                    _mm512_mul_ps(two_f32x16, sin_product_f32x16),
+                                                    cos_squared_azimuth_f32x16);
+        cos_double_angular_midpoint_f32x16 = _mm512_sub_ps(cos_angular_distance_f32x16, quotient_f32x16);
         // C = f/16 * cos²α * (4 + f*(4 - 3*cos²α))
-        __m512 correction_factor = _mm512_mul_ps(
-            _mm512_div_ps(flattening, sixteen),
-            _mm512_mul_ps(cos_squared_azimuth,
-                          _mm512_fmadd_ps(flattening, _mm512_fnmadd_ps(three, cos_squared_azimuth, four), four)));
+        __m512 correction_factor_f32x16 = _mm512_mul_ps(
+            _mm512_div_ps(flattening_f32x16, sixteen_f32x16),
+            _mm512_mul_ps(
+                cos_squared_azimuth_f32x16,
+                _mm512_fmadd_ps(flattening_f32x16,
+                                _mm512_fnmadd_ps(three_f32x16, cos_squared_azimuth_f32x16, four_f32x16), four_f32x16)));
         // λ' = L + (1-C) × f × sin(α) × (σ + C × sin(σ) × (cos(2σₘ) + C × cos(σ) × (-1 + 2 × cos²(2σₘ))))
-        __m512 cos_2sm_sq = _mm512_mul_ps(cos_double_angular_midpoint, cos_double_angular_midpoint);
-        // innermost = -1 + 2 × cos²(2σₘ)
-        __m512 innermost = _mm512_fmadd_ps(two, cos_2sm_sq, _mm512_set1_ps(-1.0f));
-        // middle = cos(2σₘ) + C × cos(σ) × innermost
-        __m512 middle = _mm512_fmadd_ps(_mm512_mul_ps(correction_factor, cos_angular_distance), innermost,
-                                        cos_double_angular_midpoint);
-        // inner = C × sin(σ) × middle
-        __m512 inner = _mm512_mul_ps(_mm512_mul_ps(correction_factor, sin_angular_distance), middle);
-        // λ' = L + (1-C) * f * sin_α * (σ + inner)
-        __m512 lambda_new = _mm512_fmadd_ps(
-            _mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(one, correction_factor), flattening), sin_azimuth),
-            _mm512_add_ps(angular_distance, inner), longitude_difference);
+        __m512 cos_2sm_sq_f32x16 = _mm512_mul_ps(cos_double_angular_midpoint_f32x16,
+                                                 cos_double_angular_midpoint_f32x16);
+        // innermost_f32x16 = -1 + 2 × cos²(2σₘ)
+        __m512 innermost_f32x16 = _mm512_fmadd_ps(two_f32x16, cos_2sm_sq_f32x16, _mm512_set1_ps(-1.0f));
+        // middle_f32x16 = cos(2σₘ) + C × cos(σ) × innermost_f32x16
+        __m512 middle_f32x16 = _mm512_fmadd_ps(_mm512_mul_ps(correction_factor_f32x16, cos_angular_distance_f32x16),
+                                               innermost_f32x16, cos_double_angular_midpoint_f32x16);
+        // inner_f32x16 = C × sin(σ) × middle_f32x16
+        __m512 inner_f32x16 = _mm512_mul_ps(_mm512_mul_ps(correction_factor_f32x16, sin_angular_distance_f32x16),
+                                            middle_f32x16);
+        // λ' = L + (1-C) * f * sin_α * (σ + inner_f32x16)
+        __m512 lambda_new_f32x16 = _mm512_fmadd_ps(
+            _mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(one_f32x16, correction_factor_f32x16), flattening_f32x16),
+                          sin_azimuth_f32x16),
+            _mm512_add_ps(angular_distance_f32x16, inner_f32x16), longitude_difference_f32x16);
         // Check convergence: |λ - λ'| < threshold
-        __m512 lambda_diff = _mm512_abs_ps(_mm512_sub_ps(lambda_new, lambda));
-        converged_mask = _mm512_cmp_ps_mask(lambda_diff, convergence_threshold, _CMP_LT_OS);
+        __m512 lambda_diff_f32x16 = _mm512_abs_ps(_mm512_sub_ps(lambda_new_f32x16, lambda_f32x16));
+        converged_mask = _mm512_cmp_ps_mask(lambda_diff_f32x16, convergence_threshold_f32x16, _CMP_LT_OS);
-        lambda = lambda_new;
+        lambda_f32x16 = lambda_new_f32x16;
     }
     // Final distance calculation
     // u² = cos²α * (a² - b²) / b²
-    __m512 a_sq = _mm512_mul_ps(equatorial_radius, equatorial_radius);
-    __m512 b_sq = _mm512_mul_ps(polar_radius, polar_radius);
-    __m512 u_squared = _mm512_div_ps(_mm512_mul_ps(cos_squared_azimuth, _mm512_sub_ps(a_sq, b_sq)), b_sq);
+    __m512 a_sq_f32x16 = _mm512_mul_ps(equatorial_radius_f32x16, equatorial_radius_f32x16);
+    __m512 b_sq_f32x16 = _mm512_mul_ps(polar_radius_f32x16, polar_radius_f32x16);
+    __m512 u_squared_f32x16 = _mm512_div_ps(
+        _mm512_mul_ps(cos_squared_azimuth_f32x16, _mm512_sub_ps(a_sq_f32x16, b_sq_f32x16)), b_sq_f32x16);
     // A = 1 + u²/16384 * (4096 + u²*(-768 + u²*(320 - 175*u²)))
-    __m512 series_a = _mm512_fmadd_ps(u_squared, _mm512_set1_ps(-175.0f), _mm512_set1_ps(320.0f));
-    series_a = _mm512_fmadd_ps(u_squared, series_a, _mm512_set1_ps(-768.0f));
-    series_a = _mm512_fmadd_ps(u_squared, series_a, _mm512_set1_ps(4096.0f));
-    series_a = _mm512_fmadd_ps(_mm512_div_ps(u_squared, _mm512_set1_ps(16384.0f)), series_a, one);
+    __m512 series_a_f32x16 = _mm512_fmadd_ps(u_squared_f32x16, _mm512_set1_ps(-175.0f), _mm512_set1_ps(320.0f));
+    series_a_f32x16 = _mm512_fmadd_ps(u_squared_f32x16, series_a_f32x16, _mm512_set1_ps(-768.0f));
+    series_a_f32x16 = _mm512_fmadd_ps(u_squared_f32x16, series_a_f32x16, _mm512_set1_ps(4096.0f));
+    series_a_f32x16 = _mm512_fmadd_ps(_mm512_div_ps(u_squared_f32x16, _mm512_set1_ps(16384.0f)), series_a_f32x16,
+                                      one_f32x16);
     // B = u²/1024 * (256 + u²*(-128 + u²*(74 - 47*u²)))
-    __m512 series_b = _mm512_fmadd_ps(u_squared, _mm512_set1_ps(-47.0f), _mm512_set1_ps(74.0f));
-    series_b = _mm512_fmadd_ps(u_squared, series_b, _mm512_set1_ps(-128.0f));
-    series_b = _mm512_fmadd_ps(u_squared, series_b, _mm512_set1_ps(256.0f));
-    series_b = _mm512_mul_ps(_mm512_div_ps(u_squared, _mm512_set1_ps(1024.0f)), series_b);
+    __m512 series_b_f32x16 = _mm512_fmadd_ps(u_squared_f32x16, _mm512_set1_ps(-47.0f), _mm512_set1_ps(74.0f));
+    series_b_f32x16 = _mm512_fmadd_ps(u_squared_f32x16, series_b_f32x16, _mm512_set1_ps(-128.0f));
+    series_b_f32x16 = _mm512_fmadd_ps(u_squared_f32x16, series_b_f32x16, _mm512_set1_ps(256.0f));
+    series_b_f32x16 = _mm512_mul_ps(_mm512_div_ps(u_squared_f32x16, _mm512_set1_ps(1024.0f)), series_b_f32x16);
     // Δσ = B × sin(σ) × (cos(2σₘ) +
     //      B/4 × (cos(σ) × (-1 + 2 × cos²(2σₘ)) - B/6 × cos(2σₘ) × (-3 + 4 × sin²(σ)) × (-3 + 4 × cos²(2σₘ))))
-    __m512 cos_2sm_sq = _mm512_mul_ps(cos_double_angular_midpoint, cos_double_angular_midpoint);
-    __m512 sin_sq = _mm512_mul_ps(sin_angular_distance, sin_angular_distance);
-    __m512 term1 = _mm512_fmadd_ps(two, cos_2sm_sq, _mm512_set1_ps(-1.0f));
-    term1 = _mm512_mul_ps(cos_angular_distance, term1);
-    __m512 term2 = _mm512_fmadd_ps(four, sin_sq, _mm512_set1_ps(-3.0f));
-    __m512 term3 = _mm512_fmadd_ps(four, cos_2sm_sq, _mm512_set1_ps(-3.0f));
-    term2 = _mm512_mul_ps(_mm512_mul_ps(_mm512_div_ps(series_b, six), cos_double_angular_midpoint),
-                          _mm512_mul_ps(term2, term3));
-    __m512 delta_sigma = _mm512_mul_ps(
-        series_b, _mm512_mul_ps(sin_angular_distance, _mm512_add_ps(cos_double_angular_midpoint,
-                                                                    _mm512_mul_ps(_mm512_div_ps(series_b, four),
-                                                                                  _mm512_sub_ps(term1, term2)))));
+    __m512 cos_2sm_sq_f32x16 = _mm512_mul_ps(cos_double_angular_midpoint_f32x16, cos_double_angular_midpoint_f32x16);
+    __m512 sin_sq_f32x16 = _mm512_mul_ps(sin_angular_distance_f32x16, sin_angular_distance_f32x16);
+    __m512 term1_f32x16 = _mm512_fmadd_ps(two_f32x16, cos_2sm_sq_f32x16, _mm512_set1_ps(-1.0f));
+    term1_f32x16 = _mm512_mul_ps(cos_angular_distance_f32x16, term1_f32x16);
+    __m512 term2_f32x16 = _mm512_fmadd_ps(four_f32x16, sin_sq_f32x16, _mm512_set1_ps(-3.0f));
+    __m512 term3_f32x16 = _mm512_fmadd_ps(four_f32x16, cos_2sm_sq_f32x16, _mm512_set1_ps(-3.0f));
+    term2_f32x16 = _mm512_mul_ps(
+        _mm512_mul_ps(_mm512_div_ps(series_b_f32x16, six_f32x16), cos_double_angular_midpoint_f32x16),
+        _mm512_mul_ps(term2_f32x16, term3_f32x16));
+    __m512 delta_sigma_f32x16 = _mm512_mul_ps(
+        series_b_f32x16, _mm512_mul_ps(sin_angular_distance_f32x16,
+                                       _mm512_add_ps(cos_double_angular_midpoint_f32x16,
+                                                     _mm512_mul_ps(_mm512_div_ps(series_b_f32x16, four_f32x16),
+                                                                   _mm512_sub_ps(term1_f32x16, term2_f32x16)))));
     // s = b * A * (σ - Δσ)
-    __m512 distances = _mm512_mul_ps(_mm512_mul_ps(polar_radius, series_a),
-                                     _mm512_sub_ps(angular_distance, delta_sigma));
+    __m512 distances_f32x16 = _mm512_mul_ps(_mm512_mul_ps(polar_radius_f32x16, series_a_f32x16),
+                                            _mm512_sub_ps(angular_distance_f32x16, delta_sigma_f32x16));
     // Set coincident points to zero
-    distances = _mm512_mask_blend_ps(coincident_mask, distances, _mm512_setzero_ps());
+    distances_f32x16 = _mm512_mask_blend_ps(coincident_mask, distances_f32x16, _mm512_setzero_ps());
-    return distances;
+    return distances_f32x16;
 }
 NK_PUBLIC void nk_vincenty_f32_skylake(             //
@@ -536,14 +572,14 @@ NK_PUBLIC void nk_vincenty_f32_skylake(             //
     nk_size_t n, nk_f32_t *results) {
     while (n >= 16) {
-        __m512 first_latitudes = _mm512_loadu_ps(a_lats);
-        __m512 first_longitudes = _mm512_loadu_ps(a_lons);
-        __m512 second_latitudes = _mm512_loadu_ps(b_lats);
-        __m512 second_longitudes = _mm512_loadu_ps(b_lons);
+        __m512 first_latitudes_f32x16 = _mm512_loadu_ps(a_lats);
+        __m512 first_longitudes_f32x16 = _mm512_loadu_ps(a_lons);
+        __m512 second_latitudes_f32x16 = _mm512_loadu_ps(b_lats);
+        __m512 second_longitudes_f32x16 = _mm512_loadu_ps(b_lons);
-        __m512 distances = nk_vincenty_f32x16_skylake_(first_latitudes, first_longitudes, second_latitudes,
-                                                       second_longitudes);
-        _mm512_storeu_ps(results, distances);
+        __m512 distances_f32x16 = nk_vincenty_f32x16_skylake_(first_latitudes_f32x16, first_longitudes_f32x16,
+                                                              second_latitudes_f32x16, second_longitudes_f32x16);
+        _mm512_storeu_ps(results, distances_f32x16);
         a_lats += 16, a_lons += 16, b_lats += 16, b_lons += 16, results += 16, n -= 16;
     }
@@ -551,14 +587,14 @@ NK_PUBLIC void nk_vincenty_f32_skylake(             //
     // Handle remaining elements with masked operations
     if (n > 0) {
         __mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, n);
-        __m512 first_latitudes = _mm512_maskz_loadu_ps(mask, a_lats);
-        __m512 first_longitudes = _mm512_maskz_loadu_ps(mask, a_lons);
-        __m512 second_latitudes = _mm512_maskz_loadu_ps(mask, b_lats);
-        __m512 second_longitudes = _mm512_maskz_loadu_ps(mask, b_lons);
-        __m512 distances = nk_vincenty_f32x16_skylake_(first_latitudes, first_longitudes, second_latitudes,
-                                                       second_longitudes);
-        _mm512_mask_storeu_ps(results, mask, distances);
+        __m512 first_latitudes_f32x16 = _mm512_maskz_loadu_ps(mask, a_lats);
+        __m512 first_longitudes_f32x16 = _mm512_maskz_loadu_ps(mask, a_lons);
+        __m512 second_latitudes_f32x16 = _mm512_maskz_loadu_ps(mask, b_lats);
+        __m512 second_longitudes_f32x16 = _mm512_maskz_loadu_ps(mask, b_lons);
+        __m512 distances_f32x16 = nk_vincenty_f32x16_skylake_(first_latitudes_f32x16, first_longitudes_f32x16,
+                                                              second_latitudes_f32x16, second_longitudes_f32x16);
+        _mm512_mask_storeu_ps(results, mask, distances_f32x16);
     }
 }