npm - numkong - Versions diffs - 7.0.0 → 7.4.1 - Mend

numkong 7.0.0 → 7.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (315)

package/README.md +239 -122
package/binding.gyp +25 -491
package/c/dispatch_bf16.c +59 -1
package/c/dispatch_e2m3.c +41 -8
package/c/dispatch_e3m2.c +49 -8
package/c/dispatch_e4m3.c +51 -9
package/c/dispatch_e5m2.c +45 -1
package/c/dispatch_f16.c +79 -26
package/c/dispatch_f16c.c +5 -5
package/c/dispatch_f32.c +56 -0
package/c/dispatch_f64.c +52 -0
package/c/dispatch_i4.c +3 -0
package/c/dispatch_i8.c +62 -3
package/c/dispatch_other.c +18 -0
package/c/dispatch_u1.c +54 -9
package/c/dispatch_u4.c +3 -0
package/c/dispatch_u8.c +64 -3
package/c/numkong.c +3 -0
package/include/README.md +79 -9
package/include/numkong/attention/sapphireamx.h +278 -276
package/include/numkong/attention/sme.h +983 -977
package/include/numkong/attention.h +1 -1
package/include/numkong/capabilities.h +289 -94
package/include/numkong/cast/README.md +40 -40
package/include/numkong/cast/diamond.h +64 -0
package/include/numkong/cast/haswell.h +42 -194
package/include/numkong/cast/icelake.h +42 -37
package/include/numkong/cast/loongsonasx.h +252 -0
package/include/numkong/cast/neon.h +216 -249
package/include/numkong/cast/powervsx.h +449 -0
package/include/numkong/cast/rvv.h +223 -274
package/include/numkong/cast/sapphire.h +18 -18
package/include/numkong/cast/serial.h +1018 -944
package/include/numkong/cast/skylake.h +82 -23
package/include/numkong/cast/v128relaxed.h +462 -105
package/include/numkong/cast.h +24 -0
package/include/numkong/cast.hpp +44 -0
package/include/numkong/curved/README.md +17 -17
package/include/numkong/curved/neon.h +131 -7
package/include/numkong/curved/neonbfdot.h +6 -7
package/include/numkong/curved/rvv.h +26 -26
package/include/numkong/curved/smef64.h +186 -182
package/include/numkong/curved.h +14 -18
package/include/numkong/dot/README.md +154 -137
package/include/numkong/dot/alder.h +43 -43
package/include/numkong/dot/diamond.h +158 -0
package/include/numkong/dot/genoa.h +4 -30
package/include/numkong/dot/haswell.h +215 -180
package/include/numkong/dot/icelake.h +190 -76
package/include/numkong/dot/loongsonasx.h +671 -0
package/include/numkong/dot/neon.h +124 -73
package/include/numkong/dot/neonbfdot.h +11 -12
package/include/numkong/dot/neonfhm.h +44 -46
package/include/numkong/dot/neonfp8.h +323 -0
package/include/numkong/dot/neonsdot.h +190 -76
package/include/numkong/dot/powervsx.h +752 -0
package/include/numkong/dot/rvv.h +92 -84
package/include/numkong/dot/rvvbf16.h +12 -12
package/include/numkong/dot/rvvhalf.h +12 -12
package/include/numkong/dot/sapphire.h +4 -4
package/include/numkong/dot/serial.h +66 -30
package/include/numkong/dot/sierra.h +31 -31
package/include/numkong/dot/skylake.h +142 -110
package/include/numkong/dot/sve.h +217 -177
package/include/numkong/dot/svebfdot.h +10 -10
package/include/numkong/dot/svehalf.h +85 -41
package/include/numkong/dot/svesdot.h +89 -0
package/include/numkong/dot/v128relaxed.h +124 -89
package/include/numkong/dot.h +114 -48
package/include/numkong/dots/README.md +203 -203
package/include/numkong/dots/alder.h +12 -9
package/include/numkong/dots/diamond.h +86 -0
package/include/numkong/dots/genoa.h +10 -4
package/include/numkong/dots/haswell.h +63 -48
package/include/numkong/dots/icelake.h +27 -18
package/include/numkong/dots/loongsonasx.h +176 -0
package/include/numkong/dots/neon.h +14 -11
package/include/numkong/dots/neonbfdot.h +4 -3
package/include/numkong/dots/neonfhm.h +11 -9
package/include/numkong/dots/neonfp8.h +99 -0
package/include/numkong/dots/neonsdot.h +48 -12
package/include/numkong/dots/powervsx.h +194 -0
package/include/numkong/dots/rvv.h +451 -344
package/include/numkong/dots/sapphireamx.h +1028 -984
package/include/numkong/dots/serial.h +213 -197
package/include/numkong/dots/sierra.h +10 -7
package/include/numkong/dots/skylake.h +47 -36
package/include/numkong/dots/sme.h +2001 -2364
package/include/numkong/dots/smebi32.h +175 -162
package/include/numkong/dots/smef64.h +328 -323
package/include/numkong/dots/v128relaxed.h +64 -41
package/include/numkong/dots.h +573 -293
package/include/numkong/dots.hpp +45 -43
package/include/numkong/each/README.md +133 -137
package/include/numkong/each/haswell.h +6 -6
package/include/numkong/each/icelake.h +7 -7
package/include/numkong/each/neon.h +76 -42
package/include/numkong/each/neonbfdot.h +11 -12
package/include/numkong/each/neonhalf.h +24 -116
package/include/numkong/each/rvv.h +28 -28
package/include/numkong/each/sapphire.h +27 -161
package/include/numkong/each/serial.h +6 -6
package/include/numkong/each/skylake.h +7 -7
package/include/numkong/each/v128relaxed.h +562 -0
package/include/numkong/each.h +148 -62
package/include/numkong/each.hpp +2 -2
package/include/numkong/geospatial/README.md +18 -18
package/include/numkong/geospatial/haswell.h +365 -325
package/include/numkong/geospatial/neon.h +350 -306
package/include/numkong/geospatial/rvv.h +4 -4
package/include/numkong/geospatial/skylake.h +376 -340
package/include/numkong/geospatial/v128relaxed.h +366 -327
package/include/numkong/geospatial.h +17 -17
package/include/numkong/matrix.hpp +4 -4
package/include/numkong/maxsim/README.md +14 -14
package/include/numkong/maxsim/alder.h +6 -6
package/include/numkong/maxsim/genoa.h +4 -4
package/include/numkong/maxsim/haswell.h +6 -6
package/include/numkong/maxsim/icelake.h +18 -18
package/include/numkong/maxsim/neonsdot.h +21 -21
package/include/numkong/maxsim/sapphireamx.h +14 -14
package/include/numkong/maxsim/serial.h +6 -6
package/include/numkong/maxsim/sme.h +221 -196
package/include/numkong/maxsim/v128relaxed.h +6 -6
package/include/numkong/mesh/README.md +62 -56
package/include/numkong/mesh/haswell.h +339 -464
package/include/numkong/mesh/neon.h +1100 -519
package/include/numkong/mesh/neonbfdot.h +36 -68
package/include/numkong/mesh/rvv.h +530 -435
package/include/numkong/mesh/serial.h +75 -91
package/include/numkong/mesh/skylake.h +1627 -302
package/include/numkong/mesh/v128relaxed.h +443 -330
package/include/numkong/mesh.h +63 -49
package/include/numkong/mesh.hpp +4 -4
package/include/numkong/numkong.h +3 -3
package/include/numkong/numkong.hpp +1 -0
package/include/numkong/probability/README.md +23 -19
package/include/numkong/probability/neon.h +82 -52
package/include/numkong/probability/rvv.h +28 -23
package/include/numkong/probability/serial.h +51 -39
package/include/numkong/probability.h +20 -23
package/include/numkong/random.h +1 -1
package/include/numkong/reduce/README.md +143 -138
package/include/numkong/reduce/alder.h +81 -77
package/include/numkong/reduce/haswell.h +222 -220
package/include/numkong/reduce/neon.h +629 -519
package/include/numkong/reduce/neonbfdot.h +7 -218
package/include/numkong/reduce/neonfhm.h +9 -381
package/include/numkong/reduce/neonsdot.h +9 -9
package/include/numkong/reduce/rvv.h +928 -802
package/include/numkong/reduce/serial.h +23 -27
package/include/numkong/reduce/sierra.h +20 -20
package/include/numkong/reduce/skylake.h +326 -324
package/include/numkong/reduce/v128relaxed.h +52 -52
package/include/numkong/reduce.h +4 -23
package/include/numkong/reduce.hpp +156 -11
package/include/numkong/scalar/README.md +6 -6
package/include/numkong/scalar/haswell.h +26 -17
package/include/numkong/scalar/loongsonasx.h +74 -0
package/include/numkong/scalar/neon.h +9 -9
package/include/numkong/scalar/powervsx.h +96 -0
package/include/numkong/scalar/rvv.h +2 -2
package/include/numkong/scalar/sapphire.h +21 -10
package/include/numkong/scalar/serial.h +21 -21
package/include/numkong/scalar.h +13 -0
package/include/numkong/set/README.md +28 -28
package/include/numkong/set/haswell.h +12 -12
package/include/numkong/set/icelake.h +14 -14
package/include/numkong/set/loongsonasx.h +181 -0
package/include/numkong/set/neon.h +17 -18
package/include/numkong/set/powervsx.h +326 -0
package/include/numkong/set/rvv.h +4 -4
package/include/numkong/set/serial.h +6 -6
package/include/numkong/set/sve.h +60 -59
package/include/numkong/set/v128relaxed.h +6 -6
package/include/numkong/set.h +21 -7
package/include/numkong/sets/README.md +26 -26
package/include/numkong/sets/loongsonasx.h +52 -0
package/include/numkong/sets/powervsx.h +65 -0
package/include/numkong/sets/smebi32.h +395 -364
package/include/numkong/sets.h +83 -40
package/include/numkong/sparse/README.md +4 -4
package/include/numkong/sparse/icelake.h +101 -101
package/include/numkong/sparse/serial.h +1 -1
package/include/numkong/sparse/sve2.h +137 -141
package/include/numkong/sparse/turin.h +12 -12
package/include/numkong/sparse.h +10 -10
package/include/numkong/spatial/README.md +230 -226
package/include/numkong/spatial/alder.h +113 -116
package/include/numkong/spatial/diamond.h +240 -0
package/include/numkong/spatial/genoa.h +0 -68
package/include/numkong/spatial/haswell.h +74 -55
package/include/numkong/spatial/icelake.h +539 -58
package/include/numkong/spatial/loongsonasx.h +483 -0
package/include/numkong/spatial/neon.h +125 -52
package/include/numkong/spatial/neonbfdot.h +8 -9
package/include/numkong/spatial/neonfp8.h +258 -0
package/include/numkong/spatial/neonsdot.h +180 -12
package/include/numkong/spatial/powervsx.h +738 -0
package/include/numkong/spatial/rvv.h +146 -139
package/include/numkong/spatial/rvvbf16.h +17 -12
package/include/numkong/spatial/rvvhalf.h +13 -10
package/include/numkong/spatial/serial.h +13 -12
package/include/numkong/spatial/sierra.h +232 -39
package/include/numkong/spatial/skylake.h +73 -74
package/include/numkong/spatial/sve.h +93 -72
package/include/numkong/spatial/svebfdot.h +29 -29
package/include/numkong/spatial/svehalf.h +52 -26
package/include/numkong/spatial/svesdot.h +142 -0
package/include/numkong/spatial/v128relaxed.h +293 -41
package/include/numkong/spatial.h +338 -82
package/include/numkong/spatials/README.md +194 -194
package/include/numkong/spatials/diamond.h +82 -0
package/include/numkong/spatials/haswell.h +2 -2
package/include/numkong/spatials/loongsonasx.h +153 -0
package/include/numkong/spatials/neonfp8.h +111 -0
package/include/numkong/spatials/neonsdot.h +34 -0
package/include/numkong/spatials/powervsx.h +153 -0
package/include/numkong/spatials/rvv.h +259 -243
package/include/numkong/spatials/sapphireamx.h +173 -173
package/include/numkong/spatials/serial.h +2 -2
package/include/numkong/spatials/skylake.h +2 -2
package/include/numkong/spatials/sme.h +590 -605
package/include/numkong/spatials/smef64.h +139 -130
package/include/numkong/spatials/v128relaxed.h +2 -2
package/include/numkong/spatials.h +820 -500
package/include/numkong/spatials.hpp +49 -48
package/include/numkong/tensor.hpp +406 -17
package/include/numkong/trigonometry/README.md +19 -19
package/include/numkong/trigonometry/haswell.h +402 -401
package/include/numkong/trigonometry/neon.h +386 -387
package/include/numkong/trigonometry/rvv.h +52 -51
package/include/numkong/trigonometry/serial.h +13 -13
package/include/numkong/trigonometry/skylake.h +373 -369
package/include/numkong/trigonometry/v128relaxed.h +375 -374
package/include/numkong/trigonometry.h +13 -13
package/include/numkong/trigonometry.hpp +2 -2
package/include/numkong/types.h +287 -49
package/include/numkong/types.hpp +436 -12
package/include/numkong/vector.hpp +82 -14
package/javascript/dist/cjs/numkong-wasm.js +6 -12
package/javascript/dist/cjs/numkong.d.ts +7 -1
package/javascript/dist/cjs/numkong.js +37 -11
package/javascript/dist/cjs/types.d.ts +9 -0
package/javascript/dist/cjs/types.js +96 -0
package/javascript/dist/esm/numkong-browser.d.ts +14 -0
package/javascript/dist/esm/numkong-browser.js +23 -0
package/javascript/dist/esm/numkong-wasm.js +6 -12
package/javascript/dist/esm/numkong.d.ts +7 -1
package/javascript/dist/esm/numkong.js +37 -11
package/javascript/dist/esm/types.d.ts +9 -0
package/javascript/dist/esm/types.js +96 -0
package/javascript/node-gyp-build.d.ts +4 -1
package/javascript/numkong-browser.ts +40 -0
package/javascript/numkong-wasm.ts +7 -13
package/javascript/numkong.c +5 -26
package/javascript/numkong.ts +36 -11
package/javascript/tsconfig-base.json +1 -0
package/javascript/tsconfig-cjs.json +6 -1
package/javascript/types.ts +110 -0
package/numkong.gypi +101 -0
package/package.json +34 -13
package/probes/arm_neon.c +8 -0
package/probes/arm_neon_bfdot.c +9 -0
package/probes/arm_neon_fhm.c +9 -0
package/probes/arm_neon_half.c +8 -0
package/probes/arm_neon_sdot.c +9 -0
package/probes/arm_neonfp8.c +9 -0
package/probes/arm_sme.c +16 -0
package/probes/arm_sme2.c +16 -0
package/probes/arm_sme2p1.c +16 -0
package/probes/arm_sme_bf16.c +16 -0
package/probes/arm_sme_bi32.c +16 -0
package/probes/arm_sme_f64.c +16 -0
package/probes/arm_sme_fa64.c +14 -0
package/probes/arm_sme_half.c +16 -0
package/probes/arm_sme_lut2.c +15 -0
package/probes/arm_sve.c +18 -0
package/probes/arm_sve2.c +20 -0
package/probes/arm_sve2p1.c +18 -0
package/probes/arm_sve_bfdot.c +20 -0
package/probes/arm_sve_half.c +18 -0
package/probes/arm_sve_sdot.c +21 -0
package/probes/loongarch_lasx.c +12 -0
package/probes/power_vsx.c +12 -0
package/probes/probe.js +127 -0
package/probes/riscv_rvv.c +14 -0
package/probes/riscv_rvv_bb.c +15 -0
package/probes/riscv_rvv_bf16.c +17 -0
package/probes/riscv_rvv_half.c +14 -0
package/probes/wasm_v128relaxed.c +11 -0
package/probes/x86_alder.c +17 -0
package/probes/x86_diamond.c +17 -0
package/probes/x86_genoa.c +17 -0
package/probes/x86_graniteamx.c +19 -0
package/probes/x86_haswell.c +11 -0
package/probes/x86_icelake.c +17 -0
package/probes/x86_sapphire.c +16 -0
package/probes/x86_sapphireamx.c +18 -0
package/probes/x86_sierra.c +17 -0
package/probes/x86_skylake.c +15 -0
package/probes/x86_turin.c +17 -0
package/wasm/numkong-emscripten.js +2 -0
package/wasm/numkong.d.ts +14 -0
package/wasm/numkong.js +1124 -0
package/wasm/numkong.wasm +0 -0
package/include/numkong/curved/neonhalf.h +0 -212
package/include/numkong/dot/neonhalf.h +0 -198
package/include/numkong/dots/neonhalf.h +0 -57
package/include/numkong/mesh/neonhalf.h +0 -616
package/include/numkong/reduce/neonhalf.h +0 -157
package/include/numkong/spatial/neonhalf.h +0 -118
package/include/numkong/spatial/sapphire.h +0 -343
package/include/numkong/spatials/neonhalf.h +0 -58
package/javascript/README.md +0 -246

package/include/numkong/geospatial/haswell.h (changed)

@@ -8,13 +8,14 @@
  *
  *  @section geospatial_haswell_instructions Key AVX2 Geospatial Instructions
  *
- *      Intrinsic               Instruction                     Ice         Genoa
- *      _mm256_sqrt_ps          VSQRTPS (YMM, YMM)              12c @ p0    15c @ p01
- *      _mm256_sqrt_pd          VSQRTPD (YMM, YMM)              13c @ p0    21c @ p01
- *      _mm256_div_ps           VDIVPS (YMM, YMM, YMM)          11c @ p0    11c @ p01
- *      _mm256_div_pd           VDIVPD (YMM, YMM, YMM)          13c @ p0    13c @ p01
- *      _mm256_fmadd_ps         VFMADD231PS (YMM, YMM, YMM)     4c @ p01    4c @ p01
- *      _mm256_fmadd_pd         VFMADD231PD (YMM, YMM, YMM)     4c @ p01    4c @ p01
+ *      Intrinsic        Instruction                  Icelake    Genoa
+ *      _mm256_sqrt_ps   VSQRTPS (YMM, YMM)           12cy @ p0  15cy @ p01
+ *      _mm256_sqrt_pd   VSQRTPD (YMM, YMM)           13cy @ p0  21cy @ p01
+ *      _mm256_div_ps    VDIVPS (YMM, YMM, YMM)       11cy @ p0  11cy @ p01
+ *      _mm256_div_pd    VDIVPD (YMM, YMM, YMM)       13cy @ p0  13cy @ p01
+ *      _mm256_fmadd_ps  VFMADD231PS (YMM, YMM, YMM)  4cy @ p01  4cy @ p01
+ *      _mm256_fmadd_pd  VFMADD231PD (YMM, YMM, YMM)  4cy @ p01  4cy @ p01
+ *      _mm256_cmp_ps    VCMPPS (YMM, YMM, YMM, I8)   3cy @ p01  3cy @ p01
  */
 #ifndef NK_GEOSPATIAL_HASWELL_H
 #define NK_GEOSPATIAL_HASWELL_H
@@ -40,44 +41,48 @@ extern "C" {
  *  These require AVX2 trigonometric kernels from trigonometry.h.
  */
-NK_INTERNAL __m256d nk_haversine_f64x4_haswell_(       //
-    __m256d first_latitudes, __m256d first_longitudes, //
-    __m256d second_latitudes, __m256d second_longitudes) {
+NK_INTERNAL __m256d nk_haversine_f64x4_haswell_(                   //
+    __m256d first_latitudes_f64x4, __m256d first_longitudes_f64x4, //
+    __m256d second_latitudes_f64x4, __m256d second_longitudes_f64x4) {
-    __m256d const earth_radius = _mm256_set1_pd(NK_EARTH_MEDIATORIAL_RADIUS);
-    __m256d const half = _mm256_set1_pd(0.5);
-    __m256d const one = _mm256_set1_pd(1.0);
-    __m256d const two = _mm256_set1_pd(2.0);
+    __m256d const earth_radius_f64x4 = _mm256_set1_pd(NK_EARTH_MEDIATORIAL_RADIUS);
+    __m256d const half_f64x4 = _mm256_set1_pd(0.5);
+    __m256d const one_f64x4 = _mm256_set1_pd(1.0);
+    __m256d const two_f64x4 = _mm256_set1_pd(2.0);
-    __m256d latitude_delta = _mm256_sub_pd(second_latitudes, first_latitudes);
-    __m256d longitude_delta = _mm256_sub_pd(second_longitudes, first_longitudes);
+    __m256d latitude_delta_f64x4 = _mm256_sub_pd(second_latitudes_f64x4, first_latitudes_f64x4);
+    __m256d longitude_delta_f64x4 = _mm256_sub_pd(second_longitudes_f64x4, first_longitudes_f64x4);
     // Haversine terms: sin²(Δ/2)
-    __m256d latitude_delta_half = _mm256_mul_pd(latitude_delta, half);
-    __m256d longitude_delta_half = _mm256_mul_pd(longitude_delta, half);
-    __m256d sin_latitude_delta_half = nk_sin_f64x4_haswell_(latitude_delta_half);
-    __m256d sin_longitude_delta_half = nk_sin_f64x4_haswell_(longitude_delta_half);
-    __m256d sin_squared_latitude_delta_half = _mm256_mul_pd(sin_latitude_delta_half, sin_latitude_delta_half);
-    __m256d sin_squared_longitude_delta_half = _mm256_mul_pd(sin_longitude_delta_half, sin_longitude_delta_half);
+    __m256d latitude_delta_half_f64x4 = _mm256_mul_pd(latitude_delta_f64x4, half_f64x4);
+    __m256d longitude_delta_half_f64x4 = _mm256_mul_pd(longitude_delta_f64x4, half_f64x4);
+    __m256d sin_latitude_delta_half_f64x4 = nk_sin_f64x4_haswell_(latitude_delta_half_f64x4);
+    __m256d sin_longitude_delta_half_f64x4 = nk_sin_f64x4_haswell_(longitude_delta_half_f64x4);
+    __m256d sin_squared_latitude_delta_half_f64x4 = _mm256_mul_pd(sin_latitude_delta_half_f64x4,
+                                                                  sin_latitude_delta_half_f64x4);
+    __m256d sin_squared_longitude_delta_half_f64x4 = _mm256_mul_pd(sin_longitude_delta_half_f64x4,
+                                                                   sin_longitude_delta_half_f64x4);
     // Latitude cosine product
-    __m256d cos_first_latitude = nk_cos_f64x4_haswell_(first_latitudes);
-    __m256d cos_second_latitude = nk_cos_f64x4_haswell_(second_latitudes);
-    __m256d cos_latitude_product = _mm256_mul_pd(cos_first_latitude, cos_second_latitude);
+    __m256d cos_first_latitude_f64x4 = nk_cos_f64x4_haswell_(first_latitudes_f64x4);
+    __m256d cos_second_latitude_f64x4 = nk_cos_f64x4_haswell_(second_latitudes_f64x4);
+    __m256d cos_latitude_product_f64x4 = _mm256_mul_pd(cos_first_latitude_f64x4, cos_second_latitude_f64x4);
     // a = sin²(Δlat/2) + cos(lat1) × cos(lat2) × sin²(Δlon/2)
-    __m256d haversine_term = _mm256_add_pd(sin_squared_latitude_delta_half,
-                                           _mm256_mul_pd(cos_latitude_product, sin_squared_longitude_delta_half));
-    // Clamp haversine_term to [0, 1] to prevent NaN from sqrt of negative values
-    __m256d zero = _mm256_setzero_pd();
-    haversine_term = _mm256_max_pd(zero, _mm256_min_pd(one, haversine_term));
+    __m256d haversine_term_f64x4 = _mm256_add_pd(
+        sin_squared_latitude_delta_half_f64x4,
+        _mm256_mul_pd(cos_latitude_product_f64x4, sin_squared_longitude_delta_half_f64x4));
+    // Clamp haversine_term_f64x4 to [0, 1] to prevent NaN from sqrt of negative values
+    __m256d zero_f64x4 = _mm256_setzero_pd();
+    haversine_term_f64x4 = _mm256_max_pd(zero_f64x4, _mm256_min_pd(one_f64x4, haversine_term_f64x4));
     // Central angle: c = 2 × atan2(√a, √(1-a))
-    __m256d sqrt_haversine = _mm256_sqrt_pd(haversine_term);
-    __m256d sqrt_complement = _mm256_sqrt_pd(_mm256_sub_pd(one, haversine_term));
-    __m256d central_angle = _mm256_mul_pd(two, nk_atan2_f64x4_haswell_(sqrt_haversine, sqrt_complement));
+    __m256d sqrt_haversine_f64x4 = _mm256_sqrt_pd(haversine_term_f64x4);
+    __m256d sqrt_complement_f64x4 = _mm256_sqrt_pd(_mm256_sub_pd(one_f64x4, haversine_term_f64x4));
+    __m256d central_angle_f64x4 = _mm256_mul_pd(two_f64x4,
+                                                nk_atan2_f64x4_haswell_(sqrt_haversine_f64x4, sqrt_complement_f64x4));
-    return _mm256_mul_pd(earth_radius, central_angle);
+    return _mm256_mul_pd(earth_radius_f64x4, central_angle_f64x4);
 }
 NK_PUBLIC void nk_haversine_f64_haswell(            //
@@ -86,14 +91,14 @@ NK_PUBLIC void nk_haversine_f64_haswell(            //
     nk_size_t n, nk_f64_t *results) {
     while (n >= 4) {
-        __m256d first_latitudes = _mm256_loadu_pd(a_lats);
-        __m256d first_longitudes = _mm256_loadu_pd(a_lons);
-        __m256d second_latitudes = _mm256_loadu_pd(b_lats);
-        __m256d second_longitudes = _mm256_loadu_pd(b_lons);
+        __m256d first_latitudes_f64x4 = _mm256_loadu_pd(a_lats);
+        __m256d first_longitudes_f64x4 = _mm256_loadu_pd(a_lons);
+        __m256d second_latitudes_f64x4 = _mm256_loadu_pd(b_lats);
+        __m256d second_longitudes_f64x4 = _mm256_loadu_pd(b_lons);
-        __m256d distances = nk_haversine_f64x4_haswell_(first_latitudes, first_longitudes, second_latitudes,
-                                                        second_longitudes);
-        _mm256_storeu_pd(results, distances);
+        __m256d distances_f64x4 = nk_haversine_f64x4_haswell_(first_latitudes_f64x4, first_longitudes_f64x4,
+                                                              second_latitudes_f64x4, second_longitudes_f64x4);
+        _mm256_storeu_pd(results, distances_f64x4);
         a_lats += 4, a_lons += 4, b_lats += 4, b_lons += 4, results += 4, n -= 4;
     }
@@ -105,52 +110,56 @@ NK_PUBLIC void nk_haversine_f64_haswell(            //
         nk_partial_load_b64x4_haswell_(a_lons, &a_lon_vec, n);
         nk_partial_load_b64x4_haswell_(b_lats, &b_lat_vec, n);
         nk_partial_load_b64x4_haswell_(b_lons, &b_lon_vec, n);
-        __m256d distances = nk_haversine_f64x4_haswell_(a_lat_vec.ymm_pd, a_lon_vec.ymm_pd, b_lat_vec.ymm_pd,
-                                                        b_lon_vec.ymm_pd);
-        result_vec.ymm_pd = distances;
+        __m256d distances_f64x4 = nk_haversine_f64x4_haswell_(a_lat_vec.ymm_pd, a_lon_vec.ymm_pd, b_lat_vec.ymm_pd,
+                                                              b_lon_vec.ymm_pd);
+        result_vec.ymm_pd = distances_f64x4;
         nk_partial_store_b64x4_haswell_(&result_vec, results, n);
     }
 }
-NK_INTERNAL __m256 nk_haversine_f32x8_haswell_(      //
-    __m256 first_latitudes, __m256 first_longitudes, //
-    __m256 second_latitudes, __m256 second_longitudes) {
+NK_INTERNAL __m256 nk_haversine_f32x8_haswell_(                  //
+    __m256 first_latitudes_f32x8, __m256 first_longitudes_f32x8, //
+    __m256 second_latitudes_f32x8, __m256 second_longitudes_f32x8) {
-    __m256 const earth_radius = _mm256_set1_ps((float)NK_EARTH_MEDIATORIAL_RADIUS);
-    __m256 const half = _mm256_set1_ps(0.5f);
-    __m256 const one = _mm256_set1_ps(1.0f);
-    __m256 const two = _mm256_set1_ps(2.0f);
+    __m256 const earth_radius_f32x8 = _mm256_set1_ps((float)NK_EARTH_MEDIATORIAL_RADIUS);
+    __m256 const half_f32x8 = _mm256_set1_ps(0.5f);
+    __m256 const one_f32x8 = _mm256_set1_ps(1.0f);
+    __m256 const two_f32x8 = _mm256_set1_ps(2.0f);
-    __m256 latitude_delta = _mm256_sub_ps(second_latitudes, first_latitudes);
-    __m256 longitude_delta = _mm256_sub_ps(second_longitudes, first_longitudes);
+    __m256 latitude_delta_f32x8 = _mm256_sub_ps(second_latitudes_f32x8, first_latitudes_f32x8);
+    __m256 longitude_delta_f32x8 = _mm256_sub_ps(second_longitudes_f32x8, first_longitudes_f32x8);
     // Haversine terms: sin²(Δ/2)
-    __m256 latitude_delta_half = _mm256_mul_ps(latitude_delta, half);
-    __m256 longitude_delta_half = _mm256_mul_ps(longitude_delta, half);
-    __m256 sin_latitude_delta_half = nk_sin_f32x8_haswell_(latitude_delta_half);
-    __m256 sin_longitude_delta_half = nk_sin_f32x8_haswell_(longitude_delta_half);
-    __m256 sin_squared_latitude_delta_half = _mm256_mul_ps(sin_latitude_delta_half, sin_latitude_delta_half);
-    __m256 sin_squared_longitude_delta_half = _mm256_mul_ps(sin_longitude_delta_half, sin_longitude_delta_half);
+    __m256 latitude_delta_half_f32x8 = _mm256_mul_ps(latitude_delta_f32x8, half_f32x8);
+    __m256 longitude_delta_half_f32x8 = _mm256_mul_ps(longitude_delta_f32x8, half_f32x8);
+    __m256 sin_latitude_delta_half_f32x8 = nk_sin_f32x8_haswell_(latitude_delta_half_f32x8);
+    __m256 sin_longitude_delta_half_f32x8 = nk_sin_f32x8_haswell_(longitude_delta_half_f32x8);
+    __m256 sin_squared_latitude_delta_half_f32x8 = _mm256_mul_ps(sin_latitude_delta_half_f32x8,
+                                                                 sin_latitude_delta_half_f32x8);
+    __m256 sin_squared_longitude_delta_half_f32x8 = _mm256_mul_ps(sin_longitude_delta_half_f32x8,
+                                                                  sin_longitude_delta_half_f32x8);
     // Latitude cosine product
-    __m256 cos_first_latitude = nk_cos_f32x8_haswell_(first_latitudes);
-    __m256 cos_second_latitude = nk_cos_f32x8_haswell_(second_latitudes);
-    __m256 cos_latitude_product = _mm256_mul_ps(cos_first_latitude, cos_second_latitude);
+    __m256 cos_first_latitude_f32x8 = nk_cos_f32x8_haswell_(first_latitudes_f32x8);
+    __m256 cos_second_latitude_f32x8 = nk_cos_f32x8_haswell_(second_latitudes_f32x8);
+    __m256 cos_latitude_product_f32x8 = _mm256_mul_ps(cos_first_latitude_f32x8, cos_second_latitude_f32x8);
     // a = sin²(Δlat/2) + cos(lat1) × cos(lat2) × sin²(Δlon/2)
-    __m256 haversine_term = _mm256_add_ps(sin_squared_latitude_delta_half,
-                                          _mm256_mul_ps(cos_latitude_product, sin_squared_longitude_delta_half));
+    __m256 haversine_term_f32x8 = _mm256_add_ps(
+        sin_squared_latitude_delta_half_f32x8,
+        _mm256_mul_ps(cos_latitude_product_f32x8, sin_squared_longitude_delta_half_f32x8));
     // Clamp to [0, 1] to avoid NaN from sqrt of negative numbers (due to floating point errors)
-    __m256 zero = _mm256_setzero_ps();
-    haversine_term = _mm256_max_ps(zero, _mm256_min_ps(one, haversine_term));
+    __m256 zero_f32x8 = _mm256_setzero_ps();
+    haversine_term_f32x8 = _mm256_max_ps(zero_f32x8, _mm256_min_ps(one_f32x8, haversine_term_f32x8));
     // Central angle: c = 2 × atan2(√a, √(1-a))
-    __m256 sqrt_haversine = _mm256_sqrt_ps(haversine_term);
-    __m256 sqrt_complement = _mm256_sqrt_ps(_mm256_sub_ps(one, haversine_term));
-    __m256 central_angle = _mm256_mul_ps(two, nk_atan2_f32x8_haswell_(sqrt_haversine, sqrt_complement));
+    __m256 sqrt_haversine_f32x8 = _mm256_sqrt_ps(haversine_term_f32x8);
+    __m256 sqrt_complement_f32x8 = _mm256_sqrt_ps(_mm256_sub_ps(one_f32x8, haversine_term_f32x8));
+    __m256 central_angle_f32x8 = _mm256_mul_ps(two_f32x8,
+                                               nk_atan2_f32x8_haswell_(sqrt_haversine_f32x8, sqrt_complement_f32x8));
-    return _mm256_mul_ps(earth_radius, central_angle);
+    return _mm256_mul_ps(earth_radius_f32x8, central_angle_f32x8);
 }
 NK_PUBLIC void nk_haversine_f32_haswell(            //
@@ -159,14 +168,14 @@ NK_PUBLIC void nk_haversine_f32_haswell(            //
     nk_size_t n, nk_f32_t *results) {
     while (n >= 8) {
-        __m256 first_latitudes = _mm256_loadu_ps(a_lats);
-        __m256 first_longitudes = _mm256_loadu_ps(a_lons);
-        __m256 second_latitudes = _mm256_loadu_ps(b_lats);
-        __m256 second_longitudes = _mm256_loadu_ps(b_lons);
+        __m256 first_latitudes_f32x8 = _mm256_loadu_ps(a_lats);
+        __m256 first_longitudes_f32x8 = _mm256_loadu_ps(a_lons);
+        __m256 second_latitudes_f32x8 = _mm256_loadu_ps(b_lats);
+        __m256 second_longitudes_f32x8 = _mm256_loadu_ps(b_lons);
-        __m256 distances = nk_haversine_f32x8_haswell_(first_latitudes, first_longitudes, second_latitudes,
-                                                       second_longitudes);
-        _mm256_storeu_ps(results, distances);
+        __m256 distances_f32x8 = nk_haversine_f32x8_haswell_(first_latitudes_f32x8, first_longitudes_f32x8,
+                                                             second_latitudes_f32x8, second_longitudes_f32x8);
+        _mm256_storeu_ps(results, distances_f32x8);
         a_lats += 8, a_lons += 8, b_lats += 8, b_lons += 8, results += 8, n -= 8;
     }
@@ -178,9 +187,9 @@ NK_PUBLIC void nk_haversine_f32_haswell(            //
         nk_partial_load_b32x8_serial_(a_lons, &a_lon_vec, n);
         nk_partial_load_b32x8_serial_(b_lats, &b_lat_vec, n);
         nk_partial_load_b32x8_serial_(b_lons, &b_lon_vec, n);
-        __m256 distances = nk_haversine_f32x8_haswell_(a_lat_vec.ymm_ps, a_lon_vec.ymm_ps, b_lat_vec.ymm_ps,
-                                                       b_lon_vec.ymm_ps);
-        result_vec.ymm_ps = distances;
+        __m256 distances_f32x8 = nk_haversine_f32x8_haswell_(a_lat_vec.ymm_ps, a_lon_vec.ymm_ps, b_lat_vec.ymm_ps,
+                                                             b_lon_vec.ymm_ps);
+        result_vec.ymm_ps = distances_f32x8;
         nk_partial_store_b32x8_serial_(&result_vec, results, n);
     }
 }
@@ -189,165 +198,180 @@ NK_PUBLIC void nk_haversine_f32_haswell(            //
  *  @brief  AVX2 helper for Vincenty's geodesic distance on 4 f64 point pairs.
  *  @note   This is a true SIMD implementation using masked convergence tracking via blending.
  */
-NK_INTERNAL __m256d nk_vincenty_f64x4_haswell_(        //
-    __m256d first_latitudes, __m256d first_longitudes, //
-    __m256d second_latitudes, __m256d second_longitudes) {
-    __m256d const equatorial_radius = _mm256_set1_pd(NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
-    __m256d const polar_radius = _mm256_set1_pd(NK_EARTH_ELLIPSOID_POLAR_RADIUS);
-    __m256d const flattening = _mm256_set1_pd(1.0 / NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
-    __m256d const convergence_threshold = _mm256_set1_pd(NK_VINCENTY_CONVERGENCE_THRESHOLD_F64);
-    __m256d const one = _mm256_set1_pd(1.0);
-    __m256d const two = _mm256_set1_pd(2.0);
-    __m256d const three = _mm256_set1_pd(3.0);
-    __m256d const four = _mm256_set1_pd(4.0);
-    __m256d const six = _mm256_set1_pd(6.0);
-    __m256d const sixteen = _mm256_set1_pd(16.0);
-    __m256d const epsilon = _mm256_set1_pd(1e-15);
+NK_INTERNAL __m256d nk_vincenty_f64x4_haswell_(                    //
+    __m256d first_latitudes_f64x4, __m256d first_longitudes_f64x4, //
+    __m256d second_latitudes_f64x4, __m256d second_longitudes_f64x4) {
+    __m256d const equatorial_radius_f64x4 = _mm256_set1_pd(NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
+    __m256d const polar_radius_f64x4 = _mm256_set1_pd(NK_EARTH_ELLIPSOID_POLAR_RADIUS);
+    __m256d const flattening_f64x4 = _mm256_set1_pd(1.0 / NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
+    __m256d const convergence_threshold_f64x4 = _mm256_set1_pd(NK_VINCENTY_CONVERGENCE_THRESHOLD_F64);
+    __m256d const one_f64x4 = _mm256_set1_pd(1.0);
+    __m256d const two_f64x4 = _mm256_set1_pd(2.0);
+    __m256d const three_f64x4 = _mm256_set1_pd(3.0);
+    __m256d const four_f64x4 = _mm256_set1_pd(4.0);
+    __m256d const six_f64x4 = _mm256_set1_pd(6.0);
+    __m256d const sixteen_f64x4 = _mm256_set1_pd(16.0);
+    __m256d const epsilon_f64x4 = _mm256_set1_pd(1e-15);
     // Longitude difference
-    __m256d longitude_difference = _mm256_sub_pd(second_longitudes, first_longitudes);
+    __m256d longitude_difference_f64x4 = _mm256_sub_pd(second_longitudes_f64x4, first_longitudes_f64x4);
     // Reduced latitudes: tan(U) = (1-f) * tan(lat)
-    __m256d one_minus_f = _mm256_sub_pd(one, flattening);
-    __m256d tan_first = _mm256_div_pd(nk_sin_f64x4_haswell_(first_latitudes), nk_cos_f64x4_haswell_(first_latitudes));
-    __m256d tan_second = _mm256_div_pd(nk_sin_f64x4_haswell_(second_latitudes),
-                                       nk_cos_f64x4_haswell_(second_latitudes));
-    __m256d tan_reduced_first = _mm256_mul_pd(one_minus_f, tan_first);
-    __m256d tan_reduced_second = _mm256_mul_pd(one_minus_f, tan_second);
+    __m256d one_minus_f_f64x4 = _mm256_sub_pd(one_f64x4, flattening_f64x4);
+    __m256d tan_first_f64x4 = _mm256_div_pd(nk_sin_f64x4_haswell_(first_latitudes_f64x4),
+                                            nk_cos_f64x4_haswell_(first_latitudes_f64x4));
+    __m256d tan_second_f64x4 = _mm256_div_pd(nk_sin_f64x4_haswell_(second_latitudes_f64x4),
+                                             nk_cos_f64x4_haswell_(second_latitudes_f64x4));
+    __m256d tan_reduced_first_f64x4 = _mm256_mul_pd(one_minus_f_f64x4, tan_first_f64x4);
+    __m256d tan_reduced_second_f64x4 = _mm256_mul_pd(one_minus_f_f64x4, tan_second_f64x4);
     // cos(U) = 1/√(1 + tan²(U)), sin(U) = tan(U) × cos(U)
-    __m256d cos_reduced_first = _mm256_div_pd(
-        one, _mm256_sqrt_pd(_mm256_fmadd_pd(tan_reduced_first, tan_reduced_first, one)));
-    __m256d sin_reduced_first = _mm256_mul_pd(tan_reduced_first, cos_reduced_first);
-    __m256d cos_reduced_second = _mm256_div_pd(
-        one, _mm256_sqrt_pd(_mm256_fmadd_pd(tan_reduced_second, tan_reduced_second, one)));
-    __m256d sin_reduced_second = _mm256_mul_pd(tan_reduced_second, cos_reduced_second);
-    // Initialize lambda and tracking variables
-    __m256d lambda = longitude_difference;
-    __m256d sin_angular_distance, cos_angular_distance, angular_distance;
-    __m256d sin_azimuth, cos_squared_azimuth, cos_double_angular_midpoint;
+    __m256d cos_reduced_first_f64x4 = _mm256_div_pd(
+        one_f64x4, _mm256_sqrt_pd(_mm256_fmadd_pd(tan_reduced_first_f64x4, tan_reduced_first_f64x4, one_f64x4)));
+    __m256d sin_reduced_first_f64x4 = _mm256_mul_pd(tan_reduced_first_f64x4, cos_reduced_first_f64x4);
+    __m256d cos_reduced_second_f64x4 = _mm256_div_pd(
+        one_f64x4, _mm256_sqrt_pd(_mm256_fmadd_pd(tan_reduced_second_f64x4, tan_reduced_second_f64x4, one_f64x4)));
+    __m256d sin_reduced_second_f64x4 = _mm256_mul_pd(tan_reduced_second_f64x4, cos_reduced_second_f64x4);
+    // Initialize lambda_f64x4 and tracking variables
+    __m256d lambda_f64x4 = longitude_difference_f64x4;
+    __m256d sin_angular_distance_f64x4, cos_angular_distance_f64x4, angular_distance_f64x4;
+    __m256d sin_azimuth_f64x4, cos_squared_azimuth_f64x4, cos_double_angular_midpoint_f64x4;
     // Track convergence and coincident points using masks
-    __m256d converged_mask = _mm256_setzero_pd();
-    __m256d coincident_mask = _mm256_setzero_pd();
+    __m256d converged_mask_f64x4 = _mm256_setzero_pd();
+    __m256d coincident_mask_f64x4 = _mm256_setzero_pd();
     for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS; ++iteration) {
         // Check if all lanes converged
-        int converged_bits = _mm256_movemask_pd(converged_mask);
+        int converged_bits = _mm256_movemask_pd(converged_mask_f64x4);
         if (converged_bits == 0xF) break;
-        __m256d sin_lambda = nk_sin_f64x4_haswell_(lambda);
-        __m256d cos_lambda = nk_cos_f64x4_haswell_(lambda);
+        __m256d sin_lambda_f64x4 = nk_sin_f64x4_haswell_(lambda_f64x4);
+        __m256d cos_lambda_f64x4 = nk_cos_f64x4_haswell_(lambda_f64x4);
-        // sin²(angular_distance) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
-        __m256d cross_term = _mm256_mul_pd(cos_reduced_second, sin_lambda);
-        __m256d mixed_term = _mm256_sub_pd(
-            _mm256_mul_pd(cos_reduced_first, sin_reduced_second),
-            _mm256_mul_pd(_mm256_mul_pd(sin_reduced_first, cos_reduced_second), cos_lambda));
-        __m256d sin_angular_dist_sq = _mm256_fmadd_pd(cross_term, cross_term, _mm256_mul_pd(mixed_term, mixed_term));
-        sin_angular_distance = _mm256_sqrt_pd(sin_angular_dist_sq);
+        // sin²(angular_distance_f64x4) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
+        __m256d cross_term_f64x4 = _mm256_mul_pd(cos_reduced_second_f64x4, sin_lambda_f64x4);
+        __m256d mixed_term_f64x4 = _mm256_sub_pd(
+            _mm256_mul_pd(cos_reduced_first_f64x4, sin_reduced_second_f64x4),
+            _mm256_mul_pd(_mm256_mul_pd(sin_reduced_first_f64x4, cos_reduced_second_f64x4), cos_lambda_f64x4));
+        __m256d sin_angular_dist_sq_f64x4 = _mm256_fmadd_pd(cross_term_f64x4, cross_term_f64x4,
+                                                            _mm256_mul_pd(mixed_term_f64x4, mixed_term_f64x4));
+        sin_angular_distance_f64x4 = _mm256_sqrt_pd(sin_angular_dist_sq_f64x4);
-        // Check for coincident points (sin_angular_distance ≈ 0)
-        coincident_mask = _mm256_cmp_pd(sin_angular_distance, epsilon, _CMP_LT_OS);
+        // Check for coincident points (sin_angular_distance_f64x4 ≈ 0)
+        coincident_mask_f64x4 = _mm256_cmp_pd(sin_angular_distance_f64x4, epsilon_f64x4, _CMP_LT_OS);
-        // cos(angular_distance) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
-        cos_angular_distance = _mm256_fmadd_pd(_mm256_mul_pd(cos_reduced_first, cos_reduced_second), cos_lambda,
-                                               _mm256_mul_pd(sin_reduced_first, sin_reduced_second));
+        // cos(angular_distance_f64x4) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
+        cos_angular_distance_f64x4 = _mm256_fmadd_pd(_mm256_mul_pd(cos_reduced_first_f64x4, cos_reduced_second_f64x4),
+                                                     cos_lambda_f64x4,
+                                                     _mm256_mul_pd(sin_reduced_first_f64x4, sin_reduced_second_f64x4));
-        // angular_distance = atan2(sin, cos)
-        angular_distance = nk_atan2_f64x4_haswell_(sin_angular_distance, cos_angular_distance);
+        // angular_distance_f64x4 = atan2(sin, cos)
+        angular_distance_f64x4 = nk_atan2_f64x4_haswell_(sin_angular_distance_f64x4, cos_angular_distance_f64x4);
-        // sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance)
+        // sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance_f64x4)
         // Avoid division by zero by using blending
-        __m256d safe_sin_angular = _mm256_blendv_pd(sin_angular_distance, one, coincident_mask);
-        sin_azimuth = _mm256_div_pd(_mm256_mul_pd(_mm256_mul_pd(cos_reduced_first, cos_reduced_second), sin_lambda),
-                                    safe_sin_angular);
-        cos_squared_azimuth = _mm256_sub_pd(one, _mm256_mul_pd(sin_azimuth, sin_azimuth));
+        __m256d safe_sin_angular_f64x4 = _mm256_blendv_pd(sin_angular_distance_f64x4, one_f64x4, coincident_mask_f64x4);
+        sin_azimuth_f64x4 = _mm256_div_pd(
+            _mm256_mul_pd(_mm256_mul_pd(cos_reduced_first_f64x4, cos_reduced_second_f64x4), sin_lambda_f64x4),
+            safe_sin_angular_f64x4);
+        cos_squared_azimuth_f64x4 = _mm256_sub_pd(one_f64x4, _mm256_mul_pd(sin_azimuth_f64x4, sin_azimuth_f64x4));
         // Handle equatorial case: cos²α ≈ 0
-        __m256d equatorial_mask = _mm256_cmp_pd(cos_squared_azimuth, epsilon, _CMP_LT_OS);
-        __m256d safe_cos_sq_azimuth = _mm256_blendv_pd(cos_squared_azimuth, one, equatorial_mask);
+        __m256d equatorial_mask_f64x4 = _mm256_cmp_pd(cos_squared_azimuth_f64x4, epsilon_f64x4, _CMP_LT_OS);
+        __m256d safe_cos_sq_azimuth_f64x4 = _mm256_blendv_pd(cos_squared_azimuth_f64x4, one_f64x4,
+                                                             equatorial_mask_f64x4);
         // cos(2σₘ) = cos(σ) - 2 × sin(U₁) × sin(U₂) / cos²(α)
-        __m256d sin_product = _mm256_mul_pd(sin_reduced_first, sin_reduced_second);
-        cos_double_angular_midpoint = _mm256_sub_pd(
-            cos_angular_distance, _mm256_div_pd(_mm256_mul_pd(two, sin_product), safe_cos_sq_azimuth));
-        cos_double_angular_midpoint = _mm256_blendv_pd(cos_double_angular_midpoint, _mm256_setzero_pd(),
-                                                       equatorial_mask);
+        __m256d sin_product_f64x4 = _mm256_mul_pd(sin_reduced_first_f64x4, sin_reduced_second_f64x4);
+        cos_double_angular_midpoint_f64x4 = _mm256_sub_pd(
+            cos_angular_distance_f64x4,
+            _mm256_div_pd(_mm256_mul_pd(two_f64x4, sin_product_f64x4), safe_cos_sq_azimuth_f64x4));
+        cos_double_angular_midpoint_f64x4 = _mm256_blendv_pd(cos_double_angular_midpoint_f64x4, _mm256_setzero_pd(),
+                                                             equatorial_mask_f64x4);
         // C = f/16 * cos²α * (4 + f*(4 - 3*cos²α))
-        __m256d correction_factor = _mm256_mul_pd(
-            _mm256_div_pd(flattening, sixteen),
-            _mm256_mul_pd(cos_squared_azimuth,
-                          _mm256_fmadd_pd(flattening, _mm256_fnmadd_pd(three, cos_squared_azimuth, four), four)));
+        __m256d correction_factor_f64x4 = _mm256_mul_pd(
+            _mm256_div_pd(flattening_f64x4, sixteen_f64x4),
+            _mm256_mul_pd(
+                cos_squared_azimuth_f64x4,
+                _mm256_fmadd_pd(flattening_f64x4, _mm256_fnmadd_pd(three_f64x4, cos_squared_azimuth_f64x4, four_f64x4),
+                                four_f64x4)));
         // λ' = L + (1-C) × f × sin(α) × (σ + C × sin(σ) × (cos(2σₘ) + C × cos(σ) × (-1 + 2 × cos²(2σₘ))))
-        __m256d cos_2sm_sq = _mm256_mul_pd(cos_double_angular_midpoint, cos_double_angular_midpoint);
-        // innermost = -1 + 2 × cos²(2σₘ)
-        __m256d innermost = _mm256_fmadd_pd(two, cos_2sm_sq, _mm256_set1_pd(-1.0));
-        // middle = cos(2σₘ) + C × cos(σ) × innermost
-        __m256d middle = _mm256_fmadd_pd(_mm256_mul_pd(correction_factor, cos_angular_distance), innermost,
-                                         cos_double_angular_midpoint);
-        // inner = C × sin(σ) × middle
-        __m256d inner = _mm256_mul_pd(_mm256_mul_pd(correction_factor, sin_angular_distance), middle);
-        // λ' = L + (1-C) * f * sin_α * (σ + inner)
-        __m256d lambda_new = _mm256_fmadd_pd(
-            _mm256_mul_pd(_mm256_mul_pd(_mm256_sub_pd(one, correction_factor), flattening), sin_azimuth),
-            _mm256_add_pd(angular_distance, inner), longitude_difference);
+        __m256d cos_2sm_sq_f64x4 = _mm256_mul_pd(cos_double_angular_midpoint_f64x4, cos_double_angular_midpoint_f64x4);
+        // innermost_f64x4 = -1 + 2 × cos²(2σₘ)
+        __m256d innermost_f64x4 = _mm256_fmadd_pd(two_f64x4, cos_2sm_sq_f64x4, _mm256_set1_pd(-1.0));
+        // middle_f64x4 = cos(2σₘ) + C × cos(σ) × innermost_f64x4
+        __m256d middle_f64x4 = _mm256_fmadd_pd(_mm256_mul_pd(correction_factor_f64x4, cos_angular_distance_f64x4),
+                                               innermost_f64x4, cos_double_angular_midpoint_f64x4);
+        // inner_f64x4 = C × sin(σ) × middle_f64x4
+        __m256d inner_f64x4 = _mm256_mul_pd(_mm256_mul_pd(correction_factor_f64x4, sin_angular_distance_f64x4),
+                                            middle_f64x4);
+        // λ' = L + (1-C) * f * sin_α * (σ + inner_f64x4)
+        __m256d lambda_new_f64x4 = _mm256_fmadd_pd(
+            _mm256_mul_pd(_mm256_mul_pd(_mm256_sub_pd(one_f64x4, correction_factor_f64x4), flattening_f64x4),
+                          sin_azimuth_f64x4),
+            _mm256_add_pd(angular_distance_f64x4, inner_f64x4), longitude_difference_f64x4);
         // Check convergence: |λ - λ'| < threshold
-        __m256d lambda_diff_abs = _mm256_andnot_pd(_mm256_set1_pd(-0.0), _mm256_sub_pd(lambda_new, lambda));
-        __m256d newly_converged = _mm256_cmp_pd(lambda_diff_abs, convergence_threshold, _CMP_LT_OS);
-        converged_mask = _mm256_or_pd(converged_mask, newly_converged);
+        __m256d lambda_diff_abs_f64x4 = _mm256_andnot_pd(_mm256_set1_pd(-0.0),
+                                                         _mm256_sub_pd(lambda_new_f64x4, lambda_f64x4));
+        __m256d newly_converged_f64x4 = _mm256_cmp_pd(lambda_diff_abs_f64x4, convergence_threshold_f64x4, _CMP_LT_OS);
+        converged_mask_f64x4 = _mm256_or_pd(converged_mask_f64x4, newly_converged_f64x4);
-        // Only update lambda for non-converged lanes
-        lambda = _mm256_blendv_pd(lambda_new, lambda, converged_mask);
+        // Only update lambda_f64x4 for non-converged lanes
+        lambda_f64x4 = _mm256_blendv_pd(lambda_new_f64x4, lambda_f64x4, converged_mask_f64x4);
     }
     // Final distance calculation
     // u² = cos²α * (a² - b²) / b²
-    __m256d a_sq = _mm256_mul_pd(equatorial_radius, equatorial_radius);
-    __m256d b_sq = _mm256_mul_pd(polar_radius, polar_radius);
-    __m256d u_squared = _mm256_div_pd(_mm256_mul_pd(cos_squared_azimuth, _mm256_sub_pd(a_sq, b_sq)), b_sq);
+    __m256d a_sq_f64x4 = _mm256_mul_pd(equatorial_radius_f64x4, equatorial_radius_f64x4);
+    __m256d b_sq_f64x4 = _mm256_mul_pd(polar_radius_f64x4, polar_radius_f64x4);
+    __m256d u_squared_f64x4 = _mm256_div_pd(
+        _mm256_mul_pd(cos_squared_azimuth_f64x4, _mm256_sub_pd(a_sq_f64x4, b_sq_f64x4)), b_sq_f64x4);
     // A = 1 + u²/16384 * (4096 + u²*(-768 + u²*(320 - 175*u²)))
-    __m256d series_a = _mm256_fmadd_pd(u_squared, _mm256_set1_pd(-175.0), _mm256_set1_pd(320.0));
-    series_a = _mm256_fmadd_pd(u_squared, series_a, _mm256_set1_pd(-768.0));
-    series_a = _mm256_fmadd_pd(u_squared, series_a, _mm256_set1_pd(4096.0));
-    series_a = _mm256_fmadd_pd(_mm256_div_pd(u_squared, _mm256_set1_pd(16384.0)), series_a, one);
+    __m256d series_a_f64x4 = _mm256_fmadd_pd(u_squared_f64x4, _mm256_set1_pd(-175.0), _mm256_set1_pd(320.0));
+    series_a_f64x4 = _mm256_fmadd_pd(u_squared_f64x4, series_a_f64x4, _mm256_set1_pd(-768.0));
+    series_a_f64x4 = _mm256_fmadd_pd(u_squared_f64x4, series_a_f64x4, _mm256_set1_pd(4096.0));
+    series_a_f64x4 = _mm256_fmadd_pd(_mm256_div_pd(u_squared_f64x4, _mm256_set1_pd(16384.0)), series_a_f64x4,
+                                     one_f64x4);
     // B = u²/1024 * (256 + u²*(-128 + u²*(74 - 47*u²)))
-    __m256d series_b = _mm256_fmadd_pd(u_squared, _mm256_set1_pd(-47.0), _mm256_set1_pd(74.0));
-    series_b = _mm256_fmadd_pd(u_squared, series_b, _mm256_set1_pd(-128.0));
-    series_b = _mm256_fmadd_pd(u_squared, series_b, _mm256_set1_pd(256.0));
-    series_b = _mm256_mul_pd(_mm256_div_pd(u_squared, _mm256_set1_pd(1024.0)), series_b);
+    __m256d series_b_f64x4 = _mm256_fmadd_pd(u_squared_f64x4, _mm256_set1_pd(-47.0), _mm256_set1_pd(74.0));
+    series_b_f64x4 = _mm256_fmadd_pd(u_squared_f64x4, series_b_f64x4, _mm256_set1_pd(-128.0));
+    series_b_f64x4 = _mm256_fmadd_pd(u_squared_f64x4, series_b_f64x4, _mm256_set1_pd(256.0));
+    series_b_f64x4 = _mm256_mul_pd(_mm256_div_pd(u_squared_f64x4, _mm256_set1_pd(1024.0)), series_b_f64x4);
     // Δσ = B × sin(σ) × (cos(2σₘ) +
     //      B/4 × (cos(σ) × (-1 + 2 × cos²(2σₘ)) - B/6 × cos(2σₘ) × (-3 + 4 × sin²(σ)) × (-3 + 4 × cos²(2σₘ))))
-    __m256d cos_2sm_sq = _mm256_mul_pd(cos_double_angular_midpoint, cos_double_angular_midpoint);
-    __m256d sin_sq = _mm256_mul_pd(sin_angular_distance, sin_angular_distance);
-    __m256d term1 = _mm256_fmadd_pd(two, cos_2sm_sq, _mm256_set1_pd(-1.0));
-    term1 = _mm256_mul_pd(cos_angular_distance, term1);
-    __m256d term2 = _mm256_fmadd_pd(four, sin_sq, _mm256_set1_pd(-3.0));
-    __m256d term3 = _mm256_fmadd_pd(four, cos_2sm_sq, _mm256_set1_pd(-3.0));
-    term2 = _mm256_mul_pd(_mm256_mul_pd(_mm256_div_pd(series_b, six), cos_double_angular_midpoint),
-                          _mm256_mul_pd(term2, term3));
-    __m256d delta_sigma = _mm256_mul_pd(
-        series_b, _mm256_mul_pd(sin_angular_distance, _mm256_add_pd(cos_double_angular_midpoint,
-                                                                    _mm256_mul_pd(_mm256_div_pd(series_b, four),
-                                                                                  _mm256_sub_pd(term1, term2)))));
+    __m256d cos_2sm_sq_f64x4 = _mm256_mul_pd(cos_double_angular_midpoint_f64x4, cos_double_angular_midpoint_f64x4);
+    __m256d sin_sq_f64x4 = _mm256_mul_pd(sin_angular_distance_f64x4, sin_angular_distance_f64x4);
+    __m256d term1_f64x4 = _mm256_fmadd_pd(two_f64x4, cos_2sm_sq_f64x4, _mm256_set1_pd(-1.0));
+    term1_f64x4 = _mm256_mul_pd(cos_angular_distance_f64x4, term1_f64x4);
+    __m256d term2_f64x4 = _mm256_fmadd_pd(four_f64x4, sin_sq_f64x4, _mm256_set1_pd(-3.0));
+    __m256d term3_f64x4 = _mm256_fmadd_pd(four_f64x4, cos_2sm_sq_f64x4, _mm256_set1_pd(-3.0));
+    term2_f64x4 = _mm256_mul_pd(
+        _mm256_mul_pd(_mm256_div_pd(series_b_f64x4, six_f64x4), cos_double_angular_midpoint_f64x4),
+        _mm256_mul_pd(term2_f64x4, term3_f64x4));
+    __m256d delta_sigma_f64x4 = _mm256_mul_pd(
+        series_b_f64x4, _mm256_mul_pd(sin_angular_distance_f64x4,
+                                      _mm256_add_pd(cos_double_angular_midpoint_f64x4,
+                                                    _mm256_mul_pd(_mm256_div_pd(series_b_f64x4, four_f64x4),
+                                                                  _mm256_sub_pd(term1_f64x4, term2_f64x4)))));
     // s = b * A * (σ - Δσ)
-    __m256d distances = _mm256_mul_pd(_mm256_mul_pd(polar_radius, series_a),
-                                      _mm256_sub_pd(angular_distance, delta_sigma));
+    __m256d distances_f64x4 = _mm256_mul_pd(_mm256_mul_pd(polar_radius_f64x4, series_a_f64x4),
+                                            _mm256_sub_pd(angular_distance_f64x4, delta_sigma_f64x4));
     // Set coincident points to zero
-    distances = _mm256_blendv_pd(distances, _mm256_setzero_pd(), coincident_mask);
+    distances_f64x4 = _mm256_blendv_pd(distances_f64x4, _mm256_setzero_pd(), coincident_mask_f64x4);
-    return distances;
+    return distances_f64x4;
 }
 NK_PUBLIC void nk_vincenty_f64_haswell(             //
@@ -356,14 +380,14 @@ NK_PUBLIC void nk_vincenty_f64_haswell(             //
     nk_size_t n, nk_f64_t *results) {
     while (n >= 4) {
-        __m256d first_latitudes = _mm256_loadu_pd(a_lats);
-        __m256d first_longitudes = _mm256_loadu_pd(a_lons);
-        __m256d second_latitudes = _mm256_loadu_pd(b_lats);
-        __m256d second_longitudes = _mm256_loadu_pd(b_lons);
+        __m256d first_latitudes_f64x4 = _mm256_loadu_pd(a_lats);
+        __m256d first_longitudes_f64x4 = _mm256_loadu_pd(a_lons);
+        __m256d second_latitudes_f64x4 = _mm256_loadu_pd(b_lats);
+        __m256d second_longitudes_f64x4 = _mm256_loadu_pd(b_lons);
-        __m256d distances = nk_vincenty_f64x4_haswell_(first_latitudes, first_longitudes, second_latitudes,
-                                                       second_longitudes);
-        _mm256_storeu_pd(results, distances);
+        __m256d distances_f64x4 = nk_vincenty_f64x4_haswell_(first_latitudes_f64x4, first_longitudes_f64x4,
+                                                             second_latitudes_f64x4, second_longitudes_f64x4);
+        _mm256_storeu_pd(results, distances_f64x4);
         a_lats += 4, a_lons += 4, b_lats += 4, b_lons += 4, results += 4, n -= 4;
     }
@@ -375,9 +399,9 @@ NK_PUBLIC void nk_vincenty_f64_haswell(             //
         nk_partial_load_b64x4_haswell_(a_lons, &a_lon_vec, n);
         nk_partial_load_b64x4_haswell_(b_lats, &b_lat_vec, n);
         nk_partial_load_b64x4_haswell_(b_lons, &b_lon_vec, n);
-        __m256d distances = nk_vincenty_f64x4_haswell_(a_lat_vec.ymm_pd, a_lon_vec.ymm_pd, b_lat_vec.ymm_pd,
-                                                       b_lon_vec.ymm_pd);
-        result_vec.ymm_pd = distances;
+        __m256d distances_f64x4 = nk_vincenty_f64x4_haswell_(a_lat_vec.ymm_pd, a_lon_vec.ymm_pd, b_lat_vec.ymm_pd,
+                                                             b_lon_vec.ymm_pd);
+        result_vec.ymm_pd = distances_f64x4;
         nk_partial_store_b64x4_haswell_(&result_vec, results, n);
     }
 }
@@ -386,164 +410,180 @@ NK_PUBLIC void nk_vincenty_f64_haswell(             //
  *  @brief  AVX2 helper for Vincenty's geodesic distance on 8 f32 point pairs.
  *  @note   This is a true SIMD implementation using masked convergence tracking via blending.
  */
-NK_INTERNAL __m256 nk_vincenty_f32x8_haswell_(       //
-    __m256 first_latitudes, __m256 first_longitudes, //
-    __m256 second_latitudes, __m256 second_longitudes) {
-    __m256 const equatorial_radius = _mm256_set1_ps((float)NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
-    __m256 const polar_radius = _mm256_set1_ps((float)NK_EARTH_ELLIPSOID_POLAR_RADIUS);
-    __m256 const flattening = _mm256_set1_ps(1.0f / (float)NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
-    __m256 const convergence_threshold = _mm256_set1_ps(NK_VINCENTY_CONVERGENCE_THRESHOLD_F32);
-    __m256 const one = _mm256_set1_ps(1.0f);
-    __m256 const two = _mm256_set1_ps(2.0f);
-    __m256 const three = _mm256_set1_ps(3.0f);
-    __m256 const four = _mm256_set1_ps(4.0f);
-    __m256 const six = _mm256_set1_ps(6.0f);
-    __m256 const sixteen = _mm256_set1_ps(16.0f);
-    __m256 const epsilon = _mm256_set1_ps(1e-7f);
+NK_INTERNAL __m256 nk_vincenty_f32x8_haswell_(                   //
+    __m256 first_latitudes_f32x8, __m256 first_longitudes_f32x8, //
+    __m256 second_latitudes_f32x8, __m256 second_longitudes_f32x8) {
+    __m256 const equatorial_radius_f32x8 = _mm256_set1_ps((float)NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
+    __m256 const polar_radius_f32x8 = _mm256_set1_ps((float)NK_EARTH_ELLIPSOID_POLAR_RADIUS);
+    __m256 const flattening_f32x8 = _mm256_set1_ps(1.0f / (float)NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
+    __m256 const convergence_threshold_f32x8 = _mm256_set1_ps(NK_VINCENTY_CONVERGENCE_THRESHOLD_F32);
+    __m256 const one_f32x8 = _mm256_set1_ps(1.0f);
+    __m256 const two_f32x8 = _mm256_set1_ps(2.0f);
+    __m256 const three_f32x8 = _mm256_set1_ps(3.0f);
+    __m256 const four_f32x8 = _mm256_set1_ps(4.0f);
+    __m256 const six_f32x8 = _mm256_set1_ps(6.0f);
+    __m256 const sixteen_f32x8 = _mm256_set1_ps(16.0f);
+    __m256 const epsilon_f32x8 = _mm256_set1_ps(1e-7f);
     // Longitude difference
-    __m256 longitude_difference = _mm256_sub_ps(second_longitudes, first_longitudes);
+    __m256 longitude_difference_f32x8 = _mm256_sub_ps(second_longitudes_f32x8, first_longitudes_f32x8);
     // Reduced latitudes: tan(U) = (1-f) * tan(lat)
-    __m256 one_minus_f = _mm256_sub_ps(one, flattening);
-    __m256 tan_first = _mm256_div_ps(nk_sin_f32x8_haswell_(first_latitudes), nk_cos_f32x8_haswell_(first_latitudes));
-    __m256 tan_second = _mm256_div_ps(nk_sin_f32x8_haswell_(second_latitudes), nk_cos_f32x8_haswell_(second_latitudes));
-    __m256 tan_reduced_first = _mm256_mul_ps(one_minus_f, tan_first);
-    __m256 tan_reduced_second = _mm256_mul_ps(one_minus_f, tan_second);
+    __m256 one_minus_f_f32x8 = _mm256_sub_ps(one_f32x8, flattening_f32x8);
+    __m256 tan_first_f32x8 = _mm256_div_ps(nk_sin_f32x8_haswell_(first_latitudes_f32x8),
+                                           nk_cos_f32x8_haswell_(first_latitudes_f32x8));
+    __m256 tan_second_f32x8 = _mm256_div_ps(nk_sin_f32x8_haswell_(second_latitudes_f32x8),
+                                            nk_cos_f32x8_haswell_(second_latitudes_f32x8));
+    __m256 tan_reduced_first_f32x8 = _mm256_mul_ps(one_minus_f_f32x8, tan_first_f32x8);
+    __m256 tan_reduced_second_f32x8 = _mm256_mul_ps(one_minus_f_f32x8, tan_second_f32x8);
     // cos(U) = 1/√(1 + tan²(U)), sin(U) = tan(U) × cos(U)
-    __m256 cos_reduced_first = _mm256_div_ps(
-        one, _mm256_sqrt_ps(_mm256_fmadd_ps(tan_reduced_first, tan_reduced_first, one)));
-    __m256 sin_reduced_first = _mm256_mul_ps(tan_reduced_first, cos_reduced_first);
-    __m256 cos_reduced_second = _mm256_div_ps(
-        one, _mm256_sqrt_ps(_mm256_fmadd_ps(tan_reduced_second, tan_reduced_second, one)));
-    __m256 sin_reduced_second = _mm256_mul_ps(tan_reduced_second, cos_reduced_second);
-    // Initialize lambda and tracking variables
-    __m256 lambda = longitude_difference;
-    __m256 sin_angular_distance, cos_angular_distance, angular_distance;
-    __m256 sin_azimuth, cos_squared_azimuth, cos_double_angular_midpoint;
+    __m256 cos_reduced_first_f32x8 = _mm256_div_ps(
+        one_f32x8, _mm256_sqrt_ps(_mm256_fmadd_ps(tan_reduced_first_f32x8, tan_reduced_first_f32x8, one_f32x8)));
+    __m256 sin_reduced_first_f32x8 = _mm256_mul_ps(tan_reduced_first_f32x8, cos_reduced_first_f32x8);
+    __m256 cos_reduced_second_f32x8 = _mm256_div_ps(
+        one_f32x8, _mm256_sqrt_ps(_mm256_fmadd_ps(tan_reduced_second_f32x8, tan_reduced_second_f32x8, one_f32x8)));
+    __m256 sin_reduced_second_f32x8 = _mm256_mul_ps(tan_reduced_second_f32x8, cos_reduced_second_f32x8);
+    // Initialize lambda_f32x8 and tracking variables
+    __m256 lambda_f32x8 = longitude_difference_f32x8;
+    __m256 sin_angular_distance_f32x8, cos_angular_distance_f32x8, angular_distance_f32x8;
+    __m256 sin_azimuth_f32x8, cos_squared_azimuth_f32x8, cos_double_angular_midpoint_f32x8;
     // Track convergence and coincident points using masks
-    __m256 converged_mask = _mm256_setzero_ps();
-    __m256 coincident_mask = _mm256_setzero_ps();
+    __m256 converged_mask_f32x8 = _mm256_setzero_ps();
+    __m256 coincident_mask_f32x8 = _mm256_setzero_ps();
     for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS; ++iteration) {
         // Check if all lanes converged
-        int converged_bits = _mm256_movemask_ps(converged_mask);
+        int converged_bits = _mm256_movemask_ps(converged_mask_f32x8);
         if (converged_bits == 0xFF) break;
-        __m256 sin_lambda = nk_sin_f32x8_haswell_(lambda);
-        __m256 cos_lambda = nk_cos_f32x8_haswell_(lambda);
+        __m256 sin_lambda_f32x8 = nk_sin_f32x8_haswell_(lambda_f32x8);
+        __m256 cos_lambda_f32x8 = nk_cos_f32x8_haswell_(lambda_f32x8);
-        // sin²(angular_distance) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
-        __m256 cross_term = _mm256_mul_ps(cos_reduced_second, sin_lambda);
-        __m256 mixed_term = _mm256_sub_ps(
-            _mm256_mul_ps(cos_reduced_first, sin_reduced_second),
-            _mm256_mul_ps(_mm256_mul_ps(sin_reduced_first, cos_reduced_second), cos_lambda));
-        __m256 sin_angular_dist_sq = _mm256_fmadd_ps(cross_term, cross_term, _mm256_mul_ps(mixed_term, mixed_term));
-        sin_angular_distance = _mm256_sqrt_ps(sin_angular_dist_sq);
+        // sin²(angular_distance_f32x8) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
+        __m256 cross_term_f32x8 = _mm256_mul_ps(cos_reduced_second_f32x8, sin_lambda_f32x8);
+        __m256 mixed_term_f32x8 = _mm256_sub_ps(
+            _mm256_mul_ps(cos_reduced_first_f32x8, sin_reduced_second_f32x8),
+            _mm256_mul_ps(_mm256_mul_ps(sin_reduced_first_f32x8, cos_reduced_second_f32x8), cos_lambda_f32x8));
+        __m256 sin_angular_dist_sq_f32x8 = _mm256_fmadd_ps(cross_term_f32x8, cross_term_f32x8,
+                                                           _mm256_mul_ps(mixed_term_f32x8, mixed_term_f32x8));
+        sin_angular_distance_f32x8 = _mm256_sqrt_ps(sin_angular_dist_sq_f32x8);
-        // Check for coincident points (sin_angular_distance ≈ 0)
-        coincident_mask = _mm256_cmp_ps(sin_angular_distance, epsilon, _CMP_LT_OS);
+        // Check for coincident points (sin_angular_distance_f32x8 ≈ 0)
+        coincident_mask_f32x8 = _mm256_cmp_ps(sin_angular_distance_f32x8, epsilon_f32x8, _CMP_LT_OS);
-        // cos(angular_distance) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
-        cos_angular_distance = _mm256_fmadd_ps(_mm256_mul_ps(cos_reduced_first, cos_reduced_second), cos_lambda,
-                                               _mm256_mul_ps(sin_reduced_first, sin_reduced_second));
+        // cos(angular_distance_f32x8) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
+        cos_angular_distance_f32x8 = _mm256_fmadd_ps(_mm256_mul_ps(cos_reduced_first_f32x8, cos_reduced_second_f32x8),
+                                                     cos_lambda_f32x8,
+                                                     _mm256_mul_ps(sin_reduced_first_f32x8, sin_reduced_second_f32x8));
-        // angular_distance = atan2(sin, cos)
-        angular_distance = nk_atan2_f32x8_haswell_(sin_angular_distance, cos_angular_distance);
+        // angular_distance_f32x8 = atan2(sin, cos)
+        angular_distance_f32x8 = nk_atan2_f32x8_haswell_(sin_angular_distance_f32x8, cos_angular_distance_f32x8);
-        // sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance)
+        // sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance_f32x8)
         // Avoid division by zero by using blending
-        __m256 safe_sin_angular = _mm256_blendv_ps(sin_angular_distance, one, coincident_mask);
-        sin_azimuth = _mm256_div_ps(_mm256_mul_ps(_mm256_mul_ps(cos_reduced_first, cos_reduced_second), sin_lambda),
-                                    safe_sin_angular);
-        cos_squared_azimuth = _mm256_sub_ps(one, _mm256_mul_ps(sin_azimuth, sin_azimuth));
+        __m256 safe_sin_angular_f32x8 = _mm256_blendv_ps(sin_angular_distance_f32x8, one_f32x8, coincident_mask_f32x8);
+        sin_azimuth_f32x8 = _mm256_div_ps(
+            _mm256_mul_ps(_mm256_mul_ps(cos_reduced_first_f32x8, cos_reduced_second_f32x8), sin_lambda_f32x8),
+            safe_sin_angular_f32x8);
+        cos_squared_azimuth_f32x8 = _mm256_sub_ps(one_f32x8, _mm256_mul_ps(sin_azimuth_f32x8, sin_azimuth_f32x8));
         // Handle equatorial case: cos²α ≈ 0
-        __m256 equatorial_mask = _mm256_cmp_ps(cos_squared_azimuth, epsilon, _CMP_LT_OS);
-        __m256 safe_cos_sq_azimuth = _mm256_blendv_ps(cos_squared_azimuth, one, equatorial_mask);
+        __m256 equatorial_mask_f32x8 = _mm256_cmp_ps(cos_squared_azimuth_f32x8, epsilon_f32x8, _CMP_LT_OS);
+        __m256 safe_cos_sq_azimuth_f32x8 = _mm256_blendv_ps(cos_squared_azimuth_f32x8, one_f32x8,
+                                                            equatorial_mask_f32x8);
         // cos(2σₘ) = cos(σ) - 2 × sin(U₁) × sin(U₂) / cos²(α)
-        __m256 sin_product = _mm256_mul_ps(sin_reduced_first, sin_reduced_second);
-        cos_double_angular_midpoint = _mm256_sub_ps(
-            cos_angular_distance, _mm256_div_ps(_mm256_mul_ps(two, sin_product), safe_cos_sq_azimuth));
-        cos_double_angular_midpoint = _mm256_blendv_ps(cos_double_angular_midpoint, _mm256_setzero_ps(),
-                                                       equatorial_mask);
+        __m256 sin_product_f32x8 = _mm256_mul_ps(sin_reduced_first_f32x8, sin_reduced_second_f32x8);
+        cos_double_angular_midpoint_f32x8 = _mm256_sub_ps(
+            cos_angular_distance_f32x8,
+            _mm256_div_ps(_mm256_mul_ps(two_f32x8, sin_product_f32x8), safe_cos_sq_azimuth_f32x8));
+        cos_double_angular_midpoint_f32x8 = _mm256_blendv_ps(cos_double_angular_midpoint_f32x8, _mm256_setzero_ps(),
+                                                             equatorial_mask_f32x8);
         // C = f/16 * cos²α * (4 + f*(4 - 3*cos²α))
-        __m256 correction_factor = _mm256_mul_ps(
-            _mm256_div_ps(flattening, sixteen),
-            _mm256_mul_ps(cos_squared_azimuth,
-                          _mm256_fmadd_ps(flattening, _mm256_fnmadd_ps(three, cos_squared_azimuth, four), four)));
+        __m256 correction_factor_f32x8 = _mm256_mul_ps(
+            _mm256_div_ps(flattening_f32x8, sixteen_f32x8),
+            _mm256_mul_ps(
+                cos_squared_azimuth_f32x8,
+                _mm256_fmadd_ps(flattening_f32x8, _mm256_fnmadd_ps(three_f32x8, cos_squared_azimuth_f32x8, four_f32x8),
+                                four_f32x8)));
         // λ' = L + (1-C) × f × sin(α) × (σ + C × sin(σ) × (cos(2σₘ) + C × cos(σ) × (-1 + 2 × cos²(2σₘ))))
-        __m256 cos_2sm_sq = _mm256_mul_ps(cos_double_angular_midpoint, cos_double_angular_midpoint);
-        // innermost = -1 + 2 × cos²(2σₘ)
-        __m256 innermost = _mm256_fmadd_ps(two, cos_2sm_sq, _mm256_set1_ps(-1.0f));
-        // middle = cos(2σₘ) + C × cos(σ) × innermost
-        __m256 middle = _mm256_fmadd_ps(_mm256_mul_ps(correction_factor, cos_angular_distance), innermost,
-                                        cos_double_angular_midpoint);
-        // inner = C × sin(σ) × middle
-        __m256 inner = _mm256_mul_ps(_mm256_mul_ps(correction_factor, sin_angular_distance), middle);
-        // λ' = L + (1-C) * f * sin_α * (σ + inner)
-        __m256 lambda_new = _mm256_fmadd_ps(
-            _mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(one, correction_factor), flattening), sin_azimuth),
-            _mm256_add_ps(angular_distance, inner), longitude_difference);
+        __m256 cos_2sm_sq_f32x8 = _mm256_mul_ps(cos_double_angular_midpoint_f32x8, cos_double_angular_midpoint_f32x8);
+        // innermost_f32x8 = -1 + 2 × cos²(2σₘ)
+        __m256 innermost_f32x8 = _mm256_fmadd_ps(two_f32x8, cos_2sm_sq_f32x8, _mm256_set1_ps(-1.0f));
+        // middle_f32x8 = cos(2σₘ) + C × cos(σ) × innermost_f32x8
+        __m256 middle_f32x8 = _mm256_fmadd_ps(_mm256_mul_ps(correction_factor_f32x8, cos_angular_distance_f32x8),
+                                              innermost_f32x8, cos_double_angular_midpoint_f32x8);
+        // inner_f32x8 = C × sin(σ) × middle_f32x8
+        __m256 inner_f32x8 = _mm256_mul_ps(_mm256_mul_ps(correction_factor_f32x8, sin_angular_distance_f32x8),
+                                           middle_f32x8);
+        // λ' = L + (1-C) * f * sin_α * (σ + inner_f32x8)
+        __m256 lambda_new_f32x8 = _mm256_fmadd_ps(
+            _mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(one_f32x8, correction_factor_f32x8), flattening_f32x8),
+                          sin_azimuth_f32x8),
+            _mm256_add_ps(angular_distance_f32x8, inner_f32x8), longitude_difference_f32x8);
         // Check convergence: |λ - λ'| < threshold
-        __m256 lambda_diff_abs = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), _mm256_sub_ps(lambda_new, lambda));
-        __m256 newly_converged = _mm256_cmp_ps(lambda_diff_abs, convergence_threshold, _CMP_LT_OS);
-        converged_mask = _mm256_or_ps(converged_mask, newly_converged);
+        __m256 lambda_diff_abs_f32x8 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f),
+                                                        _mm256_sub_ps(lambda_new_f32x8, lambda_f32x8));
+        __m256 newly_converged_f32x8 = _mm256_cmp_ps(lambda_diff_abs_f32x8, convergence_threshold_f32x8, _CMP_LT_OS);
+        converged_mask_f32x8 = _mm256_or_ps(converged_mask_f32x8, newly_converged_f32x8);
-        // Only update lambda for non-converged lanes
-        lambda = _mm256_blendv_ps(lambda_new, lambda, converged_mask);
+        // Only update lambda_f32x8 for non-converged lanes
+        lambda_f32x8 = _mm256_blendv_ps(lambda_new_f32x8, lambda_f32x8, converged_mask_f32x8);
     }
     // Final distance calculation
     // u² = cos²α * (a² - b²) / b²
-    __m256 a_sq = _mm256_mul_ps(equatorial_radius, equatorial_radius);
-    __m256 b_sq = _mm256_mul_ps(polar_radius, polar_radius);
-    __m256 u_squared = _mm256_div_ps(_mm256_mul_ps(cos_squared_azimuth, _mm256_sub_ps(a_sq, b_sq)), b_sq);
+    __m256 a_sq_f32x8 = _mm256_mul_ps(equatorial_radius_f32x8, equatorial_radius_f32x8);
+    __m256 b_sq_f32x8 = _mm256_mul_ps(polar_radius_f32x8, polar_radius_f32x8);
+    __m256 u_squared_f32x8 = _mm256_div_ps(
+        _mm256_mul_ps(cos_squared_azimuth_f32x8, _mm256_sub_ps(a_sq_f32x8, b_sq_f32x8)), b_sq_f32x8);
     // A = 1 + u²/16384 * (4096 + u²*(-768 + u²*(320 - 175*u²)))
-    __m256 series_a = _mm256_fmadd_ps(u_squared, _mm256_set1_ps(-175.0f), _mm256_set1_ps(320.0f));
-    series_a = _mm256_fmadd_ps(u_squared, series_a, _mm256_set1_ps(-768.0f));
-    series_a = _mm256_fmadd_ps(u_squared, series_a, _mm256_set1_ps(4096.0f));
-    series_a = _mm256_fmadd_ps(_mm256_div_ps(u_squared, _mm256_set1_ps(16384.0f)), series_a, one);
+    __m256 series_a_f32x8 = _mm256_fmadd_ps(u_squared_f32x8, _mm256_set1_ps(-175.0f), _mm256_set1_ps(320.0f));
+    series_a_f32x8 = _mm256_fmadd_ps(u_squared_f32x8, series_a_f32x8, _mm256_set1_ps(-768.0f));
+    series_a_f32x8 = _mm256_fmadd_ps(u_squared_f32x8, series_a_f32x8, _mm256_set1_ps(4096.0f));
+    series_a_f32x8 = _mm256_fmadd_ps(_mm256_div_ps(u_squared_f32x8, _mm256_set1_ps(16384.0f)), series_a_f32x8,
+                                     one_f32x8);
     // B = u²/1024 * (256 + u²*(-128 + u²*(74 - 47*u²)))
-    __m256 series_b = _mm256_fmadd_ps(u_squared, _mm256_set1_ps(-47.0f), _mm256_set1_ps(74.0f));
-    series_b = _mm256_fmadd_ps(u_squared, series_b, _mm256_set1_ps(-128.0f));
-    series_b = _mm256_fmadd_ps(u_squared, series_b, _mm256_set1_ps(256.0f));
-    series_b = _mm256_mul_ps(_mm256_div_ps(u_squared, _mm256_set1_ps(1024.0f)), series_b);
+    __m256 series_b_f32x8 = _mm256_fmadd_ps(u_squared_f32x8, _mm256_set1_ps(-47.0f), _mm256_set1_ps(74.0f));
+    series_b_f32x8 = _mm256_fmadd_ps(u_squared_f32x8, series_b_f32x8, _mm256_set1_ps(-128.0f));
+    series_b_f32x8 = _mm256_fmadd_ps(u_squared_f32x8, series_b_f32x8, _mm256_set1_ps(256.0f));
+    series_b_f32x8 = _mm256_mul_ps(_mm256_div_ps(u_squared_f32x8, _mm256_set1_ps(1024.0f)), series_b_f32x8);
     // Δσ = B × sin(σ) × (cos(2σₘ) +
     //      B/4 × (cos(σ) × (-1 + 2 × cos²(2σₘ)) - B/6 × cos(2σₘ) × (-3 + 4 × sin²(σ)) × (-3 + 4 × cos²(2σₘ))))
-    __m256 cos_2sm_sq = _mm256_mul_ps(cos_double_angular_midpoint, cos_double_angular_midpoint);
-    __m256 sin_sq = _mm256_mul_ps(sin_angular_distance, sin_angular_distance);
-    __m256 term1 = _mm256_fmadd_ps(two, cos_2sm_sq, _mm256_set1_ps(-1.0f));
-    term1 = _mm256_mul_ps(cos_angular_distance, term1);
-    __m256 term2 = _mm256_fmadd_ps(four, sin_sq, _mm256_set1_ps(-3.0f));
-    __m256 term3 = _mm256_fmadd_ps(four, cos_2sm_sq, _mm256_set1_ps(-3.0f));
-    term2 = _mm256_mul_ps(_mm256_mul_ps(_mm256_div_ps(series_b, six), cos_double_angular_midpoint),
-                          _mm256_mul_ps(term2, term3));
-    __m256 delta_sigma = _mm256_mul_ps(
-        series_b, _mm256_mul_ps(sin_angular_distance, _mm256_add_ps(cos_double_angular_midpoint,
-                                                                    _mm256_mul_ps(_mm256_div_ps(series_b, four),
-                                                                                  _mm256_sub_ps(term1, term2)))));
+    __m256 cos_2sm_sq_f32x8 = _mm256_mul_ps(cos_double_angular_midpoint_f32x8, cos_double_angular_midpoint_f32x8);
+    __m256 sin_sq_f32x8 = _mm256_mul_ps(sin_angular_distance_f32x8, sin_angular_distance_f32x8);
+    __m256 term1_f32x8 = _mm256_fmadd_ps(two_f32x8, cos_2sm_sq_f32x8, _mm256_set1_ps(-1.0f));
+    term1_f32x8 = _mm256_mul_ps(cos_angular_distance_f32x8, term1_f32x8);
+    __m256 term2_f32x8 = _mm256_fmadd_ps(four_f32x8, sin_sq_f32x8, _mm256_set1_ps(-3.0f));
+    __m256 term3_f32x8 = _mm256_fmadd_ps(four_f32x8, cos_2sm_sq_f32x8, _mm256_set1_ps(-3.0f));
+    term2_f32x8 = _mm256_mul_ps(
+        _mm256_mul_ps(_mm256_div_ps(series_b_f32x8, six_f32x8), cos_double_angular_midpoint_f32x8),
+        _mm256_mul_ps(term2_f32x8, term3_f32x8));
+    __m256 delta_sigma_f32x8 = _mm256_mul_ps(
+        series_b_f32x8, _mm256_mul_ps(sin_angular_distance_f32x8,
+                                      _mm256_add_ps(cos_double_angular_midpoint_f32x8,
+                                                    _mm256_mul_ps(_mm256_div_ps(series_b_f32x8, four_f32x8),
+                                                                  _mm256_sub_ps(term1_f32x8, term2_f32x8)))));
     // s = b * A * (σ - Δσ)
-    __m256 distances = _mm256_mul_ps(_mm256_mul_ps(polar_radius, series_a),
-                                     _mm256_sub_ps(angular_distance, delta_sigma));
+    __m256 distances_f32x8 = _mm256_mul_ps(_mm256_mul_ps(polar_radius_f32x8, series_a_f32x8),
+                                           _mm256_sub_ps(angular_distance_f32x8, delta_sigma_f32x8));
     // Set coincident points to zero
-    distances = _mm256_blendv_ps(distances, _mm256_setzero_ps(), coincident_mask);
+    distances_f32x8 = _mm256_blendv_ps(distances_f32x8, _mm256_setzero_ps(), coincident_mask_f32x8);
-    return distances;
+    return distances_f32x8;
 }
 NK_PUBLIC void nk_vincenty_f32_haswell(             //
@@ -552,14 +592,14 @@ NK_PUBLIC void nk_vincenty_f32_haswell(             //
     nk_size_t n, nk_f32_t *results) {
     while (n >= 8) {
-        __m256 first_latitudes = _mm256_loadu_ps(a_lats);
-        __m256 first_longitudes = _mm256_loadu_ps(a_lons);
-        __m256 second_latitudes = _mm256_loadu_ps(b_lats);
-        __m256 second_longitudes = _mm256_loadu_ps(b_lons);
+        __m256 first_latitudes_f32x8 = _mm256_loadu_ps(a_lats);
+        __m256 first_longitudes_f32x8 = _mm256_loadu_ps(a_lons);
+        __m256 second_latitudes_f32x8 = _mm256_loadu_ps(b_lats);
+        __m256 second_longitudes_f32x8 = _mm256_loadu_ps(b_lons);
-        __m256 distances = nk_vincenty_f32x8_haswell_(first_latitudes, first_longitudes, second_latitudes,
-                                                      second_longitudes);
-        _mm256_storeu_ps(results, distances);
+        __m256 distances_f32x8 = nk_vincenty_f32x8_haswell_(first_latitudes_f32x8, first_longitudes_f32x8,
+                                                            second_latitudes_f32x8, second_longitudes_f32x8);
+        _mm256_storeu_ps(results, distances_f32x8);
         a_lats += 8, a_lons += 8, b_lats += 8, b_lons += 8, results += 8, n -= 8;
     }
@@ -571,9 +611,9 @@ NK_PUBLIC void nk_vincenty_f32_haswell(             //
         nk_partial_load_b32x8_serial_(a_lons, &a_lon_vec, n);
         nk_partial_load_b32x8_serial_(b_lats, &b_lat_vec, n);
         nk_partial_load_b32x8_serial_(b_lons, &b_lon_vec, n);
-        __m256 distances = nk_vincenty_f32x8_haswell_(a_lat_vec.ymm_ps, a_lon_vec.ymm_ps, b_lat_vec.ymm_ps,
-                                                      b_lon_vec.ymm_ps);
-        result_vec.ymm_ps = distances;
+        __m256 distances_f32x8 = nk_vincenty_f32x8_haswell_(a_lat_vec.ymm_ps, a_lon_vec.ymm_ps, b_lat_vec.ymm_ps,
+                                                            b_lon_vec.ymm_ps);
+        result_vec.ymm_ps = distances_f32x8;
         nk_partial_store_b32x8_serial_(&result_vec, results, n);
     }
 }