npm - numkong - Versions diffs - 7.0.0 → 7.4.1 - Mend

numkong 7.0.0 → 7.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (315) hide show

package/README.md +239 -122
package/binding.gyp +25 -491
package/c/dispatch_bf16.c +59 -1
package/c/dispatch_e2m3.c +41 -8
package/c/dispatch_e3m2.c +49 -8
package/c/dispatch_e4m3.c +51 -9
package/c/dispatch_e5m2.c +45 -1
package/c/dispatch_f16.c +79 -26
package/c/dispatch_f16c.c +5 -5
package/c/dispatch_f32.c +56 -0
package/c/dispatch_f64.c +52 -0
package/c/dispatch_i4.c +3 -0
package/c/dispatch_i8.c +62 -3
package/c/dispatch_other.c +18 -0
package/c/dispatch_u1.c +54 -9
package/c/dispatch_u4.c +3 -0
package/c/dispatch_u8.c +64 -3
package/c/numkong.c +3 -0
package/include/README.md +79 -9
package/include/numkong/attention/sapphireamx.h +278 -276
package/include/numkong/attention/sme.h +983 -977
package/include/numkong/attention.h +1 -1
package/include/numkong/capabilities.h +289 -94
package/include/numkong/cast/README.md +40 -40
package/include/numkong/cast/diamond.h +64 -0
package/include/numkong/cast/haswell.h +42 -194
package/include/numkong/cast/icelake.h +42 -37
package/include/numkong/cast/loongsonasx.h +252 -0
package/include/numkong/cast/neon.h +216 -249
package/include/numkong/cast/powervsx.h +449 -0
package/include/numkong/cast/rvv.h +223 -274
package/include/numkong/cast/sapphire.h +18 -18
package/include/numkong/cast/serial.h +1018 -944
package/include/numkong/cast/skylake.h +82 -23
package/include/numkong/cast/v128relaxed.h +462 -105
package/include/numkong/cast.h +24 -0
package/include/numkong/cast.hpp +44 -0
package/include/numkong/curved/README.md +17 -17
package/include/numkong/curved/neon.h +131 -7
package/include/numkong/curved/neonbfdot.h +6 -7
package/include/numkong/curved/rvv.h +26 -26
package/include/numkong/curved/smef64.h +186 -182
package/include/numkong/curved.h +14 -18
package/include/numkong/dot/README.md +154 -137
package/include/numkong/dot/alder.h +43 -43
package/include/numkong/dot/diamond.h +158 -0
package/include/numkong/dot/genoa.h +4 -30
package/include/numkong/dot/haswell.h +215 -180
package/include/numkong/dot/icelake.h +190 -76
package/include/numkong/dot/loongsonasx.h +671 -0
package/include/numkong/dot/neon.h +124 -73
package/include/numkong/dot/neonbfdot.h +11 -12
package/include/numkong/dot/neonfhm.h +44 -46
package/include/numkong/dot/neonfp8.h +323 -0
package/include/numkong/dot/neonsdot.h +190 -76
package/include/numkong/dot/powervsx.h +752 -0
package/include/numkong/dot/rvv.h +92 -84
package/include/numkong/dot/rvvbf16.h +12 -12
package/include/numkong/dot/rvvhalf.h +12 -12
package/include/numkong/dot/sapphire.h +4 -4
package/include/numkong/dot/serial.h +66 -30
package/include/numkong/dot/sierra.h +31 -31
package/include/numkong/dot/skylake.h +142 -110
package/include/numkong/dot/sve.h +217 -177
package/include/numkong/dot/svebfdot.h +10 -10
package/include/numkong/dot/svehalf.h +85 -41
package/include/numkong/dot/svesdot.h +89 -0
package/include/numkong/dot/v128relaxed.h +124 -89
package/include/numkong/dot.h +114 -48
package/include/numkong/dots/README.md +203 -203
package/include/numkong/dots/alder.h +12 -9
package/include/numkong/dots/diamond.h +86 -0
package/include/numkong/dots/genoa.h +10 -4
package/include/numkong/dots/haswell.h +63 -48
package/include/numkong/dots/icelake.h +27 -18
package/include/numkong/dots/loongsonasx.h +176 -0
package/include/numkong/dots/neon.h +14 -11
package/include/numkong/dots/neonbfdot.h +4 -3
package/include/numkong/dots/neonfhm.h +11 -9
package/include/numkong/dots/neonfp8.h +99 -0
package/include/numkong/dots/neonsdot.h +48 -12
package/include/numkong/dots/powervsx.h +194 -0
package/include/numkong/dots/rvv.h +451 -344
package/include/numkong/dots/sapphireamx.h +1028 -984
package/include/numkong/dots/serial.h +213 -197
package/include/numkong/dots/sierra.h +10 -7
package/include/numkong/dots/skylake.h +47 -36
package/include/numkong/dots/sme.h +2001 -2364
package/include/numkong/dots/smebi32.h +175 -162
package/include/numkong/dots/smef64.h +328 -323
package/include/numkong/dots/v128relaxed.h +64 -41
package/include/numkong/dots.h +573 -293
package/include/numkong/dots.hpp +45 -43
package/include/numkong/each/README.md +133 -137
package/include/numkong/each/haswell.h +6 -6
package/include/numkong/each/icelake.h +7 -7
package/include/numkong/each/neon.h +76 -42
package/include/numkong/each/neonbfdot.h +11 -12
package/include/numkong/each/neonhalf.h +24 -116
package/include/numkong/each/rvv.h +28 -28
package/include/numkong/each/sapphire.h +27 -161
package/include/numkong/each/serial.h +6 -6
package/include/numkong/each/skylake.h +7 -7
package/include/numkong/each/v128relaxed.h +562 -0
package/include/numkong/each.h +148 -62
package/include/numkong/each.hpp +2 -2
package/include/numkong/geospatial/README.md +18 -18
package/include/numkong/geospatial/haswell.h +365 -325
package/include/numkong/geospatial/neon.h +350 -306
package/include/numkong/geospatial/rvv.h +4 -4
package/include/numkong/geospatial/skylake.h +376 -340
package/include/numkong/geospatial/v128relaxed.h +366 -327
package/include/numkong/geospatial.h +17 -17
package/include/numkong/matrix.hpp +4 -4
package/include/numkong/maxsim/README.md +14 -14
package/include/numkong/maxsim/alder.h +6 -6
package/include/numkong/maxsim/genoa.h +4 -4
package/include/numkong/maxsim/haswell.h +6 -6
package/include/numkong/maxsim/icelake.h +18 -18
package/include/numkong/maxsim/neonsdot.h +21 -21
package/include/numkong/maxsim/sapphireamx.h +14 -14
package/include/numkong/maxsim/serial.h +6 -6
package/include/numkong/maxsim/sme.h +221 -196
package/include/numkong/maxsim/v128relaxed.h +6 -6
package/include/numkong/mesh/README.md +62 -56
package/include/numkong/mesh/haswell.h +339 -464
package/include/numkong/mesh/neon.h +1100 -519
package/include/numkong/mesh/neonbfdot.h +36 -68
package/include/numkong/mesh/rvv.h +530 -435
package/include/numkong/mesh/serial.h +75 -91
package/include/numkong/mesh/skylake.h +1627 -302
package/include/numkong/mesh/v128relaxed.h +443 -330
package/include/numkong/mesh.h +63 -49
package/include/numkong/mesh.hpp +4 -4
package/include/numkong/numkong.h +3 -3
package/include/numkong/numkong.hpp +1 -0
package/include/numkong/probability/README.md +23 -19
package/include/numkong/probability/neon.h +82 -52
package/include/numkong/probability/rvv.h +28 -23
package/include/numkong/probability/serial.h +51 -39
package/include/numkong/probability.h +20 -23
package/include/numkong/random.h +1 -1
package/include/numkong/reduce/README.md +143 -138
package/include/numkong/reduce/alder.h +81 -77
package/include/numkong/reduce/haswell.h +222 -220
package/include/numkong/reduce/neon.h +629 -519
package/include/numkong/reduce/neonbfdot.h +7 -218
package/include/numkong/reduce/neonfhm.h +9 -381
package/include/numkong/reduce/neonsdot.h +9 -9
package/include/numkong/reduce/rvv.h +928 -802
package/include/numkong/reduce/serial.h +23 -27
package/include/numkong/reduce/sierra.h +20 -20
package/include/numkong/reduce/skylake.h +326 -324
package/include/numkong/reduce/v128relaxed.h +52 -52
package/include/numkong/reduce.h +4 -23
package/include/numkong/reduce.hpp +156 -11
package/include/numkong/scalar/README.md +6 -6
package/include/numkong/scalar/haswell.h +26 -17
package/include/numkong/scalar/loongsonasx.h +74 -0
package/include/numkong/scalar/neon.h +9 -9
package/include/numkong/scalar/powervsx.h +96 -0
package/include/numkong/scalar/rvv.h +2 -2
package/include/numkong/scalar/sapphire.h +21 -10
package/include/numkong/scalar/serial.h +21 -21
package/include/numkong/scalar.h +13 -0
package/include/numkong/set/README.md +28 -28
package/include/numkong/set/haswell.h +12 -12
package/include/numkong/set/icelake.h +14 -14
package/include/numkong/set/loongsonasx.h +181 -0
package/include/numkong/set/neon.h +17 -18
package/include/numkong/set/powervsx.h +326 -0
package/include/numkong/set/rvv.h +4 -4
package/include/numkong/set/serial.h +6 -6
package/include/numkong/set/sve.h +60 -59
package/include/numkong/set/v128relaxed.h +6 -6
package/include/numkong/set.h +21 -7
package/include/numkong/sets/README.md +26 -26
package/include/numkong/sets/loongsonasx.h +52 -0
package/include/numkong/sets/powervsx.h +65 -0
package/include/numkong/sets/smebi32.h +395 -364
package/include/numkong/sets.h +83 -40
package/include/numkong/sparse/README.md +4 -4
package/include/numkong/sparse/icelake.h +101 -101
package/include/numkong/sparse/serial.h +1 -1
package/include/numkong/sparse/sve2.h +137 -141
package/include/numkong/sparse/turin.h +12 -12
package/include/numkong/sparse.h +10 -10
package/include/numkong/spatial/README.md +230 -226
package/include/numkong/spatial/alder.h +113 -116
package/include/numkong/spatial/diamond.h +240 -0
package/include/numkong/spatial/genoa.h +0 -68
package/include/numkong/spatial/haswell.h +74 -55
package/include/numkong/spatial/icelake.h +539 -58
package/include/numkong/spatial/loongsonasx.h +483 -0
package/include/numkong/spatial/neon.h +125 -52
package/include/numkong/spatial/neonbfdot.h +8 -9
package/include/numkong/spatial/neonfp8.h +258 -0
package/include/numkong/spatial/neonsdot.h +180 -12
package/include/numkong/spatial/powervsx.h +738 -0
package/include/numkong/spatial/rvv.h +146 -139
package/include/numkong/spatial/rvvbf16.h +17 -12
package/include/numkong/spatial/rvvhalf.h +13 -10
package/include/numkong/spatial/serial.h +13 -12
package/include/numkong/spatial/sierra.h +232 -39
package/include/numkong/spatial/skylake.h +73 -74
package/include/numkong/spatial/sve.h +93 -72
package/include/numkong/spatial/svebfdot.h +29 -29
package/include/numkong/spatial/svehalf.h +52 -26
package/include/numkong/spatial/svesdot.h +142 -0
package/include/numkong/spatial/v128relaxed.h +293 -41
package/include/numkong/spatial.h +338 -82
package/include/numkong/spatials/README.md +194 -194
package/include/numkong/spatials/diamond.h +82 -0
package/include/numkong/spatials/haswell.h +2 -2
package/include/numkong/spatials/loongsonasx.h +153 -0
package/include/numkong/spatials/neonfp8.h +111 -0
package/include/numkong/spatials/neonsdot.h +34 -0
package/include/numkong/spatials/powervsx.h +153 -0
package/include/numkong/spatials/rvv.h +259 -243
package/include/numkong/spatials/sapphireamx.h +173 -173
package/include/numkong/spatials/serial.h +2 -2
package/include/numkong/spatials/skylake.h +2 -2
package/include/numkong/spatials/sme.h +590 -605
package/include/numkong/spatials/smef64.h +139 -130
package/include/numkong/spatials/v128relaxed.h +2 -2
package/include/numkong/spatials.h +820 -500
package/include/numkong/spatials.hpp +49 -48
package/include/numkong/tensor.hpp +406 -17
package/include/numkong/trigonometry/README.md +19 -19
package/include/numkong/trigonometry/haswell.h +402 -401
package/include/numkong/trigonometry/neon.h +386 -387
package/include/numkong/trigonometry/rvv.h +52 -51
package/include/numkong/trigonometry/serial.h +13 -13
package/include/numkong/trigonometry/skylake.h +373 -369
package/include/numkong/trigonometry/v128relaxed.h +375 -374
package/include/numkong/trigonometry.h +13 -13
package/include/numkong/trigonometry.hpp +2 -2
package/include/numkong/types.h +287 -49
package/include/numkong/types.hpp +436 -12
package/include/numkong/vector.hpp +82 -14
package/javascript/dist/cjs/numkong-wasm.js +6 -12
package/javascript/dist/cjs/numkong.d.ts +7 -1
package/javascript/dist/cjs/numkong.js +37 -11
package/javascript/dist/cjs/types.d.ts +9 -0
package/javascript/dist/cjs/types.js +96 -0
package/javascript/dist/esm/numkong-browser.d.ts +14 -0
package/javascript/dist/esm/numkong-browser.js +23 -0
package/javascript/dist/esm/numkong-wasm.js +6 -12
package/javascript/dist/esm/numkong.d.ts +7 -1
package/javascript/dist/esm/numkong.js +37 -11
package/javascript/dist/esm/types.d.ts +9 -0
package/javascript/dist/esm/types.js +96 -0
package/javascript/node-gyp-build.d.ts +4 -1
package/javascript/numkong-browser.ts +40 -0
package/javascript/numkong-wasm.ts +7 -13
package/javascript/numkong.c +5 -26
package/javascript/numkong.ts +36 -11
package/javascript/tsconfig-base.json +1 -0
package/javascript/tsconfig-cjs.json +6 -1
package/javascript/types.ts +110 -0
package/numkong.gypi +101 -0
package/package.json +34 -13
package/probes/arm_neon.c +8 -0
package/probes/arm_neon_bfdot.c +9 -0
package/probes/arm_neon_fhm.c +9 -0
package/probes/arm_neon_half.c +8 -0
package/probes/arm_neon_sdot.c +9 -0
package/probes/arm_neonfp8.c +9 -0
package/probes/arm_sme.c +16 -0
package/probes/arm_sme2.c +16 -0
package/probes/arm_sme2p1.c +16 -0
package/probes/arm_sme_bf16.c +16 -0
package/probes/arm_sme_bi32.c +16 -0
package/probes/arm_sme_f64.c +16 -0
package/probes/arm_sme_fa64.c +14 -0
package/probes/arm_sme_half.c +16 -0
package/probes/arm_sme_lut2.c +15 -0
package/probes/arm_sve.c +18 -0
package/probes/arm_sve2.c +20 -0
package/probes/arm_sve2p1.c +18 -0
package/probes/arm_sve_bfdot.c +20 -0
package/probes/arm_sve_half.c +18 -0
package/probes/arm_sve_sdot.c +21 -0
package/probes/loongarch_lasx.c +12 -0
package/probes/power_vsx.c +12 -0
package/probes/probe.js +127 -0
package/probes/riscv_rvv.c +14 -0
package/probes/riscv_rvv_bb.c +15 -0
package/probes/riscv_rvv_bf16.c +17 -0
package/probes/riscv_rvv_half.c +14 -0
package/probes/wasm_v128relaxed.c +11 -0
package/probes/x86_alder.c +17 -0
package/probes/x86_diamond.c +17 -0
package/probes/x86_genoa.c +17 -0
package/probes/x86_graniteamx.c +19 -0
package/probes/x86_haswell.c +11 -0
package/probes/x86_icelake.c +17 -0
package/probes/x86_sapphire.c +16 -0
package/probes/x86_sapphireamx.c +18 -0
package/probes/x86_sierra.c +17 -0
package/probes/x86_skylake.c +15 -0
package/probes/x86_turin.c +17 -0
package/wasm/numkong-emscripten.js +2 -0
package/wasm/numkong.d.ts +14 -0
package/wasm/numkong.js +1124 -0
package/wasm/numkong.wasm +0 -0
package/include/numkong/curved/neonhalf.h +0 -212
package/include/numkong/dot/neonhalf.h +0 -198
package/include/numkong/dots/neonhalf.h +0 -57
package/include/numkong/mesh/neonhalf.h +0 -616
package/include/numkong/reduce/neonhalf.h +0 -157
package/include/numkong/spatial/neonhalf.h +0 -118
package/include/numkong/spatial/sapphire.h +0 -343
package/include/numkong/spatials/neonhalf.h +0 -58
package/javascript/README.md +0 -246

package/include/numkong/geospatial/v128relaxed.h CHANGED Viewed

@@ -46,46 +46,50 @@ extern "C" {
  *  These require WASM trigonometric kernels from trigonometry/v128relaxed.h.
  */
-NK_INTERNAL v128_t nk_haversine_f64x2_v128relaxed_(  //
-    v128_t first_latitudes, v128_t first_longitudes, //
-    v128_t second_latitudes, v128_t second_longitudes) {
+NK_INTERNAL v128_t nk_haversine_f64x2_v128relaxed_(              //
+    v128_t first_latitudes_f64x2, v128_t first_longitudes_f64x2, //
+    v128_t second_latitudes_f64x2, v128_t second_longitudes_f64x2) {
-    v128_t const earth_radius = wasm_f64x2_splat(NK_EARTH_MEDIATORIAL_RADIUS);
-    v128_t const half = wasm_f64x2_splat(0.5);
-    v128_t const one = wasm_f64x2_splat(1.0);
-    v128_t const two = wasm_f64x2_splat(2.0);
+    v128_t const earth_radius_f64x2 = wasm_f64x2_splat(NK_EARTH_MEDIATORIAL_RADIUS);
+    v128_t const half_f64x2 = wasm_f64x2_splat(0.5);
+    v128_t const one_f64x2 = wasm_f64x2_splat(1.0);
+    v128_t const two_f64x2 = wasm_f64x2_splat(2.0);
-    v128_t latitude_delta = wasm_f64x2_sub(second_latitudes, first_latitudes);
-    v128_t longitude_delta = wasm_f64x2_sub(second_longitudes, first_longitudes);
+    v128_t latitude_delta_f64x2 = wasm_f64x2_sub(second_latitudes_f64x2, first_latitudes_f64x2);
+    v128_t longitude_delta_f64x2 = wasm_f64x2_sub(second_longitudes_f64x2, first_longitudes_f64x2);
     // Haversine terms: sin^2(delta/2)
-    v128_t latitude_delta_half = wasm_f64x2_mul(latitude_delta, half);
-    v128_t longitude_delta_half = wasm_f64x2_mul(longitude_delta, half);
-    v128_t sin_latitude_delta_half = nk_f64x2_sin_v128relaxed_(latitude_delta_half);
-    v128_t sin_longitude_delta_half = nk_f64x2_sin_v128relaxed_(longitude_delta_half);
-    v128_t sin_squared_latitude_delta_half = wasm_f64x2_mul(sin_latitude_delta_half, sin_latitude_delta_half);
-    v128_t sin_squared_longitude_delta_half = wasm_f64x2_mul(sin_longitude_delta_half, sin_longitude_delta_half);
+    v128_t latitude_delta_half_f64x2 = wasm_f64x2_mul(latitude_delta_f64x2, half_f64x2);
+    v128_t longitude_delta_half_f64x2 = wasm_f64x2_mul(longitude_delta_f64x2, half_f64x2);
+    v128_t sin_latitude_delta_half_f64x2 = nk_f64x2_sin_v128relaxed_(latitude_delta_half_f64x2);
+    v128_t sin_longitude_delta_half_f64x2 = nk_f64x2_sin_v128relaxed_(longitude_delta_half_f64x2);
+    v128_t sin_squared_latitude_delta_half_f64x2 = wasm_f64x2_mul(sin_latitude_delta_half_f64x2,
+                                                                  sin_latitude_delta_half_f64x2);
+    v128_t sin_squared_longitude_delta_half_f64x2 = wasm_f64x2_mul(sin_longitude_delta_half_f64x2,
+                                                                   sin_longitude_delta_half_f64x2);
     // Latitude cosine product
-    v128_t cos_first_latitude = nk_f64x2_cos_v128relaxed_(first_latitudes);
-    v128_t cos_second_latitude = nk_f64x2_cos_v128relaxed_(second_latitudes);
-    v128_t cos_latitude_product = wasm_f64x2_mul(cos_first_latitude, cos_second_latitude);
+    v128_t cos_first_latitude_f64x2 = nk_f64x2_cos_v128relaxed_(first_latitudes_f64x2);
+    v128_t cos_second_latitude_f64x2 = nk_f64x2_cos_v128relaxed_(second_latitudes_f64x2);
+    v128_t cos_latitude_product_f64x2 = wasm_f64x2_mul(cos_first_latitude_f64x2, cos_second_latitude_f64x2);
     // a = sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2)
-    v128_t haversine_term = wasm_f64x2_add(sin_squared_latitude_delta_half,
-                                           wasm_f64x2_mul(cos_latitude_product, sin_squared_longitude_delta_half));
-    // Clamp haversine_term to [0, 1] to prevent NaN from sqrt of negative values
-    // relaxed_min/max: 1 instruction (minpd/maxpd) vs 6-9 (with NaN/signed-zero fixup) on x86.
-    // Safe because haversine_term is a product of finite sin/cos values — NaN is impossible.
-    v128_t zero = wasm_f64x2_splat(0.0);
-    haversine_term = wasm_f64x2_relaxed_max(zero, wasm_f64x2_relaxed_min(one, haversine_term));
+    v128_t haversine_term_f64x2 = wasm_f64x2_add(
+        sin_squared_latitude_delta_half_f64x2,
+        wasm_f64x2_mul(cos_latitude_product_f64x2, sin_squared_longitude_delta_half_f64x2));
+    // Clamp haversine_term_f64x2 to [0, 1] to prevent NaN from sqrt of negative values
+    // relaxed_min/max: 1 instruction (minpd/maxpd) vs 6-9 (with NaN/signed-zero fixup) on x86.
+    // Safe because haversine_term_f64x2 is a product of finite sin/cos values — NaN is impossible.
+    v128_t zero_f64x2 = wasm_f64x2_splat(0.0);
+    haversine_term_f64x2 = wasm_f64x2_relaxed_max(zero_f64x2, wasm_f64x2_relaxed_min(one_f64x2, haversine_term_f64x2));
     // Central angle: c = 2 * atan2(sqrt(a), sqrt(1-a))
-    v128_t sqrt_haversine = wasm_f64x2_sqrt(haversine_term);
-    v128_t sqrt_complement = wasm_f64x2_sqrt(wasm_f64x2_sub(one, haversine_term));
-    v128_t central_angle = wasm_f64x2_mul(two, nk_f64x2_atan2_v128relaxed_(sqrt_haversine, sqrt_complement));
+    v128_t sqrt_haversine_f64x2 = wasm_f64x2_sqrt(haversine_term_f64x2);
+    v128_t sqrt_complement_f64x2 = wasm_f64x2_sqrt(wasm_f64x2_sub(one_f64x2, haversine_term_f64x2));
+    v128_t central_angle_f64x2 = wasm_f64x2_mul(
+        two_f64x2, nk_f64x2_atan2_v128relaxed_(sqrt_haversine_f64x2, sqrt_complement_f64x2));
-    return wasm_f64x2_mul(earth_radius, central_angle);
+    return wasm_f64x2_mul(earth_radius_f64x2, central_angle_f64x2);
 }
 NK_PUBLIC void nk_haversine_f64_v128relaxed(        //
@@ -94,14 +98,14 @@ NK_PUBLIC void nk_haversine_f64_v128relaxed(        //
     nk_size_t n, nk_f64_t *results) {
     while (n >= 2) {
-        v128_t first_latitudes = wasm_v128_load(a_lats);
-        v128_t first_longitudes = wasm_v128_load(a_lons);
-        v128_t second_latitudes = wasm_v128_load(b_lats);
-        v128_t second_longitudes = wasm_v128_load(b_lons);
+        v128_t first_latitudes_f64x2 = wasm_v128_load(a_lats);
+        v128_t first_longitudes_f64x2 = wasm_v128_load(a_lons);
+        v128_t second_latitudes_f64x2 = wasm_v128_load(b_lats);
+        v128_t second_longitudes_f64x2 = wasm_v128_load(b_lons);
-        v128_t distances = nk_haversine_f64x2_v128relaxed_(first_latitudes, first_longitudes, second_latitudes,
-                                                           second_longitudes);
-        wasm_v128_store(results, distances);
+        v128_t distances_f64x2 = nk_haversine_f64x2_v128relaxed_(first_latitudes_f64x2, first_longitudes_f64x2,
+                                                                 second_latitudes_f64x2, second_longitudes_f64x2);
+        wasm_v128_store(results, distances_f64x2);
         a_lats += 2, a_lons += 2, b_lats += 2, b_lons += 2, results += 2, n -= 2;
     }
@@ -113,54 +117,58 @@ NK_PUBLIC void nk_haversine_f64_v128relaxed(        //
         nk_partial_load_b64x2_serial_(a_lons, &a_lon_vec, n);
         nk_partial_load_b64x2_serial_(b_lats, &b_lat_vec, n);
         nk_partial_load_b64x2_serial_(b_lons, &b_lon_vec, n);
-        v128_t distances = nk_haversine_f64x2_v128relaxed_(a_lat_vec.v128, a_lon_vec.v128, b_lat_vec.v128,
-                                                           b_lon_vec.v128);
-        result_vec.v128 = distances;
+        v128_t distances_f64x2 = nk_haversine_f64x2_v128relaxed_(a_lat_vec.v128, a_lon_vec.v128, b_lat_vec.v128,
+                                                                 b_lon_vec.v128);
+        result_vec.v128 = distances_f64x2;
         nk_partial_store_b64x2_serial_(&result_vec, results, n);
     }
 }
-NK_INTERNAL v128_t nk_haversine_f32x4_v128relaxed_(  //
-    v128_t first_latitudes, v128_t first_longitudes, //
-    v128_t second_latitudes, v128_t second_longitudes) {
+NK_INTERNAL v128_t nk_haversine_f32x4_v128relaxed_(              //
+    v128_t first_latitudes_f32x4, v128_t first_longitudes_f32x4, //
+    v128_t second_latitudes_f32x4, v128_t second_longitudes_f32x4) {
-    v128_t const earth_radius = wasm_f32x4_splat((float)NK_EARTH_MEDIATORIAL_RADIUS);
-    v128_t const half = wasm_f32x4_splat(0.5f);
-    v128_t const one = wasm_f32x4_splat(1.0f);
-    v128_t const two = wasm_f32x4_splat(2.0f);
+    v128_t const earth_radius_f32x4 = wasm_f32x4_splat((float)NK_EARTH_MEDIATORIAL_RADIUS);
+    v128_t const half_f32x4 = wasm_f32x4_splat(0.5f);
+    v128_t const one_f32x4 = wasm_f32x4_splat(1.0f);
+    v128_t const two_f32x4 = wasm_f32x4_splat(2.0f);
-    v128_t latitude_delta = wasm_f32x4_sub(second_latitudes, first_latitudes);
-    v128_t longitude_delta = wasm_f32x4_sub(second_longitudes, first_longitudes);
+    v128_t latitude_delta_f32x4 = wasm_f32x4_sub(second_latitudes_f32x4, first_latitudes_f32x4);
+    v128_t longitude_delta_f32x4 = wasm_f32x4_sub(second_longitudes_f32x4, first_longitudes_f32x4);
     // Haversine terms: sin^2(delta/2)
-    v128_t latitude_delta_half = wasm_f32x4_mul(latitude_delta, half);
-    v128_t longitude_delta_half = wasm_f32x4_mul(longitude_delta, half);
-    v128_t sin_latitude_delta_half = nk_f32x4_sin_v128relaxed_(latitude_delta_half);
-    v128_t sin_longitude_delta_half = nk_f32x4_sin_v128relaxed_(longitude_delta_half);
-    v128_t sin_squared_latitude_delta_half = wasm_f32x4_mul(sin_latitude_delta_half, sin_latitude_delta_half);
-    v128_t sin_squared_longitude_delta_half = wasm_f32x4_mul(sin_longitude_delta_half, sin_longitude_delta_half);
+    v128_t latitude_delta_half_f32x4 = wasm_f32x4_mul(latitude_delta_f32x4, half_f32x4);
+    v128_t longitude_delta_half_f32x4 = wasm_f32x4_mul(longitude_delta_f32x4, half_f32x4);
+    v128_t sin_latitude_delta_half_f32x4 = nk_f32x4_sin_v128relaxed_(latitude_delta_half_f32x4);
+    v128_t sin_longitude_delta_half_f32x4 = nk_f32x4_sin_v128relaxed_(longitude_delta_half_f32x4);
+    v128_t sin_squared_latitude_delta_half_f32x4 = wasm_f32x4_mul(sin_latitude_delta_half_f32x4,
+                                                                  sin_latitude_delta_half_f32x4);
+    v128_t sin_squared_longitude_delta_half_f32x4 = wasm_f32x4_mul(sin_longitude_delta_half_f32x4,
+                                                                   sin_longitude_delta_half_f32x4);
     // Latitude cosine product
-    v128_t cos_first_latitude = nk_f32x4_cos_v128relaxed_(first_latitudes);
-    v128_t cos_second_latitude = nk_f32x4_cos_v128relaxed_(second_latitudes);
-    v128_t cos_latitude_product = wasm_f32x4_mul(cos_first_latitude, cos_second_latitude);
+    v128_t cos_first_latitude_f32x4 = nk_f32x4_cos_v128relaxed_(first_latitudes_f32x4);
+    v128_t cos_second_latitude_f32x4 = nk_f32x4_cos_v128relaxed_(second_latitudes_f32x4);
+    v128_t cos_latitude_product_f32x4 = wasm_f32x4_mul(cos_first_latitude_f32x4, cos_second_latitude_f32x4);
     // a = sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2)
-    v128_t haversine_term = wasm_f32x4_add(sin_squared_latitude_delta_half,
-                                           wasm_f32x4_mul(cos_latitude_product, sin_squared_longitude_delta_half));
+    v128_t haversine_term_f32x4 = wasm_f32x4_add(
+        sin_squared_latitude_delta_half_f32x4,
+        wasm_f32x4_mul(cos_latitude_product_f32x4, sin_squared_longitude_delta_half_f32x4));
     // Clamp to [0, 1] to avoid NaN from sqrt of negative numbers (due to floating point errors)
-    // relaxed_min/max: 1 instruction (minps/maxps) vs 6-9 (with NaN/signed-zero fixup) on x86.
-    // Safe because haversine_term is a product of finite sin/cos values — NaN is impossible.
-    v128_t zero = wasm_f32x4_splat(0.0f);
-    haversine_term = wasm_f32x4_relaxed_max(zero, wasm_f32x4_relaxed_min(one, haversine_term));
+    // relaxed_min/max: 1 instruction (minps/maxps) vs 6-9 (with NaN/signed-zero fixup) on x86.
+    // Safe because haversine_term_f32x4 is a product of finite sin/cos values — NaN is impossible.
+    v128_t zero_f32x4 = wasm_f32x4_splat(0.0f);
+    haversine_term_f32x4 = wasm_f32x4_relaxed_max(zero_f32x4, wasm_f32x4_relaxed_min(one_f32x4, haversine_term_f32x4));
     // Central angle: c = 2 * atan2(sqrt(a), sqrt(1-a))
-    v128_t sqrt_haversine = wasm_f32x4_sqrt(haversine_term);
-    v128_t sqrt_complement = wasm_f32x4_sqrt(wasm_f32x4_sub(one, haversine_term));
-    v128_t central_angle = wasm_f32x4_mul(two, nk_f32x4_atan2_v128relaxed_(sqrt_haversine, sqrt_complement));
+    v128_t sqrt_haversine_f32x4 = wasm_f32x4_sqrt(haversine_term_f32x4);
+    v128_t sqrt_complement_f32x4 = wasm_f32x4_sqrt(wasm_f32x4_sub(one_f32x4, haversine_term_f32x4));
+    v128_t central_angle_f32x4 = wasm_f32x4_mul(
+        two_f32x4, nk_f32x4_atan2_v128relaxed_(sqrt_haversine_f32x4, sqrt_complement_f32x4));
-    return wasm_f32x4_mul(earth_radius, central_angle);
+    return wasm_f32x4_mul(earth_radius_f32x4, central_angle_f32x4);
 }
 NK_PUBLIC void nk_haversine_f32_v128relaxed(        //
@@ -169,14 +177,14 @@ NK_PUBLIC void nk_haversine_f32_v128relaxed(        //
     nk_size_t n, nk_f32_t *results) {
     while (n >= 4) {
-        v128_t first_latitudes = wasm_v128_load(a_lats);
-        v128_t first_longitudes = wasm_v128_load(a_lons);
-        v128_t second_latitudes = wasm_v128_load(b_lats);
-        v128_t second_longitudes = wasm_v128_load(b_lons);
+        v128_t first_latitudes_f32x4 = wasm_v128_load(a_lats);
+        v128_t first_longitudes_f32x4 = wasm_v128_load(a_lons);
+        v128_t second_latitudes_f32x4 = wasm_v128_load(b_lats);
+        v128_t second_longitudes_f32x4 = wasm_v128_load(b_lons);
-        v128_t distances = nk_haversine_f32x4_v128relaxed_(first_latitudes, first_longitudes, second_latitudes,
-                                                           second_longitudes);
-        wasm_v128_store(results, distances);
+        v128_t distances_f32x4 = nk_haversine_f32x4_v128relaxed_(first_latitudes_f32x4, first_longitudes_f32x4,
+                                                                 second_latitudes_f32x4, second_longitudes_f32x4);
+        wasm_v128_store(results, distances_f32x4);
         a_lats += 4, a_lons += 4, b_lats += 4, b_lons += 4, results += 4, n -= 4;
     }
@@ -188,9 +196,9 @@ NK_PUBLIC void nk_haversine_f32_v128relaxed(        //
         nk_partial_load_b32x4_serial_(a_lons, &a_lon_vec, n);
         nk_partial_load_b32x4_serial_(b_lats, &b_lat_vec, n);
         nk_partial_load_b32x4_serial_(b_lons, &b_lon_vec, n);
-        v128_t distances = nk_haversine_f32x4_v128relaxed_(a_lat_vec.v128, a_lon_vec.v128, b_lat_vec.v128,
-                                                           b_lon_vec.v128);
-        result_vec.v128 = distances;
+        v128_t distances_f32x4 = nk_haversine_f32x4_v128relaxed_(a_lat_vec.v128, a_lon_vec.v128, b_lat_vec.v128,
+                                                                 b_lon_vec.v128);
+        result_vec.v128 = distances_f32x4;
         nk_partial_store_b32x4_serial_(&result_vec, results, n);
     }
 }
@@ -199,174 +207,189 @@ NK_PUBLIC void nk_haversine_f32_v128relaxed(        //
  *  @brief  WASM Relaxed SIMD helper for Vincenty's geodesic distance on 2 f64 point pairs.
  *  @note   This is a true SIMD implementation using masked convergence tracking via blending.
  */
-NK_INTERNAL v128_t nk_vincenty_f64x2_v128relaxed_(   //
-    v128_t first_latitudes, v128_t first_longitudes, //
-    v128_t second_latitudes, v128_t second_longitudes) {
-    v128_t const equatorial_radius = wasm_f64x2_splat(NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
-    v128_t const polar_radius = wasm_f64x2_splat(NK_EARTH_ELLIPSOID_POLAR_RADIUS);
-    v128_t const flattening = wasm_f64x2_splat(1.0 / NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
-    v128_t const convergence_threshold = wasm_f64x2_splat(NK_VINCENTY_CONVERGENCE_THRESHOLD_F64);
-    v128_t const one = wasm_f64x2_splat(1.0);
-    v128_t const two = wasm_f64x2_splat(2.0);
-    v128_t const three = wasm_f64x2_splat(3.0);
-    v128_t const four = wasm_f64x2_splat(4.0);
-    v128_t const six = wasm_f64x2_splat(6.0);
-    v128_t const sixteen = wasm_f64x2_splat(16.0);
-    v128_t const epsilon = wasm_f64x2_splat(1e-15);
+NK_INTERNAL v128_t nk_vincenty_f64x2_v128relaxed_(               //
+    v128_t first_latitudes_f64x2, v128_t first_longitudes_f64x2, //
+    v128_t second_latitudes_f64x2, v128_t second_longitudes_f64x2) {
+    v128_t const equatorial_radius_f64x2 = wasm_f64x2_splat(NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
+    v128_t const polar_radius_f64x2 = wasm_f64x2_splat(NK_EARTH_ELLIPSOID_POLAR_RADIUS);
+    v128_t const flattening_f64x2 = wasm_f64x2_splat(1.0 / NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
+    v128_t const convergence_threshold_f64x2 = wasm_f64x2_splat(NK_VINCENTY_CONVERGENCE_THRESHOLD_F64);
+    v128_t const one_f64x2 = wasm_f64x2_splat(1.0);
+    v128_t const two_f64x2 = wasm_f64x2_splat(2.0);
+    v128_t const three_f64x2 = wasm_f64x2_splat(3.0);
+    v128_t const four_f64x2 = wasm_f64x2_splat(4.0);
+    v128_t const six_f64x2 = wasm_f64x2_splat(6.0);
+    v128_t const sixteen_f64x2 = wasm_f64x2_splat(16.0);
+    v128_t const epsilon_f64x2 = wasm_f64x2_splat(1e-15);
     // Longitude difference
-    v128_t longitude_difference = wasm_f64x2_sub(second_longitudes, first_longitudes);
+    v128_t longitude_difference_f64x2 = wasm_f64x2_sub(second_longitudes_f64x2, first_longitudes_f64x2);
     // Reduced latitudes: tan(U) = (1-f) * tan(lat)
-    v128_t one_minus_f = wasm_f64x2_sub(one, flattening);
-    v128_t tan_first = wasm_f64x2_div(nk_f64x2_sin_v128relaxed_(first_latitudes),
-                                      nk_f64x2_cos_v128relaxed_(first_latitudes));
-    v128_t tan_second = wasm_f64x2_div(nk_f64x2_sin_v128relaxed_(second_latitudes),
-                                       nk_f64x2_cos_v128relaxed_(second_latitudes));
-    v128_t tan_reduced_first = wasm_f64x2_mul(one_minus_f, tan_first);
-    v128_t tan_reduced_second = wasm_f64x2_mul(one_minus_f, tan_second);
+    v128_t one_minus_f_f64x2 = wasm_f64x2_sub(one_f64x2, flattening_f64x2);
+    v128_t tan_first_f64x2 = wasm_f64x2_div(nk_f64x2_sin_v128relaxed_(first_latitudes_f64x2),
+                                            nk_f64x2_cos_v128relaxed_(first_latitudes_f64x2));
+    v128_t tan_second_f64x2 = wasm_f64x2_div(nk_f64x2_sin_v128relaxed_(second_latitudes_f64x2),
+                                             nk_f64x2_cos_v128relaxed_(second_latitudes_f64x2));
+    v128_t tan_reduced_first_f64x2 = wasm_f64x2_mul(one_minus_f_f64x2, tan_first_f64x2);
+    v128_t tan_reduced_second_f64x2 = wasm_f64x2_mul(one_minus_f_f64x2, tan_second_f64x2);
     // cos(U) = 1/sqrt(1 + tan^2(U)), sin(U) = tan(U) * cos(U)
-    v128_t cos_reduced_first = wasm_f64x2_div(
-        one, wasm_f64x2_sqrt(wasm_f64x2_relaxed_madd(tan_reduced_first, tan_reduced_first, one)));
-    v128_t sin_reduced_first = wasm_f64x2_mul(tan_reduced_first, cos_reduced_first);
-    v128_t cos_reduced_second = wasm_f64x2_div(
-        one, wasm_f64x2_sqrt(wasm_f64x2_relaxed_madd(tan_reduced_second, tan_reduced_second, one)));
-    v128_t sin_reduced_second = wasm_f64x2_mul(tan_reduced_second, cos_reduced_second);
-    // Initialize lambda and tracking variables
-    v128_t lambda = longitude_difference;
-    v128_t sin_angular_distance, cos_angular_distance, angular_distance;
-    v128_t sin_azimuth, cos_squared_azimuth, cos_double_angular_midpoint;
+    v128_t cos_reduced_first_f64x2 = wasm_f64x2_div(
+        one_f64x2,
+        wasm_f64x2_sqrt(wasm_f64x2_relaxed_madd(tan_reduced_first_f64x2, tan_reduced_first_f64x2, one_f64x2)));
+    v128_t sin_reduced_first_f64x2 = wasm_f64x2_mul(tan_reduced_first_f64x2, cos_reduced_first_f64x2);
+    v128_t cos_reduced_second_f64x2 = wasm_f64x2_div(
+        one_f64x2,
+        wasm_f64x2_sqrt(wasm_f64x2_relaxed_madd(tan_reduced_second_f64x2, tan_reduced_second_f64x2, one_f64x2)));
+    v128_t sin_reduced_second_f64x2 = wasm_f64x2_mul(tan_reduced_second_f64x2, cos_reduced_second_f64x2);
+    // Initialize lambda_f64x2 and tracking variables
+    v128_t lambda_f64x2 = longitude_difference_f64x2;
+    v128_t sin_angular_distance_f64x2, cos_angular_distance_f64x2, angular_distance_f64x2;
+    v128_t sin_azimuth_f64x2, cos_squared_azimuth_f64x2, cos_double_angular_midpoint_f64x2;
     // Track convergence and coincident points using masks
-    v128_t converged_mask = wasm_i64x2_splat(0);
-    v128_t coincident_mask = wasm_i64x2_splat(0);
+    v128_t converged_mask_i64x2 = wasm_i64x2_splat(0);
+    v128_t coincident_mask_i64x2 = wasm_i64x2_splat(0);
     for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS; ++iteration) {
         // Check if all lanes converged
-        if (wasm_i8x16_all_true(converged_mask)) break;
+        if (wasm_i8x16_all_true(converged_mask_i64x2)) break;
-        v128_t sin_lambda = nk_f64x2_sin_v128relaxed_(lambda);
-        v128_t cos_lambda = nk_f64x2_cos_v128relaxed_(lambda);
+        v128_t sin_lambda_f64x2 = nk_f64x2_sin_v128relaxed_(lambda_f64x2);
+        v128_t cos_lambda_f64x2 = nk_f64x2_cos_v128relaxed_(lambda_f64x2);
-        // sin^2(angular_distance) = (cos(U2) * sin(l))^2 + (cos(U1) * sin(U2) - sin(U1) * cos(U2) * cos(l))^2
-        v128_t cross_term = wasm_f64x2_mul(cos_reduced_second, sin_lambda);
-        v128_t mixed_term = wasm_f64x2_sub(
-            wasm_f64x2_mul(cos_reduced_first, sin_reduced_second),
-            wasm_f64x2_mul(wasm_f64x2_mul(sin_reduced_first, cos_reduced_second), cos_lambda));
-        v128_t sin_angular_dist_sq = wasm_f64x2_relaxed_madd(cross_term, cross_term,
-                                                             wasm_f64x2_mul(mixed_term, mixed_term));
-        sin_angular_distance = wasm_f64x2_sqrt(sin_angular_dist_sq);
+        // sin^2(angular_distance_f64x2) = (cos(U2) * sin(l))^2 + (cos(U1) * sin(U2) - sin(U1) * cos(U2) * cos(l))^2
+        v128_t cross_term_f64x2 = wasm_f64x2_mul(cos_reduced_second_f64x2, sin_lambda_f64x2);
+        v128_t mixed_term_f64x2 = wasm_f64x2_sub(
+            wasm_f64x2_mul(cos_reduced_first_f64x2, sin_reduced_second_f64x2),
+            wasm_f64x2_mul(wasm_f64x2_mul(sin_reduced_first_f64x2, cos_reduced_second_f64x2), cos_lambda_f64x2));
+        v128_t sin_angular_dist_sq_f64x2 = wasm_f64x2_relaxed_madd(cross_term_f64x2, cross_term_f64x2,
+                                                                   wasm_f64x2_mul(mixed_term_f64x2, mixed_term_f64x2));
+        sin_angular_distance_f64x2 = wasm_f64x2_sqrt(sin_angular_dist_sq_f64x2);
-        // Check for coincident points (sin_angular_distance ~ 0)
-        coincident_mask = wasm_f64x2_lt(sin_angular_distance, epsilon);
+        // Check for coincident points (sin_angular_distance_f64x2 ~ 0)
+        coincident_mask_i64x2 = wasm_f64x2_lt(sin_angular_distance_f64x2, epsilon_f64x2);
-        // cos(angular_distance) = sin(U1) * sin(U2) + cos(U1) * cos(U2) * cos(l)
-        cos_angular_distance = wasm_f64x2_relaxed_madd(wasm_f64x2_mul(cos_reduced_first, cos_reduced_second),
-                                                       cos_lambda,
-                                                       wasm_f64x2_mul(sin_reduced_first, sin_reduced_second));
+        // cos(angular_distance_f64x2) = sin(U1) * sin(U2) + cos(U1) * cos(U2) * cos(l)
+        cos_angular_distance_f64x2 = wasm_f64x2_relaxed_madd(
+            wasm_f64x2_mul(cos_reduced_first_f64x2, cos_reduced_second_f64x2), cos_lambda_f64x2,
+            wasm_f64x2_mul(sin_reduced_first_f64x2, sin_reduced_second_f64x2));
-        // angular_distance = atan2(sin, cos)
-        angular_distance = nk_f64x2_atan2_v128relaxed_(sin_angular_distance, cos_angular_distance);
+        // angular_distance_f64x2 = atan2(sin, cos)
+        angular_distance_f64x2 = nk_f64x2_atan2_v128relaxed_(sin_angular_distance_f64x2, cos_angular_distance_f64x2);
-        // sin(azimuth) = cos(U1) * cos(U2) * sin(l) / sin(angular_distance)
+        // sin(azimuth) = cos(U1) * cos(U2) * sin(l) / sin(angular_distance_f64x2)
         // Avoid division by zero by using blending
         // relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
         // Safe because mask is from comparison (all-ones or all-zeros per lane).
-        v128_t safe_sin_angular = wasm_i64x2_relaxed_laneselect(one, sin_angular_distance, coincident_mask);
-        sin_azimuth = wasm_f64x2_div(wasm_f64x2_mul(wasm_f64x2_mul(cos_reduced_first, cos_reduced_second), sin_lambda),
-                                     safe_sin_angular);
-        cos_squared_azimuth = wasm_f64x2_relaxed_nmadd(sin_azimuth, sin_azimuth, one);
+        v128_t safe_sin_angular_i64x2 = wasm_i64x2_relaxed_laneselect(one_f64x2, sin_angular_distance_f64x2,
+                                                                      coincident_mask_i64x2);
+        sin_azimuth_f64x2 = wasm_f64x2_div(
+            wasm_f64x2_mul(wasm_f64x2_mul(cos_reduced_first_f64x2, cos_reduced_second_f64x2), sin_lambda_f64x2),
+            safe_sin_angular_i64x2);
+        cos_squared_azimuth_f64x2 = wasm_f64x2_relaxed_nmadd(sin_azimuth_f64x2, sin_azimuth_f64x2, one_f64x2);
         // Handle equatorial case: cos^2(a) ~ 0
-        v128_t equatorial_mask = wasm_f64x2_lt(cos_squared_azimuth, epsilon);
-        v128_t safe_cos_sq_azimuth = wasm_i64x2_relaxed_laneselect(one, cos_squared_azimuth, equatorial_mask);
+        v128_t equatorial_mask_f64x2 = wasm_f64x2_lt(cos_squared_azimuth_f64x2, epsilon_f64x2);
+        v128_t safe_cos_sq_azimuth_i64x2 = wasm_i64x2_relaxed_laneselect(one_f64x2, cos_squared_azimuth_f64x2,
+                                                                         equatorial_mask_f64x2);
         // cos(2sm) = cos(s) - 2 * sin(U1) * sin(U2) / cos^2(a)
-        v128_t sin_product = wasm_f64x2_mul(sin_reduced_first, sin_reduced_second);
-        cos_double_angular_midpoint = wasm_f64x2_sub(
-            cos_angular_distance, wasm_f64x2_div(wasm_f64x2_mul(two, sin_product), safe_cos_sq_azimuth));
-        cos_double_angular_midpoint = wasm_i64x2_relaxed_laneselect(wasm_f64x2_splat(0.0), cos_double_angular_midpoint,
-                                                                    equatorial_mask);
+        v128_t sin_product_f64x2 = wasm_f64x2_mul(sin_reduced_first_f64x2, sin_reduced_second_f64x2);
+        cos_double_angular_midpoint_f64x2 = wasm_f64x2_sub(
+            cos_angular_distance_f64x2,
+            wasm_f64x2_div(wasm_f64x2_mul(two_f64x2, sin_product_f64x2), safe_cos_sq_azimuth_i64x2));
+        cos_double_angular_midpoint_f64x2 = wasm_i64x2_relaxed_laneselect(
+            wasm_f64x2_splat(0.0), cos_double_angular_midpoint_f64x2, equatorial_mask_f64x2);
         // C = f/16 * cos^2(a) * (4 + f*(4 - 3*cos^2(a)))
-        v128_t correction_factor = wasm_f64x2_mul(
-            wasm_f64x2_div(flattening, sixteen),
+        v128_t correction_factor_f64x2 = wasm_f64x2_mul(
+            wasm_f64x2_div(flattening_f64x2, sixteen_f64x2),
             wasm_f64x2_mul(
-                cos_squared_azimuth,
-                wasm_f64x2_relaxed_madd(flattening, wasm_f64x2_relaxed_nmadd(three, cos_squared_azimuth, four), four)));
+                cos_squared_azimuth_f64x2,
+                wasm_f64x2_relaxed_madd(flattening_f64x2,
+                                        wasm_f64x2_relaxed_nmadd(three_f64x2, cos_squared_azimuth_f64x2, four_f64x2),
+                                        four_f64x2)));
         // l' = L + (1-C) * f * sin(a) * (s + C * sin(s) * (cos(2sm) + C * cos(s) * (-1 + 2 * cos^2(2sm))))
-        v128_t cos_2sm_sq = wasm_f64x2_mul(cos_double_angular_midpoint, cos_double_angular_midpoint);
-        // innermost = -1 + 2 * cos^2(2sm)
-        v128_t innermost = wasm_f64x2_relaxed_madd(two, cos_2sm_sq, wasm_f64x2_splat(-1.0));
-        // middle = cos(2sm) + C * cos(s) * innermost
-        v128_t middle = wasm_f64x2_relaxed_madd(wasm_f64x2_mul(correction_factor, cos_angular_distance), innermost,
-                                                cos_double_angular_midpoint);
-        // inner = C * sin(s) * middle
-        v128_t inner = wasm_f64x2_mul(wasm_f64x2_mul(correction_factor, sin_angular_distance), middle);
-        // l' = L + (1-C) * f * sin_a * (s + inner)
-        v128_t lambda_new = wasm_f64x2_relaxed_madd(
-            wasm_f64x2_mul(wasm_f64x2_mul(wasm_f64x2_sub(one, correction_factor), flattening), sin_azimuth),
-            wasm_f64x2_add(angular_distance, inner), longitude_difference);
+        v128_t cos_2sm_sq_f64x2 = wasm_f64x2_mul(cos_double_angular_midpoint_f64x2, cos_double_angular_midpoint_f64x2);
+        // innermost_f64x2 = -1 + 2 * cos^2(2sm)
+        v128_t innermost_f64x2 = wasm_f64x2_relaxed_madd(two_f64x2, cos_2sm_sq_f64x2, wasm_f64x2_splat(-1.0));
+        // middle_f64x2 = cos(2sm) + C * cos(s) * innermost_f64x2
+        v128_t middle_f64x2 = wasm_f64x2_relaxed_madd(
+            wasm_f64x2_mul(correction_factor_f64x2, cos_angular_distance_f64x2), innermost_f64x2,
+            cos_double_angular_midpoint_f64x2);
+        // inner_f64x2 = C * sin(s) * middle_f64x2
+        v128_t inner_f64x2 = wasm_f64x2_mul(wasm_f64x2_mul(correction_factor_f64x2, sin_angular_distance_f64x2),
+                                            middle_f64x2);
+        // l' = L + (1-C) * f * sin_a * (s + inner_f64x2)
+        v128_t lambda_new_f64x2 = wasm_f64x2_relaxed_madd(
+            wasm_f64x2_mul(wasm_f64x2_mul(wasm_f64x2_sub(one_f64x2, correction_factor_f64x2), flattening_f64x2),
+                           sin_azimuth_f64x2),
+            wasm_f64x2_add(angular_distance_f64x2, inner_f64x2), longitude_difference_f64x2);
         // Check convergence: |l - l'| < threshold
-        v128_t lambda_diff = wasm_f64x2_sub(lambda_new, lambda);
-        v128_t lambda_diff_abs = wasm_f64x2_abs(lambda_diff);
-        v128_t newly_converged = wasm_f64x2_lt(lambda_diff_abs, convergence_threshold);
-        converged_mask = wasm_v128_or(converged_mask, newly_converged);
+        v128_t lambda_diff_f64x2 = wasm_f64x2_sub(lambda_new_f64x2, lambda_f64x2);
+        v128_t lambda_diff_abs_f64x2 = wasm_f64x2_abs(lambda_diff_f64x2);
+        v128_t newly_converged_f64x2 = wasm_f64x2_lt(lambda_diff_abs_f64x2, convergence_threshold_f64x2);
+        converged_mask_i64x2 = wasm_v128_or(converged_mask_i64x2, newly_converged_f64x2);
-        // Only update lambda for non-converged lanes
+        // Only update lambda_f64x2 for non-converged lanes
         // relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
         // Safe because mask is from comparison (all-ones or all-zeros per lane).
-        lambda = wasm_i64x2_relaxed_laneselect(lambda, lambda_new, converged_mask);
+        lambda_f64x2 = wasm_i64x2_relaxed_laneselect(lambda_f64x2, lambda_new_f64x2, converged_mask_i64x2);
     }
     // Final distance calculation
     // u^2 = cos^2(a) * (a^2 - b^2) / b^2
-    v128_t a_sq = wasm_f64x2_mul(equatorial_radius, equatorial_radius);
-    v128_t b_sq = wasm_f64x2_mul(polar_radius, polar_radius);
-    v128_t u_squared = wasm_f64x2_div(wasm_f64x2_mul(cos_squared_azimuth, wasm_f64x2_sub(a_sq, b_sq)), b_sq);
+    v128_t a_sq_f64x2 = wasm_f64x2_mul(equatorial_radius_f64x2, equatorial_radius_f64x2);
+    v128_t b_sq_f64x2 = wasm_f64x2_mul(polar_radius_f64x2, polar_radius_f64x2);
+    v128_t u_squared_f64x2 = wasm_f64x2_div(
+        wasm_f64x2_mul(cos_squared_azimuth_f64x2, wasm_f64x2_sub(a_sq_f64x2, b_sq_f64x2)), b_sq_f64x2);
     // A = 1 + u^2/16384 * (4096 + u^2*(-768 + u^2*(320 - 175*u^2)))
-    v128_t series_a = wasm_f64x2_relaxed_madd(u_squared, wasm_f64x2_splat(-175.0), wasm_f64x2_splat(320.0));
-    series_a = wasm_f64x2_relaxed_madd(u_squared, series_a, wasm_f64x2_splat(-768.0));
-    series_a = wasm_f64x2_relaxed_madd(u_squared, series_a, wasm_f64x2_splat(4096.0));
-    series_a = wasm_f64x2_relaxed_madd(wasm_f64x2_div(u_squared, wasm_f64x2_splat(16384.0)), series_a, one);
+    v128_t series_a_f64x2 = wasm_f64x2_relaxed_madd(u_squared_f64x2, wasm_f64x2_splat(-175.0), wasm_f64x2_splat(320.0));
+    series_a_f64x2 = wasm_f64x2_relaxed_madd(u_squared_f64x2, series_a_f64x2, wasm_f64x2_splat(-768.0));
+    series_a_f64x2 = wasm_f64x2_relaxed_madd(u_squared_f64x2, series_a_f64x2, wasm_f64x2_splat(4096.0));
+    series_a_f64x2 = wasm_f64x2_relaxed_madd(wasm_f64x2_div(u_squared_f64x2, wasm_f64x2_splat(16384.0)), series_a_f64x2,
+                                             one_f64x2);
     // B = u^2/1024 * (256 + u^2*(-128 + u^2*(74 - 47*u^2)))
-    v128_t series_b = wasm_f64x2_relaxed_madd(u_squared, wasm_f64x2_splat(-47.0), wasm_f64x2_splat(74.0));
-    series_b = wasm_f64x2_relaxed_madd(u_squared, series_b, wasm_f64x2_splat(-128.0));
-    series_b = wasm_f64x2_relaxed_madd(u_squared, series_b, wasm_f64x2_splat(256.0));
-    series_b = wasm_f64x2_mul(wasm_f64x2_div(u_squared, wasm_f64x2_splat(1024.0)), series_b);
+    v128_t series_b_f64x2 = wasm_f64x2_relaxed_madd(u_squared_f64x2, wasm_f64x2_splat(-47.0), wasm_f64x2_splat(74.0));
+    series_b_f64x2 = wasm_f64x2_relaxed_madd(u_squared_f64x2, series_b_f64x2, wasm_f64x2_splat(-128.0));
+    series_b_f64x2 = wasm_f64x2_relaxed_madd(u_squared_f64x2, series_b_f64x2, wasm_f64x2_splat(256.0));
+    series_b_f64x2 = wasm_f64x2_mul(wasm_f64x2_div(u_squared_f64x2, wasm_f64x2_splat(1024.0)), series_b_f64x2);
     // Delta-sigma calculation
-    v128_t cos_2sm_sq = wasm_f64x2_mul(cos_double_angular_midpoint, cos_double_angular_midpoint);
-    v128_t sin_sq = wasm_f64x2_mul(sin_angular_distance, sin_angular_distance);
-    v128_t term1 = wasm_f64x2_relaxed_madd(two, cos_2sm_sq, wasm_f64x2_splat(-1.0));
-    term1 = wasm_f64x2_mul(cos_angular_distance, term1);
-    v128_t term2 = wasm_f64x2_relaxed_madd(four, sin_sq, wasm_f64x2_splat(-3.0));
-    v128_t term3 = wasm_f64x2_relaxed_madd(four, cos_2sm_sq, wasm_f64x2_splat(-3.0));
-    term2 = wasm_f64x2_mul(wasm_f64x2_mul(wasm_f64x2_div(series_b, six), cos_double_angular_midpoint),
-                           wasm_f64x2_mul(term2, term3));
-    v128_t delta_sigma = wasm_f64x2_mul(
-        series_b, wasm_f64x2_mul(sin_angular_distance, wasm_f64x2_add(cos_double_angular_midpoint,
-                                                                      wasm_f64x2_mul(wasm_f64x2_div(series_b, four),
-                                                                                     wasm_f64x2_sub(term1, term2)))));
+    v128_t cos_2sm_sq_f64x2 = wasm_f64x2_mul(cos_double_angular_midpoint_f64x2, cos_double_angular_midpoint_f64x2);
+    v128_t sin_sq_f64x2 = wasm_f64x2_mul(sin_angular_distance_f64x2, sin_angular_distance_f64x2);
+    v128_t term1_f64x2 = wasm_f64x2_relaxed_madd(two_f64x2, cos_2sm_sq_f64x2, wasm_f64x2_splat(-1.0));
+    term1_f64x2 = wasm_f64x2_mul(cos_angular_distance_f64x2, term1_f64x2);
+    v128_t term2_f64x2 = wasm_f64x2_relaxed_madd(four_f64x2, sin_sq_f64x2, wasm_f64x2_splat(-3.0));
+    v128_t term3_f64x2 = wasm_f64x2_relaxed_madd(four_f64x2, cos_2sm_sq_f64x2, wasm_f64x2_splat(-3.0));
+    term2_f64x2 = wasm_f64x2_mul(
+        wasm_f64x2_mul(wasm_f64x2_div(series_b_f64x2, six_f64x2), cos_double_angular_midpoint_f64x2),
+        wasm_f64x2_mul(term2_f64x2, term3_f64x2));
+    v128_t delta_sigma_f64x2 = wasm_f64x2_mul(
+        series_b_f64x2, wasm_f64x2_mul(sin_angular_distance_f64x2,
+                                       wasm_f64x2_add(cos_double_angular_midpoint_f64x2,
+                                                      wasm_f64x2_mul(wasm_f64x2_div(series_b_f64x2, four_f64x2),
+                                                                     wasm_f64x2_sub(term1_f64x2, term2_f64x2)))));
     // s = b * A * (s - ds)
-    v128_t distances = wasm_f64x2_mul(wasm_f64x2_mul(polar_radius, series_a),
-                                      wasm_f64x2_sub(angular_distance, delta_sigma));
+    v128_t distances_f64x2 = wasm_f64x2_mul(wasm_f64x2_mul(polar_radius_f64x2, series_a_f64x2),
+                                            wasm_f64x2_sub(angular_distance_f64x2, delta_sigma_f64x2));
     // Set coincident points to zero
     // relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
     // Safe because mask is from comparison (all-ones or all-zeros per lane).
-    distances = wasm_i64x2_relaxed_laneselect(wasm_f64x2_splat(0.0), distances, coincident_mask);
+    distances_f64x2 = wasm_i64x2_relaxed_laneselect(wasm_f64x2_splat(0.0), distances_f64x2, coincident_mask_i64x2);
-    return distances;
+    return distances_f64x2;
 }
 NK_PUBLIC void nk_vincenty_f64_v128relaxed(         //
@@ -375,14 +398,14 @@ NK_PUBLIC void nk_vincenty_f64_v128relaxed(         //
     nk_size_t n, nk_f64_t *results) {
     while (n >= 2) {
-        v128_t first_latitudes = wasm_v128_load(a_lats);
-        v128_t first_longitudes = wasm_v128_load(a_lons);
-        v128_t second_latitudes = wasm_v128_load(b_lats);
-        v128_t second_longitudes = wasm_v128_load(b_lons);
+        v128_t first_latitudes_f64x2 = wasm_v128_load(a_lats);
+        v128_t first_longitudes_f64x2 = wasm_v128_load(a_lons);
+        v128_t second_latitudes_f64x2 = wasm_v128_load(b_lats);
+        v128_t second_longitudes_f64x2 = wasm_v128_load(b_lons);
-        v128_t distances = nk_vincenty_f64x2_v128relaxed_(first_latitudes, first_longitudes, second_latitudes,
-                                                          second_longitudes);
-        wasm_v128_store(results, distances);
+        v128_t distances_f64x2 = nk_vincenty_f64x2_v128relaxed_(first_latitudes_f64x2, first_longitudes_f64x2,
+                                                                second_latitudes_f64x2, second_longitudes_f64x2);
+        wasm_v128_store(results, distances_f64x2);
         a_lats += 2, a_lons += 2, b_lats += 2, b_lons += 2, results += 2, n -= 2;
     }
@@ -394,9 +417,9 @@ NK_PUBLIC void nk_vincenty_f64_v128relaxed(         //
         nk_partial_load_b64x2_serial_(a_lons, &a_lon_vec, n);
         nk_partial_load_b64x2_serial_(b_lats, &b_lat_vec, n);
         nk_partial_load_b64x2_serial_(b_lons, &b_lon_vec, n);
-        v128_t distances = nk_vincenty_f64x2_v128relaxed_(a_lat_vec.v128, a_lon_vec.v128, b_lat_vec.v128,
-                                                          b_lon_vec.v128);
-        result_vec.v128 = distances;
+        v128_t distances_f64x2 = nk_vincenty_f64x2_v128relaxed_(a_lat_vec.v128, a_lon_vec.v128, b_lat_vec.v128,
+                                                                b_lon_vec.v128);
+        result_vec.v128 = distances_f64x2;
         nk_partial_store_b64x2_serial_(&result_vec, results, n);
     }
 }
@@ -405,168 +428,184 @@ NK_PUBLIC void nk_vincenty_f64_v128relaxed(         //
  *  @brief  WASM Relaxed SIMD helper for Vincenty's geodesic distance on 4 f32 point pairs.
  *  @note   This is a true SIMD implementation using masked convergence tracking via blending.
  */
-NK_INTERNAL v128_t nk_vincenty_f32x4_v128relaxed_(   //
-    v128_t first_latitudes, v128_t first_longitudes, //
-    v128_t second_latitudes, v128_t second_longitudes) {
-    v128_t const equatorial_radius = wasm_f32x4_splat((float)NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
-    v128_t const polar_radius = wasm_f32x4_splat((float)NK_EARTH_ELLIPSOID_POLAR_RADIUS);
-    v128_t const flattening = wasm_f32x4_splat(1.0f / (float)NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
-    v128_t const convergence_threshold = wasm_f32x4_splat(NK_VINCENTY_CONVERGENCE_THRESHOLD_F32);
-    v128_t const one = wasm_f32x4_splat(1.0f);
-    v128_t const two = wasm_f32x4_splat(2.0f);
-    v128_t const three = wasm_f32x4_splat(3.0f);
-    v128_t const four = wasm_f32x4_splat(4.0f);
-    v128_t const six = wasm_f32x4_splat(6.0f);
-    v128_t const sixteen = wasm_f32x4_splat(16.0f);
-    v128_t const epsilon = wasm_f32x4_splat(1e-7f);
+NK_INTERNAL v128_t nk_vincenty_f32x4_v128relaxed_(               //
+    v128_t first_latitudes_f32x4, v128_t first_longitudes_f32x4, //
+    v128_t second_latitudes_f32x4, v128_t second_longitudes_f32x4) {
+    v128_t const equatorial_radius_f32x4 = wasm_f32x4_splat((float)NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
+    v128_t const polar_radius_f32x4 = wasm_f32x4_splat((float)NK_EARTH_ELLIPSOID_POLAR_RADIUS);
+    v128_t const flattening_f32x4 = wasm_f32x4_splat(1.0f / (float)NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
+    v128_t const convergence_threshold_f32x4 = wasm_f32x4_splat(NK_VINCENTY_CONVERGENCE_THRESHOLD_F32);
+    v128_t const one_f32x4 = wasm_f32x4_splat(1.0f);
+    v128_t const two_f32x4 = wasm_f32x4_splat(2.0f);
+    v128_t const three_f32x4 = wasm_f32x4_splat(3.0f);
+    v128_t const four_f32x4 = wasm_f32x4_splat(4.0f);
+    v128_t const six_f32x4 = wasm_f32x4_splat(6.0f);
+    v128_t const sixteen_f32x4 = wasm_f32x4_splat(16.0f);
+    v128_t const epsilon_f32x4 = wasm_f32x4_splat(1e-7f);
     // Longitude difference
-    v128_t longitude_difference = wasm_f32x4_sub(second_longitudes, first_longitudes);
+    v128_t longitude_difference_f32x4 = wasm_f32x4_sub(second_longitudes_f32x4, first_longitudes_f32x4);
     // Reduced latitudes: tan(U) = (1-f) * tan(lat)
-    v128_t one_minus_f = wasm_f32x4_sub(one, flattening);
-    v128_t tan_first = wasm_f32x4_div(nk_f32x4_sin_v128relaxed_(first_latitudes),
-                                      nk_f32x4_cos_v128relaxed_(first_latitudes));
-    v128_t tan_second = wasm_f32x4_div(nk_f32x4_sin_v128relaxed_(second_latitudes),
-                                       nk_f32x4_cos_v128relaxed_(second_latitudes));
-    v128_t tan_reduced_first = wasm_f32x4_mul(one_minus_f, tan_first);
-    v128_t tan_reduced_second = wasm_f32x4_mul(one_minus_f, tan_second);
+    v128_t one_minus_f_f32x4 = wasm_f32x4_sub(one_f32x4, flattening_f32x4);
+    v128_t tan_first_f32x4 = wasm_f32x4_div(nk_f32x4_sin_v128relaxed_(first_latitudes_f32x4),
+                                            nk_f32x4_cos_v128relaxed_(first_latitudes_f32x4));
+    v128_t tan_second_f32x4 = wasm_f32x4_div(nk_f32x4_sin_v128relaxed_(second_latitudes_f32x4),
+                                             nk_f32x4_cos_v128relaxed_(second_latitudes_f32x4));
+    v128_t tan_reduced_first_f32x4 = wasm_f32x4_mul(one_minus_f_f32x4, tan_first_f32x4);
+    v128_t tan_reduced_second_f32x4 = wasm_f32x4_mul(one_minus_f_f32x4, tan_second_f32x4);
     // cos(U) = 1/sqrt(1 + tan^2(U)), sin(U) = tan(U) * cos(U)
-    v128_t cos_reduced_first = wasm_f32x4_div(
-        one, wasm_f32x4_sqrt(wasm_f32x4_relaxed_madd(tan_reduced_first, tan_reduced_first, one)));
-    v128_t sin_reduced_first = wasm_f32x4_mul(tan_reduced_first, cos_reduced_first);
-    v128_t cos_reduced_second = wasm_f32x4_div(
-        one, wasm_f32x4_sqrt(wasm_f32x4_relaxed_madd(tan_reduced_second, tan_reduced_second, one)));
-    v128_t sin_reduced_second = wasm_f32x4_mul(tan_reduced_second, cos_reduced_second);
-    // Initialize lambda and tracking variables
-    v128_t lambda = longitude_difference;
-    v128_t sin_angular_distance, cos_angular_distance, angular_distance;
-    v128_t sin_azimuth, cos_squared_azimuth, cos_double_angular_midpoint;
+    v128_t cos_reduced_first_f32x4 = wasm_f32x4_div(
+        one_f32x4,
+        wasm_f32x4_sqrt(wasm_f32x4_relaxed_madd(tan_reduced_first_f32x4, tan_reduced_first_f32x4, one_f32x4)));
+    v128_t sin_reduced_first_f32x4 = wasm_f32x4_mul(tan_reduced_first_f32x4, cos_reduced_first_f32x4);
+    v128_t cos_reduced_second_f32x4 = wasm_f32x4_div(
+        one_f32x4,
+        wasm_f32x4_sqrt(wasm_f32x4_relaxed_madd(tan_reduced_second_f32x4, tan_reduced_second_f32x4, one_f32x4)));
+    v128_t sin_reduced_second_f32x4 = wasm_f32x4_mul(tan_reduced_second_f32x4, cos_reduced_second_f32x4);
+    // Initialize lambda_f32x4 and tracking variables
+    v128_t lambda_f32x4 = longitude_difference_f32x4;
+    v128_t sin_angular_distance_f32x4, cos_angular_distance_f32x4, angular_distance_f32x4;
+    v128_t sin_azimuth_f32x4, cos_squared_azimuth_f32x4, cos_double_angular_midpoint_f32x4;
     // Track convergence and coincident points using masks
-    v128_t converged_mask = wasm_i32x4_splat(0);
-    v128_t coincident_mask = wasm_i32x4_splat(0);
+    v128_t converged_mask_i32x4 = wasm_i32x4_splat(0);
+    v128_t coincident_mask_i32x4 = wasm_i32x4_splat(0);
     for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS; ++iteration) {
         // Check if all lanes converged
-        if (wasm_i8x16_all_true(converged_mask)) break;
+        if (wasm_i8x16_all_true(converged_mask_i32x4)) break;
-        v128_t sin_lambda = nk_f32x4_sin_v128relaxed_(lambda);
-        v128_t cos_lambda = nk_f32x4_cos_v128relaxed_(lambda);
+        v128_t sin_lambda_f32x4 = nk_f32x4_sin_v128relaxed_(lambda_f32x4);
+        v128_t cos_lambda_f32x4 = nk_f32x4_cos_v128relaxed_(lambda_f32x4);
-        // sin^2(angular_distance) = (cos(U2) * sin(l))^2 + (cos(U1) * sin(U2) - sin(U1) * cos(U2) * cos(l))^2
-        v128_t cross_term = wasm_f32x4_mul(cos_reduced_second, sin_lambda);
-        v128_t mixed_term = wasm_f32x4_sub(
-            wasm_f32x4_mul(cos_reduced_first, sin_reduced_second),
-            wasm_f32x4_mul(wasm_f32x4_mul(sin_reduced_first, cos_reduced_second), cos_lambda));
-        v128_t sin_angular_dist_sq = wasm_f32x4_relaxed_madd(cross_term, cross_term,
-                                                             wasm_f32x4_mul(mixed_term, mixed_term));
-        sin_angular_distance = wasm_f32x4_sqrt(sin_angular_dist_sq);
+        // sin^2(angular_distance_f32x4) = (cos(U2) * sin(l))^2 + (cos(U1) * sin(U2) - sin(U1) * cos(U2) * cos(l))^2
+        v128_t cross_term_f32x4 = wasm_f32x4_mul(cos_reduced_second_f32x4, sin_lambda_f32x4);
+        v128_t mixed_term_f32x4 = wasm_f32x4_sub(
+            wasm_f32x4_mul(cos_reduced_first_f32x4, sin_reduced_second_f32x4),
+            wasm_f32x4_mul(wasm_f32x4_mul(sin_reduced_first_f32x4, cos_reduced_second_f32x4), cos_lambda_f32x4));
+        v128_t sin_angular_dist_sq_f32x4 = wasm_f32x4_relaxed_madd(cross_term_f32x4, cross_term_f32x4,
+                                                                   wasm_f32x4_mul(mixed_term_f32x4, mixed_term_f32x4));
+        sin_angular_distance_f32x4 = wasm_f32x4_sqrt(sin_angular_dist_sq_f32x4);
-        // Check for coincident points (sin_angular_distance ~ 0)
-        coincident_mask = wasm_f32x4_lt(sin_angular_distance, epsilon);
+        // Check for coincident points (sin_angular_distance_f32x4 ~ 0)
+        coincident_mask_i32x4 = wasm_f32x4_lt(sin_angular_distance_f32x4, epsilon_f32x4);
-        // cos(angular_distance) = sin(U1) * sin(U2) + cos(U1) * cos(U2) * cos(l)
-        cos_angular_distance = wasm_f32x4_relaxed_madd(wasm_f32x4_mul(cos_reduced_first, cos_reduced_second),
-                                                       cos_lambda,
-                                                       wasm_f32x4_mul(sin_reduced_first, sin_reduced_second));
+        // cos(angular_distance_f32x4) = sin(U1) * sin(U2) + cos(U1) * cos(U2) * cos(l)
+        cos_angular_distance_f32x4 = wasm_f32x4_relaxed_madd(
+            wasm_f32x4_mul(cos_reduced_first_f32x4, cos_reduced_second_f32x4), cos_lambda_f32x4,
+            wasm_f32x4_mul(sin_reduced_first_f32x4, sin_reduced_second_f32x4));
-        // angular_distance = atan2(sin, cos)
-        angular_distance = nk_f32x4_atan2_v128relaxed_(sin_angular_distance, cos_angular_distance);
+        // angular_distance_f32x4 = atan2(sin, cos)
+        angular_distance_f32x4 = nk_f32x4_atan2_v128relaxed_(sin_angular_distance_f32x4, cos_angular_distance_f32x4);
-        // sin(azimuth) = cos(U1) * cos(U2) * sin(l) / sin(angular_distance)
+        // sin(azimuth) = cos(U1) * cos(U2) * sin(l) / sin(angular_distance_f32x4)
         // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
         // Safe because mask is from comparison (all-ones or all-zeros per lane).
-        v128_t safe_sin_angular = wasm_i32x4_relaxed_laneselect(one, sin_angular_distance, coincident_mask);
-        sin_azimuth = wasm_f32x4_div(wasm_f32x4_mul(wasm_f32x4_mul(cos_reduced_first, cos_reduced_second), sin_lambda),
-                                     safe_sin_angular);
-        cos_squared_azimuth = wasm_f32x4_relaxed_nmadd(sin_azimuth, sin_azimuth, one);
+        v128_t safe_sin_angular_i32x4 = wasm_i32x4_relaxed_laneselect(one_f32x4, sin_angular_distance_f32x4,
+                                                                      coincident_mask_i32x4);
+        sin_azimuth_f32x4 = wasm_f32x4_div(
+            wasm_f32x4_mul(wasm_f32x4_mul(cos_reduced_first_f32x4, cos_reduced_second_f32x4), sin_lambda_f32x4),
+            safe_sin_angular_i32x4);
+        cos_squared_azimuth_f32x4 = wasm_f32x4_relaxed_nmadd(sin_azimuth_f32x4, sin_azimuth_f32x4, one_f32x4);
         // Handle equatorial case: cos^2(a) ~ 0
-        v128_t equatorial_mask = wasm_f32x4_lt(cos_squared_azimuth, epsilon);
-        v128_t safe_cos_sq_azimuth = wasm_i32x4_relaxed_laneselect(one, cos_squared_azimuth, equatorial_mask);
+        v128_t equatorial_mask_f32x4 = wasm_f32x4_lt(cos_squared_azimuth_f32x4, epsilon_f32x4);
+        v128_t safe_cos_sq_azimuth_i32x4 = wasm_i32x4_relaxed_laneselect(one_f32x4, cos_squared_azimuth_f32x4,
+                                                                         equatorial_mask_f32x4);
         // cos(2sm) = cos(s) - 2 * sin(U1) * sin(U2) / cos^2(a)
-        v128_t sin_product = wasm_f32x4_mul(sin_reduced_first, sin_reduced_second);
-        cos_double_angular_midpoint = wasm_f32x4_sub(
-            cos_angular_distance, wasm_f32x4_div(wasm_f32x4_mul(two, sin_product), safe_cos_sq_azimuth));
-        cos_double_angular_midpoint = wasm_i32x4_relaxed_laneselect(wasm_f32x4_splat(0.0f), cos_double_angular_midpoint,
-                                                                    equatorial_mask);
+        v128_t sin_product_f32x4 = wasm_f32x4_mul(sin_reduced_first_f32x4, sin_reduced_second_f32x4);
+        cos_double_angular_midpoint_f32x4 = wasm_f32x4_sub(
+            cos_angular_distance_f32x4,
+            wasm_f32x4_div(wasm_f32x4_mul(two_f32x4, sin_product_f32x4), safe_cos_sq_azimuth_i32x4));
+        cos_double_angular_midpoint_f32x4 = wasm_i32x4_relaxed_laneselect(
+            wasm_f32x4_splat(0.0f), cos_double_angular_midpoint_f32x4, equatorial_mask_f32x4);
         // C = f/16 * cos^2(a) * (4 + f*(4 - 3*cos^2(a)))
-        v128_t correction_factor = wasm_f32x4_mul(
-            wasm_f32x4_div(flattening, sixteen),
+        v128_t correction_factor_f32x4 = wasm_f32x4_mul(
+            wasm_f32x4_div(flattening_f32x4, sixteen_f32x4),
             wasm_f32x4_mul(
-                cos_squared_azimuth,
-                wasm_f32x4_relaxed_madd(flattening, wasm_f32x4_relaxed_nmadd(three, cos_squared_azimuth, four), four)));
+                cos_squared_azimuth_f32x4,
+                wasm_f32x4_relaxed_madd(flattening_f32x4,
+                                        wasm_f32x4_relaxed_nmadd(three_f32x4, cos_squared_azimuth_f32x4, four_f32x4),
+                                        four_f32x4)));
         // l' = L + (1-C) * f * sin(a) * (s + C * sin(s) * (cos(2sm) + C * cos(s) * (-1 + 2 * cos^2(2sm))))
-        v128_t cos_2sm_sq = wasm_f32x4_mul(cos_double_angular_midpoint, cos_double_angular_midpoint);
-        v128_t innermost = wasm_f32x4_relaxed_madd(two, cos_2sm_sq, wasm_f32x4_splat(-1.0f));
-        v128_t middle = wasm_f32x4_relaxed_madd(wasm_f32x4_mul(correction_factor, cos_angular_distance), innermost,
-                                                cos_double_angular_midpoint);
-        v128_t inner = wasm_f32x4_mul(wasm_f32x4_mul(correction_factor, sin_angular_distance), middle);
-        v128_t lambda_new = wasm_f32x4_relaxed_madd(
-            wasm_f32x4_mul(wasm_f32x4_mul(wasm_f32x4_sub(one, correction_factor), flattening), sin_azimuth),
-            wasm_f32x4_add(angular_distance, inner), longitude_difference);
+        v128_t cos_2sm_sq_f32x4 = wasm_f32x4_mul(cos_double_angular_midpoint_f32x4, cos_double_angular_midpoint_f32x4);
+        v128_t innermost_f32x4 = wasm_f32x4_relaxed_madd(two_f32x4, cos_2sm_sq_f32x4, wasm_f32x4_splat(-1.0f));
+        v128_t middle_f32x4 = wasm_f32x4_relaxed_madd(
+            wasm_f32x4_mul(correction_factor_f32x4, cos_angular_distance_f32x4), innermost_f32x4,
+            cos_double_angular_midpoint_f32x4);
+        v128_t inner_f32x4 = wasm_f32x4_mul(wasm_f32x4_mul(correction_factor_f32x4, sin_angular_distance_f32x4),
+                                            middle_f32x4);
+        v128_t lambda_new_f32x4 = wasm_f32x4_relaxed_madd(
+            wasm_f32x4_mul(wasm_f32x4_mul(wasm_f32x4_sub(one_f32x4, correction_factor_f32x4), flattening_f32x4),
+                           sin_azimuth_f32x4),
+            wasm_f32x4_add(angular_distance_f32x4, inner_f32x4), longitude_difference_f32x4);
         // Check convergence: |l - l'| < threshold
-        v128_t lambda_diff = wasm_f32x4_sub(lambda_new, lambda);
-        v128_t lambda_diff_abs = wasm_f32x4_abs(lambda_diff);
-        v128_t newly_converged = wasm_f32x4_lt(lambda_diff_abs, convergence_threshold);
-        converged_mask = wasm_v128_or(converged_mask, newly_converged);
+        v128_t lambda_diff_f32x4 = wasm_f32x4_sub(lambda_new_f32x4, lambda_f32x4);
+        v128_t lambda_diff_abs_f32x4 = wasm_f32x4_abs(lambda_diff_f32x4);
+        v128_t newly_converged_f32x4 = wasm_f32x4_lt(lambda_diff_abs_f32x4, convergence_threshold_f32x4);
+        converged_mask_i32x4 = wasm_v128_or(converged_mask_i32x4, newly_converged_f32x4);
-        // Only update lambda for non-converged lanes
+        // Only update lambda_f32x4 for non-converged lanes
         // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
         // Safe because mask is from comparison (all-ones or all-zeros per lane).
-        lambda = wasm_i32x4_relaxed_laneselect(lambda, lambda_new, converged_mask);
+        lambda_f32x4 = wasm_i32x4_relaxed_laneselect(lambda_f32x4, lambda_new_f32x4, converged_mask_i32x4);
     }
     // Final distance calculation
-    v128_t a_sq = wasm_f32x4_mul(equatorial_radius, equatorial_radius);
-    v128_t b_sq = wasm_f32x4_mul(polar_radius, polar_radius);
-    v128_t u_squared = wasm_f32x4_div(wasm_f32x4_mul(cos_squared_azimuth, wasm_f32x4_sub(a_sq, b_sq)), b_sq);
+    v128_t a_sq_f32x4 = wasm_f32x4_mul(equatorial_radius_f32x4, equatorial_radius_f32x4);
+    v128_t b_sq_f32x4 = wasm_f32x4_mul(polar_radius_f32x4, polar_radius_f32x4);
+    v128_t u_squared_f32x4 = wasm_f32x4_div(
+        wasm_f32x4_mul(cos_squared_azimuth_f32x4, wasm_f32x4_sub(a_sq_f32x4, b_sq_f32x4)), b_sq_f32x4);
     // A = 1 + u^2/16384 * (4096 + u^2*(-768 + u^2*(320 - 175*u^2)))
-    v128_t series_a = wasm_f32x4_relaxed_madd(u_squared, wasm_f32x4_splat(-175.0f), wasm_f32x4_splat(320.0f));
-    series_a = wasm_f32x4_relaxed_madd(u_squared, series_a, wasm_f32x4_splat(-768.0f));
-    series_a = wasm_f32x4_relaxed_madd(u_squared, series_a, wasm_f32x4_splat(4096.0f));
-    series_a = wasm_f32x4_relaxed_madd(wasm_f32x4_div(u_squared, wasm_f32x4_splat(16384.0f)), series_a, one);
+    v128_t series_a_f32x4 = wasm_f32x4_relaxed_madd(u_squared_f32x4, wasm_f32x4_splat(-175.0f),
+                                                    wasm_f32x4_splat(320.0f));
+    series_a_f32x4 = wasm_f32x4_relaxed_madd(u_squared_f32x4, series_a_f32x4, wasm_f32x4_splat(-768.0f));
+    series_a_f32x4 = wasm_f32x4_relaxed_madd(u_squared_f32x4, series_a_f32x4, wasm_f32x4_splat(4096.0f));
+    series_a_f32x4 = wasm_f32x4_relaxed_madd(wasm_f32x4_div(u_squared_f32x4, wasm_f32x4_splat(16384.0f)),
+                                             series_a_f32x4, one_f32x4);
     // B = u^2/1024 * (256 + u^2*(-128 + u^2*(74 - 47*u^2)))
-    v128_t series_b = wasm_f32x4_relaxed_madd(u_squared, wasm_f32x4_splat(-47.0f), wasm_f32x4_splat(74.0f));
-    series_b = wasm_f32x4_relaxed_madd(u_squared, series_b, wasm_f32x4_splat(-128.0f));
-    series_b = wasm_f32x4_relaxed_madd(u_squared, series_b, wasm_f32x4_splat(256.0f));
-    series_b = wasm_f32x4_mul(wasm_f32x4_div(u_squared, wasm_f32x4_splat(1024.0f)), series_b);
+    v128_t series_b_f32x4 = wasm_f32x4_relaxed_madd(u_squared_f32x4, wasm_f32x4_splat(-47.0f), wasm_f32x4_splat(74.0f));
+    series_b_f32x4 = wasm_f32x4_relaxed_madd(u_squared_f32x4, series_b_f32x4, wasm_f32x4_splat(-128.0f));
+    series_b_f32x4 = wasm_f32x4_relaxed_madd(u_squared_f32x4, series_b_f32x4, wasm_f32x4_splat(256.0f));
+    series_b_f32x4 = wasm_f32x4_mul(wasm_f32x4_div(u_squared_f32x4, wasm_f32x4_splat(1024.0f)), series_b_f32x4);
     // Delta-sigma calculation
-    v128_t cos_2sm_sq = wasm_f32x4_mul(cos_double_angular_midpoint, cos_double_angular_midpoint);
-    v128_t sin_sq = wasm_f32x4_mul(sin_angular_distance, sin_angular_distance);
-    v128_t term1 = wasm_f32x4_relaxed_madd(two, cos_2sm_sq, wasm_f32x4_splat(-1.0f));
-    term1 = wasm_f32x4_mul(cos_angular_distance, term1);
-    v128_t term2 = wasm_f32x4_relaxed_madd(four, sin_sq, wasm_f32x4_splat(-3.0f));
-    v128_t term3 = wasm_f32x4_relaxed_madd(four, cos_2sm_sq, wasm_f32x4_splat(-3.0f));
-    term2 = wasm_f32x4_mul(wasm_f32x4_mul(wasm_f32x4_div(series_b, six), cos_double_angular_midpoint),
-                           wasm_f32x4_mul(term2, term3));
-    v128_t delta_sigma = wasm_f32x4_mul(
-        series_b, wasm_f32x4_mul(sin_angular_distance, wasm_f32x4_add(cos_double_angular_midpoint,
-                                                                      wasm_f32x4_mul(wasm_f32x4_div(series_b, four),
-                                                                                     wasm_f32x4_sub(term1, term2)))));
+    v128_t cos_2sm_sq_f32x4 = wasm_f32x4_mul(cos_double_angular_midpoint_f32x4, cos_double_angular_midpoint_f32x4);
+    v128_t sin_sq_f32x4 = wasm_f32x4_mul(sin_angular_distance_f32x4, sin_angular_distance_f32x4);
+    v128_t term1_f32x4 = wasm_f32x4_relaxed_madd(two_f32x4, cos_2sm_sq_f32x4, wasm_f32x4_splat(-1.0f));
+    term1_f32x4 = wasm_f32x4_mul(cos_angular_distance_f32x4, term1_f32x4);
+    v128_t term2_f32x4 = wasm_f32x4_relaxed_madd(four_f32x4, sin_sq_f32x4, wasm_f32x4_splat(-3.0f));
+    v128_t term3_f32x4 = wasm_f32x4_relaxed_madd(four_f32x4, cos_2sm_sq_f32x4, wasm_f32x4_splat(-3.0f));
+    term2_f32x4 = wasm_f32x4_mul(
+        wasm_f32x4_mul(wasm_f32x4_div(series_b_f32x4, six_f32x4), cos_double_angular_midpoint_f32x4),
+        wasm_f32x4_mul(term2_f32x4, term3_f32x4));
+    v128_t delta_sigma_f32x4 = wasm_f32x4_mul(
+        series_b_f32x4, wasm_f32x4_mul(sin_angular_distance_f32x4,
+                                       wasm_f32x4_add(cos_double_angular_midpoint_f32x4,
+                                                      wasm_f32x4_mul(wasm_f32x4_div(series_b_f32x4, four_f32x4),
+                                                                     wasm_f32x4_sub(term1_f32x4, term2_f32x4)))));
     // s = b * A * (s - ds)
-    v128_t distances = wasm_f32x4_mul(wasm_f32x4_mul(polar_radius, series_a),
-                                      wasm_f32x4_sub(angular_distance, delta_sigma));
+    v128_t distances_f32x4 = wasm_f32x4_mul(wasm_f32x4_mul(polar_radius_f32x4, series_a_f32x4),
+                                            wasm_f32x4_sub(angular_distance_f32x4, delta_sigma_f32x4));
     // Set coincident points to zero
     // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
     // Safe because mask is from comparison (all-ones or all-zeros per lane).
-    distances = wasm_i32x4_relaxed_laneselect(wasm_f32x4_splat(0.0f), distances, coincident_mask);
+    distances_f32x4 = wasm_i32x4_relaxed_laneselect(wasm_f32x4_splat(0.0f), distances_f32x4, coincident_mask_i32x4);
-    return distances;
+    return distances_f32x4;
 }
 NK_PUBLIC void nk_vincenty_f32_v128relaxed(         //
@@ -575,14 +614,14 @@ NK_PUBLIC void nk_vincenty_f32_v128relaxed(         //
     nk_size_t n, nk_f32_t *results) {
     while (n >= 4) {
-        v128_t first_latitudes = wasm_v128_load(a_lats);
-        v128_t first_longitudes = wasm_v128_load(a_lons);
-        v128_t second_latitudes = wasm_v128_load(b_lats);
-        v128_t second_longitudes = wasm_v128_load(b_lons);
+        v128_t first_latitudes_f32x4 = wasm_v128_load(a_lats);
+        v128_t first_longitudes_f32x4 = wasm_v128_load(a_lons);
+        v128_t second_latitudes_f32x4 = wasm_v128_load(b_lats);
+        v128_t second_longitudes_f32x4 = wasm_v128_load(b_lons);
-        v128_t distances = nk_vincenty_f32x4_v128relaxed_(first_latitudes, first_longitudes, second_latitudes,
-                                                          second_longitudes);
-        wasm_v128_store(results, distances);
+        v128_t distances_f32x4 = nk_vincenty_f32x4_v128relaxed_(first_latitudes_f32x4, first_longitudes_f32x4,
+                                                                second_latitudes_f32x4, second_longitudes_f32x4);
+        wasm_v128_store(results, distances_f32x4);
         a_lats += 4, a_lons += 4, b_lats += 4, b_lons += 4, results += 4, n -= 4;
     }
@@ -594,9 +633,9 @@ NK_PUBLIC void nk_vincenty_f32_v128relaxed(         //
         nk_partial_load_b32x4_serial_(a_lons, &a_lon_vec, n);
         nk_partial_load_b32x4_serial_(b_lats, &b_lat_vec, n);
         nk_partial_load_b32x4_serial_(b_lons, &b_lon_vec, n);
-        v128_t distances = nk_vincenty_f32x4_v128relaxed_(a_lat_vec.v128, a_lon_vec.v128, b_lat_vec.v128,
-                                                          b_lon_vec.v128);
-        result_vec.v128 = distances;
+        v128_t distances_f32x4 = nk_vincenty_f32x4_v128relaxed_(a_lat_vec.v128, a_lon_vec.v128, b_lat_vec.v128,
+                                                                b_lon_vec.v128);
+        result_vec.v128 = distances_f32x4;
         nk_partial_store_b32x4_serial_(&result_vec, results, n);
     }
 }