numkong 7.0.0 → 7.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315)
  1. package/README.md +239 -122
  2. package/binding.gyp +25 -491
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -46,46 +46,50 @@ extern "C" {
46
46
  * These require WASM trigonometric kernels from trigonometry/v128relaxed.h.
47
47
  */
48
48
 
49
- NK_INTERNAL v128_t nk_haversine_f64x2_v128relaxed_( //
50
- v128_t first_latitudes, v128_t first_longitudes, //
51
- v128_t second_latitudes, v128_t second_longitudes) {
49
+ NK_INTERNAL v128_t nk_haversine_f64x2_v128relaxed_( //
50
+ v128_t first_latitudes_f64x2, v128_t first_longitudes_f64x2, //
51
+ v128_t second_latitudes_f64x2, v128_t second_longitudes_f64x2) {
52
52
 
53
- v128_t const earth_radius = wasm_f64x2_splat(NK_EARTH_MEDIATORIAL_RADIUS);
54
- v128_t const half = wasm_f64x2_splat(0.5);
55
- v128_t const one = wasm_f64x2_splat(1.0);
56
- v128_t const two = wasm_f64x2_splat(2.0);
53
+ v128_t const earth_radius_f64x2 = wasm_f64x2_splat(NK_EARTH_MEDIATORIAL_RADIUS);
54
+ v128_t const half_f64x2 = wasm_f64x2_splat(0.5);
55
+ v128_t const one_f64x2 = wasm_f64x2_splat(1.0);
56
+ v128_t const two_f64x2 = wasm_f64x2_splat(2.0);
57
57
 
58
- v128_t latitude_delta = wasm_f64x2_sub(second_latitudes, first_latitudes);
59
- v128_t longitude_delta = wasm_f64x2_sub(second_longitudes, first_longitudes);
58
+ v128_t latitude_delta_f64x2 = wasm_f64x2_sub(second_latitudes_f64x2, first_latitudes_f64x2);
59
+ v128_t longitude_delta_f64x2 = wasm_f64x2_sub(second_longitudes_f64x2, first_longitudes_f64x2);
60
60
 
61
61
  // Haversine terms: sin^2(delta/2)
62
- v128_t latitude_delta_half = wasm_f64x2_mul(latitude_delta, half);
63
- v128_t longitude_delta_half = wasm_f64x2_mul(longitude_delta, half);
64
- v128_t sin_latitude_delta_half = nk_f64x2_sin_v128relaxed_(latitude_delta_half);
65
- v128_t sin_longitude_delta_half = nk_f64x2_sin_v128relaxed_(longitude_delta_half);
66
- v128_t sin_squared_latitude_delta_half = wasm_f64x2_mul(sin_latitude_delta_half, sin_latitude_delta_half);
67
- v128_t sin_squared_longitude_delta_half = wasm_f64x2_mul(sin_longitude_delta_half, sin_longitude_delta_half);
62
+ v128_t latitude_delta_half_f64x2 = wasm_f64x2_mul(latitude_delta_f64x2, half_f64x2);
63
+ v128_t longitude_delta_half_f64x2 = wasm_f64x2_mul(longitude_delta_f64x2, half_f64x2);
64
+ v128_t sin_latitude_delta_half_f64x2 = nk_f64x2_sin_v128relaxed_(latitude_delta_half_f64x2);
65
+ v128_t sin_longitude_delta_half_f64x2 = nk_f64x2_sin_v128relaxed_(longitude_delta_half_f64x2);
66
+ v128_t sin_squared_latitude_delta_half_f64x2 = wasm_f64x2_mul(sin_latitude_delta_half_f64x2,
67
+ sin_latitude_delta_half_f64x2);
68
+ v128_t sin_squared_longitude_delta_half_f64x2 = wasm_f64x2_mul(sin_longitude_delta_half_f64x2,
69
+ sin_longitude_delta_half_f64x2);
68
70
 
69
71
  // Latitude cosine product
70
- v128_t cos_first_latitude = nk_f64x2_cos_v128relaxed_(first_latitudes);
71
- v128_t cos_second_latitude = nk_f64x2_cos_v128relaxed_(second_latitudes);
72
- v128_t cos_latitude_product = wasm_f64x2_mul(cos_first_latitude, cos_second_latitude);
72
+ v128_t cos_first_latitude_f64x2 = nk_f64x2_cos_v128relaxed_(first_latitudes_f64x2);
73
+ v128_t cos_second_latitude_f64x2 = nk_f64x2_cos_v128relaxed_(second_latitudes_f64x2);
74
+ v128_t cos_latitude_product_f64x2 = wasm_f64x2_mul(cos_first_latitude_f64x2, cos_second_latitude_f64x2);
73
75
 
74
76
  // a = sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2)
75
- v128_t haversine_term = wasm_f64x2_add(sin_squared_latitude_delta_half,
76
- wasm_f64x2_mul(cos_latitude_product, sin_squared_longitude_delta_half));
77
- // Clamp haversine_term to [0, 1] to prevent NaN from sqrt of negative values
78
- // relaxed_min/max: 1 instruction (minpd/maxpd) vs 6-9 (with NaN/signed-zero fixup) on x86.
79
- // Safe because haversine_term is a product of finite sin/cos values NaN is impossible.
80
- v128_t zero = wasm_f64x2_splat(0.0);
81
- haversine_term = wasm_f64x2_relaxed_max(zero, wasm_f64x2_relaxed_min(one, haversine_term));
77
+ v128_t haversine_term_f64x2 = wasm_f64x2_add(
78
+ sin_squared_latitude_delta_half_f64x2,
79
+ wasm_f64x2_mul(cos_latitude_product_f64x2, sin_squared_longitude_delta_half_f64x2));
80
+ // Clamp haversine_term_f64x2 to [0, 1] to prevent NaN from sqrt of negative values
81
+ // relaxed_min/max: 1 instruction (minpd/maxpd) vs 6-9 (with NaN/signed-zero fixup) on x86.
82
+ // Safe because haversine_term_f64x2 is a product of finite sin/cos values — NaN is impossible.
83
+ v128_t zero_f64x2 = wasm_f64x2_splat(0.0);
84
+ haversine_term_f64x2 = wasm_f64x2_relaxed_max(zero_f64x2, wasm_f64x2_relaxed_min(one_f64x2, haversine_term_f64x2));
82
85
 
83
86
  // Central angle: c = 2 * atan2(sqrt(a), sqrt(1-a))
84
- v128_t sqrt_haversine = wasm_f64x2_sqrt(haversine_term);
85
- v128_t sqrt_complement = wasm_f64x2_sqrt(wasm_f64x2_sub(one, haversine_term));
86
- v128_t central_angle = wasm_f64x2_mul(two, nk_f64x2_atan2_v128relaxed_(sqrt_haversine, sqrt_complement));
87
+ v128_t sqrt_haversine_f64x2 = wasm_f64x2_sqrt(haversine_term_f64x2);
88
+ v128_t sqrt_complement_f64x2 = wasm_f64x2_sqrt(wasm_f64x2_sub(one_f64x2, haversine_term_f64x2));
89
+ v128_t central_angle_f64x2 = wasm_f64x2_mul(
90
+ two_f64x2, nk_f64x2_atan2_v128relaxed_(sqrt_haversine_f64x2, sqrt_complement_f64x2));
87
91
 
88
- return wasm_f64x2_mul(earth_radius, central_angle);
92
+ return wasm_f64x2_mul(earth_radius_f64x2, central_angle_f64x2);
89
93
  }
90
94
 
91
95
  NK_PUBLIC void nk_haversine_f64_v128relaxed( //
@@ -94,14 +98,14 @@ NK_PUBLIC void nk_haversine_f64_v128relaxed( //
94
98
  nk_size_t n, nk_f64_t *results) {
95
99
 
96
100
  while (n >= 2) {
97
- v128_t first_latitudes = wasm_v128_load(a_lats);
98
- v128_t first_longitudes = wasm_v128_load(a_lons);
99
- v128_t second_latitudes = wasm_v128_load(b_lats);
100
- v128_t second_longitudes = wasm_v128_load(b_lons);
101
+ v128_t first_latitudes_f64x2 = wasm_v128_load(a_lats);
102
+ v128_t first_longitudes_f64x2 = wasm_v128_load(a_lons);
103
+ v128_t second_latitudes_f64x2 = wasm_v128_load(b_lats);
104
+ v128_t second_longitudes_f64x2 = wasm_v128_load(b_lons);
101
105
 
102
- v128_t distances = nk_haversine_f64x2_v128relaxed_(first_latitudes, first_longitudes, second_latitudes,
103
- second_longitudes);
104
- wasm_v128_store(results, distances);
106
+ v128_t distances_f64x2 = nk_haversine_f64x2_v128relaxed_(first_latitudes_f64x2, first_longitudes_f64x2,
107
+ second_latitudes_f64x2, second_longitudes_f64x2);
108
+ wasm_v128_store(results, distances_f64x2);
105
109
 
106
110
  a_lats += 2, a_lons += 2, b_lats += 2, b_lons += 2, results += 2, n -= 2;
107
111
  }
@@ -113,54 +117,58 @@ NK_PUBLIC void nk_haversine_f64_v128relaxed( //
113
117
  nk_partial_load_b64x2_serial_(a_lons, &a_lon_vec, n);
114
118
  nk_partial_load_b64x2_serial_(b_lats, &b_lat_vec, n);
115
119
  nk_partial_load_b64x2_serial_(b_lons, &b_lon_vec, n);
116
- v128_t distances = nk_haversine_f64x2_v128relaxed_(a_lat_vec.v128, a_lon_vec.v128, b_lat_vec.v128,
117
- b_lon_vec.v128);
118
- result_vec.v128 = distances;
120
+ v128_t distances_f64x2 = nk_haversine_f64x2_v128relaxed_(a_lat_vec.v128, a_lon_vec.v128, b_lat_vec.v128,
121
+ b_lon_vec.v128);
122
+ result_vec.v128 = distances_f64x2;
119
123
  nk_partial_store_b64x2_serial_(&result_vec, results, n);
120
124
  }
121
125
  }
122
126
 
123
- NK_INTERNAL v128_t nk_haversine_f32x4_v128relaxed_( //
124
- v128_t first_latitudes, v128_t first_longitudes, //
125
- v128_t second_latitudes, v128_t second_longitudes) {
127
+ NK_INTERNAL v128_t nk_haversine_f32x4_v128relaxed_( //
128
+ v128_t first_latitudes_f32x4, v128_t first_longitudes_f32x4, //
129
+ v128_t second_latitudes_f32x4, v128_t second_longitudes_f32x4) {
126
130
 
127
- v128_t const earth_radius = wasm_f32x4_splat((float)NK_EARTH_MEDIATORIAL_RADIUS);
128
- v128_t const half = wasm_f32x4_splat(0.5f);
129
- v128_t const one = wasm_f32x4_splat(1.0f);
130
- v128_t const two = wasm_f32x4_splat(2.0f);
131
+ v128_t const earth_radius_f32x4 = wasm_f32x4_splat((float)NK_EARTH_MEDIATORIAL_RADIUS);
132
+ v128_t const half_f32x4 = wasm_f32x4_splat(0.5f);
133
+ v128_t const one_f32x4 = wasm_f32x4_splat(1.0f);
134
+ v128_t const two_f32x4 = wasm_f32x4_splat(2.0f);
131
135
 
132
- v128_t latitude_delta = wasm_f32x4_sub(second_latitudes, first_latitudes);
133
- v128_t longitude_delta = wasm_f32x4_sub(second_longitudes, first_longitudes);
136
+ v128_t latitude_delta_f32x4 = wasm_f32x4_sub(second_latitudes_f32x4, first_latitudes_f32x4);
137
+ v128_t longitude_delta_f32x4 = wasm_f32x4_sub(second_longitudes_f32x4, first_longitudes_f32x4);
134
138
 
135
139
  // Haversine terms: sin^2(delta/2)
136
- v128_t latitude_delta_half = wasm_f32x4_mul(latitude_delta, half);
137
- v128_t longitude_delta_half = wasm_f32x4_mul(longitude_delta, half);
138
- v128_t sin_latitude_delta_half = nk_f32x4_sin_v128relaxed_(latitude_delta_half);
139
- v128_t sin_longitude_delta_half = nk_f32x4_sin_v128relaxed_(longitude_delta_half);
140
- v128_t sin_squared_latitude_delta_half = wasm_f32x4_mul(sin_latitude_delta_half, sin_latitude_delta_half);
141
- v128_t sin_squared_longitude_delta_half = wasm_f32x4_mul(sin_longitude_delta_half, sin_longitude_delta_half);
140
+ v128_t latitude_delta_half_f32x4 = wasm_f32x4_mul(latitude_delta_f32x4, half_f32x4);
141
+ v128_t longitude_delta_half_f32x4 = wasm_f32x4_mul(longitude_delta_f32x4, half_f32x4);
142
+ v128_t sin_latitude_delta_half_f32x4 = nk_f32x4_sin_v128relaxed_(latitude_delta_half_f32x4);
143
+ v128_t sin_longitude_delta_half_f32x4 = nk_f32x4_sin_v128relaxed_(longitude_delta_half_f32x4);
144
+ v128_t sin_squared_latitude_delta_half_f32x4 = wasm_f32x4_mul(sin_latitude_delta_half_f32x4,
145
+ sin_latitude_delta_half_f32x4);
146
+ v128_t sin_squared_longitude_delta_half_f32x4 = wasm_f32x4_mul(sin_longitude_delta_half_f32x4,
147
+ sin_longitude_delta_half_f32x4);
142
148
 
143
149
  // Latitude cosine product
144
- v128_t cos_first_latitude = nk_f32x4_cos_v128relaxed_(first_latitudes);
145
- v128_t cos_second_latitude = nk_f32x4_cos_v128relaxed_(second_latitudes);
146
- v128_t cos_latitude_product = wasm_f32x4_mul(cos_first_latitude, cos_second_latitude);
150
+ v128_t cos_first_latitude_f32x4 = nk_f32x4_cos_v128relaxed_(first_latitudes_f32x4);
151
+ v128_t cos_second_latitude_f32x4 = nk_f32x4_cos_v128relaxed_(second_latitudes_f32x4);
152
+ v128_t cos_latitude_product_f32x4 = wasm_f32x4_mul(cos_first_latitude_f32x4, cos_second_latitude_f32x4);
147
153
 
148
154
  // a = sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2)
149
- v128_t haversine_term = wasm_f32x4_add(sin_squared_latitude_delta_half,
150
- wasm_f32x4_mul(cos_latitude_product, sin_squared_longitude_delta_half));
155
+ v128_t haversine_term_f32x4 = wasm_f32x4_add(
156
+ sin_squared_latitude_delta_half_f32x4,
157
+ wasm_f32x4_mul(cos_latitude_product_f32x4, sin_squared_longitude_delta_half_f32x4));
151
158
 
152
159
  // Clamp to [0, 1] to avoid NaN from sqrt of negative numbers (due to floating point errors)
153
- // relaxed_min/max: 1 instruction (minps/maxps) vs 6-9 (with NaN/signed-zero fixup) on x86.
154
- // Safe because haversine_term is a product of finite sin/cos values — NaN is impossible.
155
- v128_t zero = wasm_f32x4_splat(0.0f);
156
- haversine_term = wasm_f32x4_relaxed_max(zero, wasm_f32x4_relaxed_min(one, haversine_term));
160
+ // relaxed_min/max: 1 instruction (minps/maxps) vs 6-9 (with NaN/signed-zero fixup) on x86.
161
+ // Safe because haversine_term_f32x4 is a product of finite sin/cos values — NaN is impossible.
162
+ v128_t zero_f32x4 = wasm_f32x4_splat(0.0f);
163
+ haversine_term_f32x4 = wasm_f32x4_relaxed_max(zero_f32x4, wasm_f32x4_relaxed_min(one_f32x4, haversine_term_f32x4));
157
164
 
158
165
  // Central angle: c = 2 * atan2(sqrt(a), sqrt(1-a))
159
- v128_t sqrt_haversine = wasm_f32x4_sqrt(haversine_term);
160
- v128_t sqrt_complement = wasm_f32x4_sqrt(wasm_f32x4_sub(one, haversine_term));
161
- v128_t central_angle = wasm_f32x4_mul(two, nk_f32x4_atan2_v128relaxed_(sqrt_haversine, sqrt_complement));
166
+ v128_t sqrt_haversine_f32x4 = wasm_f32x4_sqrt(haversine_term_f32x4);
167
+ v128_t sqrt_complement_f32x4 = wasm_f32x4_sqrt(wasm_f32x4_sub(one_f32x4, haversine_term_f32x4));
168
+ v128_t central_angle_f32x4 = wasm_f32x4_mul(
169
+ two_f32x4, nk_f32x4_atan2_v128relaxed_(sqrt_haversine_f32x4, sqrt_complement_f32x4));
162
170
 
163
- return wasm_f32x4_mul(earth_radius, central_angle);
171
+ return wasm_f32x4_mul(earth_radius_f32x4, central_angle_f32x4);
164
172
  }
165
173
 
166
174
  NK_PUBLIC void nk_haversine_f32_v128relaxed( //
@@ -169,14 +177,14 @@ NK_PUBLIC void nk_haversine_f32_v128relaxed( //
169
177
  nk_size_t n, nk_f32_t *results) {
170
178
 
171
179
  while (n >= 4) {
172
- v128_t first_latitudes = wasm_v128_load(a_lats);
173
- v128_t first_longitudes = wasm_v128_load(a_lons);
174
- v128_t second_latitudes = wasm_v128_load(b_lats);
175
- v128_t second_longitudes = wasm_v128_load(b_lons);
180
+ v128_t first_latitudes_f32x4 = wasm_v128_load(a_lats);
181
+ v128_t first_longitudes_f32x4 = wasm_v128_load(a_lons);
182
+ v128_t second_latitudes_f32x4 = wasm_v128_load(b_lats);
183
+ v128_t second_longitudes_f32x4 = wasm_v128_load(b_lons);
176
184
 
177
- v128_t distances = nk_haversine_f32x4_v128relaxed_(first_latitudes, first_longitudes, second_latitudes,
178
- second_longitudes);
179
- wasm_v128_store(results, distances);
185
+ v128_t distances_f32x4 = nk_haversine_f32x4_v128relaxed_(first_latitudes_f32x4, first_longitudes_f32x4,
186
+ second_latitudes_f32x4, second_longitudes_f32x4);
187
+ wasm_v128_store(results, distances_f32x4);
180
188
 
181
189
  a_lats += 4, a_lons += 4, b_lats += 4, b_lons += 4, results += 4, n -= 4;
182
190
  }
@@ -188,9 +196,9 @@ NK_PUBLIC void nk_haversine_f32_v128relaxed( //
188
196
  nk_partial_load_b32x4_serial_(a_lons, &a_lon_vec, n);
189
197
  nk_partial_load_b32x4_serial_(b_lats, &b_lat_vec, n);
190
198
  nk_partial_load_b32x4_serial_(b_lons, &b_lon_vec, n);
191
- v128_t distances = nk_haversine_f32x4_v128relaxed_(a_lat_vec.v128, a_lon_vec.v128, b_lat_vec.v128,
192
- b_lon_vec.v128);
193
- result_vec.v128 = distances;
199
+ v128_t distances_f32x4 = nk_haversine_f32x4_v128relaxed_(a_lat_vec.v128, a_lon_vec.v128, b_lat_vec.v128,
200
+ b_lon_vec.v128);
201
+ result_vec.v128 = distances_f32x4;
194
202
  nk_partial_store_b32x4_serial_(&result_vec, results, n);
195
203
  }
196
204
  }
@@ -199,174 +207,189 @@ NK_PUBLIC void nk_haversine_f32_v128relaxed( //
199
207
  * @brief WASM Relaxed SIMD helper for Vincenty's geodesic distance on 2 f64 point pairs.
200
208
  * @note This is a true SIMD implementation using masked convergence tracking via blending.
201
209
  */
202
- NK_INTERNAL v128_t nk_vincenty_f64x2_v128relaxed_( //
203
- v128_t first_latitudes, v128_t first_longitudes, //
204
- v128_t second_latitudes, v128_t second_longitudes) {
205
-
206
- v128_t const equatorial_radius = wasm_f64x2_splat(NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
207
- v128_t const polar_radius = wasm_f64x2_splat(NK_EARTH_ELLIPSOID_POLAR_RADIUS);
208
- v128_t const flattening = wasm_f64x2_splat(1.0 / NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
209
- v128_t const convergence_threshold = wasm_f64x2_splat(NK_VINCENTY_CONVERGENCE_THRESHOLD_F64);
210
- v128_t const one = wasm_f64x2_splat(1.0);
211
- v128_t const two = wasm_f64x2_splat(2.0);
212
- v128_t const three = wasm_f64x2_splat(3.0);
213
- v128_t const four = wasm_f64x2_splat(4.0);
214
- v128_t const six = wasm_f64x2_splat(6.0);
215
- v128_t const sixteen = wasm_f64x2_splat(16.0);
216
- v128_t const epsilon = wasm_f64x2_splat(1e-15);
210
+ NK_INTERNAL v128_t nk_vincenty_f64x2_v128relaxed_( //
211
+ v128_t first_latitudes_f64x2, v128_t first_longitudes_f64x2, //
212
+ v128_t second_latitudes_f64x2, v128_t second_longitudes_f64x2) {
213
+
214
+ v128_t const equatorial_radius_f64x2 = wasm_f64x2_splat(NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
215
+ v128_t const polar_radius_f64x2 = wasm_f64x2_splat(NK_EARTH_ELLIPSOID_POLAR_RADIUS);
216
+ v128_t const flattening_f64x2 = wasm_f64x2_splat(1.0 / NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
217
+ v128_t const convergence_threshold_f64x2 = wasm_f64x2_splat(NK_VINCENTY_CONVERGENCE_THRESHOLD_F64);
218
+ v128_t const one_f64x2 = wasm_f64x2_splat(1.0);
219
+ v128_t const two_f64x2 = wasm_f64x2_splat(2.0);
220
+ v128_t const three_f64x2 = wasm_f64x2_splat(3.0);
221
+ v128_t const four_f64x2 = wasm_f64x2_splat(4.0);
222
+ v128_t const six_f64x2 = wasm_f64x2_splat(6.0);
223
+ v128_t const sixteen_f64x2 = wasm_f64x2_splat(16.0);
224
+ v128_t const epsilon_f64x2 = wasm_f64x2_splat(1e-15);
217
225
 
218
226
  // Longitude difference
219
- v128_t longitude_difference = wasm_f64x2_sub(second_longitudes, first_longitudes);
227
+ v128_t longitude_difference_f64x2 = wasm_f64x2_sub(second_longitudes_f64x2, first_longitudes_f64x2);
220
228
 
221
229
  // Reduced latitudes: tan(U) = (1-f) * tan(lat)
222
- v128_t one_minus_f = wasm_f64x2_sub(one, flattening);
223
- v128_t tan_first = wasm_f64x2_div(nk_f64x2_sin_v128relaxed_(first_latitudes),
224
- nk_f64x2_cos_v128relaxed_(first_latitudes));
225
- v128_t tan_second = wasm_f64x2_div(nk_f64x2_sin_v128relaxed_(second_latitudes),
226
- nk_f64x2_cos_v128relaxed_(second_latitudes));
227
- v128_t tan_reduced_first = wasm_f64x2_mul(one_minus_f, tan_first);
228
- v128_t tan_reduced_second = wasm_f64x2_mul(one_minus_f, tan_second);
230
+ v128_t one_minus_f_f64x2 = wasm_f64x2_sub(one_f64x2, flattening_f64x2);
231
+ v128_t tan_first_f64x2 = wasm_f64x2_div(nk_f64x2_sin_v128relaxed_(first_latitudes_f64x2),
232
+ nk_f64x2_cos_v128relaxed_(first_latitudes_f64x2));
233
+ v128_t tan_second_f64x2 = wasm_f64x2_div(nk_f64x2_sin_v128relaxed_(second_latitudes_f64x2),
234
+ nk_f64x2_cos_v128relaxed_(second_latitudes_f64x2));
235
+ v128_t tan_reduced_first_f64x2 = wasm_f64x2_mul(one_minus_f_f64x2, tan_first_f64x2);
236
+ v128_t tan_reduced_second_f64x2 = wasm_f64x2_mul(one_minus_f_f64x2, tan_second_f64x2);
229
237
 
230
238
  // cos(U) = 1/sqrt(1 + tan^2(U)), sin(U) = tan(U) * cos(U)
231
- v128_t cos_reduced_first = wasm_f64x2_div(
232
- one, wasm_f64x2_sqrt(wasm_f64x2_relaxed_madd(tan_reduced_first, tan_reduced_first, one)));
233
- v128_t sin_reduced_first = wasm_f64x2_mul(tan_reduced_first, cos_reduced_first);
234
- v128_t cos_reduced_second = wasm_f64x2_div(
235
- one, wasm_f64x2_sqrt(wasm_f64x2_relaxed_madd(tan_reduced_second, tan_reduced_second, one)));
236
- v128_t sin_reduced_second = wasm_f64x2_mul(tan_reduced_second, cos_reduced_second);
237
-
238
- // Initialize lambda and tracking variables
239
- v128_t lambda = longitude_difference;
240
- v128_t sin_angular_distance, cos_angular_distance, angular_distance;
241
- v128_t sin_azimuth, cos_squared_azimuth, cos_double_angular_midpoint;
239
+ v128_t cos_reduced_first_f64x2 = wasm_f64x2_div(
240
+ one_f64x2,
241
+ wasm_f64x2_sqrt(wasm_f64x2_relaxed_madd(tan_reduced_first_f64x2, tan_reduced_first_f64x2, one_f64x2)));
242
+ v128_t sin_reduced_first_f64x2 = wasm_f64x2_mul(tan_reduced_first_f64x2, cos_reduced_first_f64x2);
243
+ v128_t cos_reduced_second_f64x2 = wasm_f64x2_div(
244
+ one_f64x2,
245
+ wasm_f64x2_sqrt(wasm_f64x2_relaxed_madd(tan_reduced_second_f64x2, tan_reduced_second_f64x2, one_f64x2)));
246
+ v128_t sin_reduced_second_f64x2 = wasm_f64x2_mul(tan_reduced_second_f64x2, cos_reduced_second_f64x2);
247
+
248
+ // Initialize lambda_f64x2 and tracking variables
249
+ v128_t lambda_f64x2 = longitude_difference_f64x2;
250
+ v128_t sin_angular_distance_f64x2, cos_angular_distance_f64x2, angular_distance_f64x2;
251
+ v128_t sin_azimuth_f64x2, cos_squared_azimuth_f64x2, cos_double_angular_midpoint_f64x2;
242
252
 
243
253
  // Track convergence and coincident points using masks
244
- v128_t converged_mask = wasm_i64x2_splat(0);
245
- v128_t coincident_mask = wasm_i64x2_splat(0);
254
+ v128_t converged_mask_i64x2 = wasm_i64x2_splat(0);
255
+ v128_t coincident_mask_i64x2 = wasm_i64x2_splat(0);
246
256
 
247
257
  for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS; ++iteration) {
248
258
  // Check if all lanes converged
249
- if (wasm_i8x16_all_true(converged_mask)) break;
259
+ if (wasm_i8x16_all_true(converged_mask_i64x2)) break;
250
260
 
251
- v128_t sin_lambda = nk_f64x2_sin_v128relaxed_(lambda);
252
- v128_t cos_lambda = nk_f64x2_cos_v128relaxed_(lambda);
261
+ v128_t sin_lambda_f64x2 = nk_f64x2_sin_v128relaxed_(lambda_f64x2);
262
+ v128_t cos_lambda_f64x2 = nk_f64x2_cos_v128relaxed_(lambda_f64x2);
253
263
 
254
- // sin^2(angular_distance) = (cos(U2) * sin(l))^2 + (cos(U1) * sin(U2) - sin(U1) * cos(U2) * cos(l))^2
255
- v128_t cross_term = wasm_f64x2_mul(cos_reduced_second, sin_lambda);
256
- v128_t mixed_term = wasm_f64x2_sub(
257
- wasm_f64x2_mul(cos_reduced_first, sin_reduced_second),
258
- wasm_f64x2_mul(wasm_f64x2_mul(sin_reduced_first, cos_reduced_second), cos_lambda));
259
- v128_t sin_angular_dist_sq = wasm_f64x2_relaxed_madd(cross_term, cross_term,
260
- wasm_f64x2_mul(mixed_term, mixed_term));
261
- sin_angular_distance = wasm_f64x2_sqrt(sin_angular_dist_sq);
264
+ // sin^2(angular_distance_f64x2) = (cos(U2) * sin(l))^2 + (cos(U1) * sin(U2) - sin(U1) * cos(U2) * cos(l))^2
265
+ v128_t cross_term_f64x2 = wasm_f64x2_mul(cos_reduced_second_f64x2, sin_lambda_f64x2);
266
+ v128_t mixed_term_f64x2 = wasm_f64x2_sub(
267
+ wasm_f64x2_mul(cos_reduced_first_f64x2, sin_reduced_second_f64x2),
268
+ wasm_f64x2_mul(wasm_f64x2_mul(sin_reduced_first_f64x2, cos_reduced_second_f64x2), cos_lambda_f64x2));
269
+ v128_t sin_angular_dist_sq_f64x2 = wasm_f64x2_relaxed_madd(cross_term_f64x2, cross_term_f64x2,
270
+ wasm_f64x2_mul(mixed_term_f64x2, mixed_term_f64x2));
271
+ sin_angular_distance_f64x2 = wasm_f64x2_sqrt(sin_angular_dist_sq_f64x2);
262
272
 
263
- // Check for coincident points (sin_angular_distance ~ 0)
264
- coincident_mask = wasm_f64x2_lt(sin_angular_distance, epsilon);
273
+ // Check for coincident points (sin_angular_distance_f64x2 ~ 0)
274
+ coincident_mask_i64x2 = wasm_f64x2_lt(sin_angular_distance_f64x2, epsilon_f64x2);
265
275
 
266
- // cos(angular_distance) = sin(U1) * sin(U2) + cos(U1) * cos(U2) * cos(l)
267
- cos_angular_distance = wasm_f64x2_relaxed_madd(wasm_f64x2_mul(cos_reduced_first, cos_reduced_second),
268
- cos_lambda,
269
- wasm_f64x2_mul(sin_reduced_first, sin_reduced_second));
276
+ // cos(angular_distance_f64x2) = sin(U1) * sin(U2) + cos(U1) * cos(U2) * cos(l)
277
+ cos_angular_distance_f64x2 = wasm_f64x2_relaxed_madd(
278
+ wasm_f64x2_mul(cos_reduced_first_f64x2, cos_reduced_second_f64x2), cos_lambda_f64x2,
279
+ wasm_f64x2_mul(sin_reduced_first_f64x2, sin_reduced_second_f64x2));
270
280
 
271
- // angular_distance = atan2(sin, cos)
272
- angular_distance = nk_f64x2_atan2_v128relaxed_(sin_angular_distance, cos_angular_distance);
281
+ // angular_distance_f64x2 = atan2(sin, cos)
282
+ angular_distance_f64x2 = nk_f64x2_atan2_v128relaxed_(sin_angular_distance_f64x2, cos_angular_distance_f64x2);
273
283
 
274
- // sin(azimuth) = cos(U1) * cos(U2) * sin(l) / sin(angular_distance)
284
+ // sin(azimuth) = cos(U1) * cos(U2) * sin(l) / sin(angular_distance_f64x2)
275
285
  // Avoid division by zero by using blending
276
286
  // relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
277
287
  // Safe because mask is from comparison (all-ones or all-zeros per lane).
278
- v128_t safe_sin_angular = wasm_i64x2_relaxed_laneselect(one, sin_angular_distance, coincident_mask);
279
- sin_azimuth = wasm_f64x2_div(wasm_f64x2_mul(wasm_f64x2_mul(cos_reduced_first, cos_reduced_second), sin_lambda),
280
- safe_sin_angular);
281
- cos_squared_azimuth = wasm_f64x2_relaxed_nmadd(sin_azimuth, sin_azimuth, one);
288
+ v128_t safe_sin_angular_i64x2 = wasm_i64x2_relaxed_laneselect(one_f64x2, sin_angular_distance_f64x2,
289
+ coincident_mask_i64x2);
290
+ sin_azimuth_f64x2 = wasm_f64x2_div(
291
+ wasm_f64x2_mul(wasm_f64x2_mul(cos_reduced_first_f64x2, cos_reduced_second_f64x2), sin_lambda_f64x2),
292
+ safe_sin_angular_i64x2);
293
+ cos_squared_azimuth_f64x2 = wasm_f64x2_relaxed_nmadd(sin_azimuth_f64x2, sin_azimuth_f64x2, one_f64x2);
282
294
 
283
295
  // Handle equatorial case: cos^2(a) ~ 0
284
- v128_t equatorial_mask = wasm_f64x2_lt(cos_squared_azimuth, epsilon);
285
- v128_t safe_cos_sq_azimuth = wasm_i64x2_relaxed_laneselect(one, cos_squared_azimuth, equatorial_mask);
296
+ v128_t equatorial_mask_f64x2 = wasm_f64x2_lt(cos_squared_azimuth_f64x2, epsilon_f64x2);
297
+ v128_t safe_cos_sq_azimuth_i64x2 = wasm_i64x2_relaxed_laneselect(one_f64x2, cos_squared_azimuth_f64x2,
298
+ equatorial_mask_f64x2);
286
299
 
287
300
  // cos(2sm) = cos(s) - 2 * sin(U1) * sin(U2) / cos^2(a)
288
- v128_t sin_product = wasm_f64x2_mul(sin_reduced_first, sin_reduced_second);
289
- cos_double_angular_midpoint = wasm_f64x2_sub(
290
- cos_angular_distance, wasm_f64x2_div(wasm_f64x2_mul(two, sin_product), safe_cos_sq_azimuth));
291
- cos_double_angular_midpoint = wasm_i64x2_relaxed_laneselect(wasm_f64x2_splat(0.0), cos_double_angular_midpoint,
292
- equatorial_mask);
301
+ v128_t sin_product_f64x2 = wasm_f64x2_mul(sin_reduced_first_f64x2, sin_reduced_second_f64x2);
302
+ cos_double_angular_midpoint_f64x2 = wasm_f64x2_sub(
303
+ cos_angular_distance_f64x2,
304
+ wasm_f64x2_div(wasm_f64x2_mul(two_f64x2, sin_product_f64x2), safe_cos_sq_azimuth_i64x2));
305
+ cos_double_angular_midpoint_f64x2 = wasm_i64x2_relaxed_laneselect(
306
+ wasm_f64x2_splat(0.0), cos_double_angular_midpoint_f64x2, equatorial_mask_f64x2);
293
307
 
294
308
  // C = f/16 * cos^2(a) * (4 + f*(4 - 3*cos^2(a)))
295
- v128_t correction_factor = wasm_f64x2_mul(
296
- wasm_f64x2_div(flattening, sixteen),
309
+ v128_t correction_factor_f64x2 = wasm_f64x2_mul(
310
+ wasm_f64x2_div(flattening_f64x2, sixteen_f64x2),
297
311
  wasm_f64x2_mul(
298
- cos_squared_azimuth,
299
- wasm_f64x2_relaxed_madd(flattening, wasm_f64x2_relaxed_nmadd(three, cos_squared_azimuth, four), four)));
312
+ cos_squared_azimuth_f64x2,
313
+ wasm_f64x2_relaxed_madd(flattening_f64x2,
314
+ wasm_f64x2_relaxed_nmadd(three_f64x2, cos_squared_azimuth_f64x2, four_f64x2),
315
+ four_f64x2)));
300
316
 
301
317
  // l' = L + (1-C) * f * sin(a) * (s + C * sin(s) * (cos(2sm) + C * cos(s) * (-1 + 2 * cos^2(2sm))))
302
- v128_t cos_2sm_sq = wasm_f64x2_mul(cos_double_angular_midpoint, cos_double_angular_midpoint);
303
- // innermost = -1 + 2 * cos^2(2sm)
304
- v128_t innermost = wasm_f64x2_relaxed_madd(two, cos_2sm_sq, wasm_f64x2_splat(-1.0));
305
- // middle = cos(2sm) + C * cos(s) * innermost
306
- v128_t middle = wasm_f64x2_relaxed_madd(wasm_f64x2_mul(correction_factor, cos_angular_distance), innermost,
307
- cos_double_angular_midpoint);
308
- // inner = C * sin(s) * middle
309
- v128_t inner = wasm_f64x2_mul(wasm_f64x2_mul(correction_factor, sin_angular_distance), middle);
310
-
311
- // l' = L + (1-C) * f * sin_a * (s + inner)
312
- v128_t lambda_new = wasm_f64x2_relaxed_madd(
313
- wasm_f64x2_mul(wasm_f64x2_mul(wasm_f64x2_sub(one, correction_factor), flattening), sin_azimuth),
314
- wasm_f64x2_add(angular_distance, inner), longitude_difference);
318
+ v128_t cos_2sm_sq_f64x2 = wasm_f64x2_mul(cos_double_angular_midpoint_f64x2, cos_double_angular_midpoint_f64x2);
319
+ // innermost_f64x2 = -1 + 2 * cos^2(2sm)
320
+ v128_t innermost_f64x2 = wasm_f64x2_relaxed_madd(two_f64x2, cos_2sm_sq_f64x2, wasm_f64x2_splat(-1.0));
321
+ // middle_f64x2 = cos(2sm) + C * cos(s) * innermost_f64x2
322
+ v128_t middle_f64x2 = wasm_f64x2_relaxed_madd(
323
+ wasm_f64x2_mul(correction_factor_f64x2, cos_angular_distance_f64x2), innermost_f64x2,
324
+ cos_double_angular_midpoint_f64x2);
325
+ // inner_f64x2 = C * sin(s) * middle_f64x2
326
+ v128_t inner_f64x2 = wasm_f64x2_mul(wasm_f64x2_mul(correction_factor_f64x2, sin_angular_distance_f64x2),
327
+ middle_f64x2);
328
+
329
+ // l' = L + (1-C) * f * sin_a * (s + inner_f64x2)
330
+ v128_t lambda_new_f64x2 = wasm_f64x2_relaxed_madd(
331
+ wasm_f64x2_mul(wasm_f64x2_mul(wasm_f64x2_sub(one_f64x2, correction_factor_f64x2), flattening_f64x2),
332
+ sin_azimuth_f64x2),
333
+ wasm_f64x2_add(angular_distance_f64x2, inner_f64x2), longitude_difference_f64x2);
315
334
 
316
335
  // Check convergence: |l - l'| < threshold
317
- v128_t lambda_diff = wasm_f64x2_sub(lambda_new, lambda);
318
- v128_t lambda_diff_abs = wasm_f64x2_abs(lambda_diff);
319
- v128_t newly_converged = wasm_f64x2_lt(lambda_diff_abs, convergence_threshold);
320
- converged_mask = wasm_v128_or(converged_mask, newly_converged);
336
+ v128_t lambda_diff_f64x2 = wasm_f64x2_sub(lambda_new_f64x2, lambda_f64x2);
337
+ v128_t lambda_diff_abs_f64x2 = wasm_f64x2_abs(lambda_diff_f64x2);
338
+ v128_t newly_converged_f64x2 = wasm_f64x2_lt(lambda_diff_abs_f64x2, convergence_threshold_f64x2);
339
+ converged_mask_i64x2 = wasm_v128_or(converged_mask_i64x2, newly_converged_f64x2);
321
340
 
322
- // Only update lambda for non-converged lanes
341
+ // Only update lambda_f64x2 for non-converged lanes
323
342
  // relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
324
343
  // Safe because mask is from comparison (all-ones or all-zeros per lane).
325
- lambda = wasm_i64x2_relaxed_laneselect(lambda, lambda_new, converged_mask);
344
+ lambda_f64x2 = wasm_i64x2_relaxed_laneselect(lambda_f64x2, lambda_new_f64x2, converged_mask_i64x2);
326
345
  }
327
346
 
328
347
  // Final distance calculation
329
348
  // u^2 = cos^2(a) * (a^2 - b^2) / b^2
330
- v128_t a_sq = wasm_f64x2_mul(equatorial_radius, equatorial_radius);
331
- v128_t b_sq = wasm_f64x2_mul(polar_radius, polar_radius);
332
- v128_t u_squared = wasm_f64x2_div(wasm_f64x2_mul(cos_squared_azimuth, wasm_f64x2_sub(a_sq, b_sq)), b_sq);
349
+ v128_t a_sq_f64x2 = wasm_f64x2_mul(equatorial_radius_f64x2, equatorial_radius_f64x2);
350
+ v128_t b_sq_f64x2 = wasm_f64x2_mul(polar_radius_f64x2, polar_radius_f64x2);
351
+ v128_t u_squared_f64x2 = wasm_f64x2_div(
352
+ wasm_f64x2_mul(cos_squared_azimuth_f64x2, wasm_f64x2_sub(a_sq_f64x2, b_sq_f64x2)), b_sq_f64x2);
333
353
 
334
354
  // A = 1 + u^2/16384 * (4096 + u^2*(-768 + u^2*(320 - 175*u^2)))
335
- v128_t series_a = wasm_f64x2_relaxed_madd(u_squared, wasm_f64x2_splat(-175.0), wasm_f64x2_splat(320.0));
336
- series_a = wasm_f64x2_relaxed_madd(u_squared, series_a, wasm_f64x2_splat(-768.0));
337
- series_a = wasm_f64x2_relaxed_madd(u_squared, series_a, wasm_f64x2_splat(4096.0));
338
- series_a = wasm_f64x2_relaxed_madd(wasm_f64x2_div(u_squared, wasm_f64x2_splat(16384.0)), series_a, one);
355
+ v128_t series_a_f64x2 = wasm_f64x2_relaxed_madd(u_squared_f64x2, wasm_f64x2_splat(-175.0), wasm_f64x2_splat(320.0));
356
+ series_a_f64x2 = wasm_f64x2_relaxed_madd(u_squared_f64x2, series_a_f64x2, wasm_f64x2_splat(-768.0));
357
+ series_a_f64x2 = wasm_f64x2_relaxed_madd(u_squared_f64x2, series_a_f64x2, wasm_f64x2_splat(4096.0));
358
+ series_a_f64x2 = wasm_f64x2_relaxed_madd(wasm_f64x2_div(u_squared_f64x2, wasm_f64x2_splat(16384.0)), series_a_f64x2,
359
+ one_f64x2);
339
360
 
340
361
  // B = u^2/1024 * (256 + u^2*(-128 + u^2*(74 - 47*u^2)))
341
- v128_t series_b = wasm_f64x2_relaxed_madd(u_squared, wasm_f64x2_splat(-47.0), wasm_f64x2_splat(74.0));
342
- series_b = wasm_f64x2_relaxed_madd(u_squared, series_b, wasm_f64x2_splat(-128.0));
343
- series_b = wasm_f64x2_relaxed_madd(u_squared, series_b, wasm_f64x2_splat(256.0));
344
- series_b = wasm_f64x2_mul(wasm_f64x2_div(u_squared, wasm_f64x2_splat(1024.0)), series_b);
362
+ v128_t series_b_f64x2 = wasm_f64x2_relaxed_madd(u_squared_f64x2, wasm_f64x2_splat(-47.0), wasm_f64x2_splat(74.0));
363
+ series_b_f64x2 = wasm_f64x2_relaxed_madd(u_squared_f64x2, series_b_f64x2, wasm_f64x2_splat(-128.0));
364
+ series_b_f64x2 = wasm_f64x2_relaxed_madd(u_squared_f64x2, series_b_f64x2, wasm_f64x2_splat(256.0));
365
+ series_b_f64x2 = wasm_f64x2_mul(wasm_f64x2_div(u_squared_f64x2, wasm_f64x2_splat(1024.0)), series_b_f64x2);
345
366
 
346
367
  // Delta-sigma calculation
347
- v128_t cos_2sm_sq = wasm_f64x2_mul(cos_double_angular_midpoint, cos_double_angular_midpoint);
348
- v128_t sin_sq = wasm_f64x2_mul(sin_angular_distance, sin_angular_distance);
349
- v128_t term1 = wasm_f64x2_relaxed_madd(two, cos_2sm_sq, wasm_f64x2_splat(-1.0));
350
- term1 = wasm_f64x2_mul(cos_angular_distance, term1);
351
- v128_t term2 = wasm_f64x2_relaxed_madd(four, sin_sq, wasm_f64x2_splat(-3.0));
352
- v128_t term3 = wasm_f64x2_relaxed_madd(four, cos_2sm_sq, wasm_f64x2_splat(-3.0));
353
- term2 = wasm_f64x2_mul(wasm_f64x2_mul(wasm_f64x2_div(series_b, six), cos_double_angular_midpoint),
354
- wasm_f64x2_mul(term2, term3));
355
- v128_t delta_sigma = wasm_f64x2_mul(
356
- series_b, wasm_f64x2_mul(sin_angular_distance, wasm_f64x2_add(cos_double_angular_midpoint,
357
- wasm_f64x2_mul(wasm_f64x2_div(series_b, four),
358
- wasm_f64x2_sub(term1, term2)))));
368
+ v128_t cos_2sm_sq_f64x2 = wasm_f64x2_mul(cos_double_angular_midpoint_f64x2, cos_double_angular_midpoint_f64x2);
369
+ v128_t sin_sq_f64x2 = wasm_f64x2_mul(sin_angular_distance_f64x2, sin_angular_distance_f64x2);
370
+ v128_t term1_f64x2 = wasm_f64x2_relaxed_madd(two_f64x2, cos_2sm_sq_f64x2, wasm_f64x2_splat(-1.0));
371
+ term1_f64x2 = wasm_f64x2_mul(cos_angular_distance_f64x2, term1_f64x2);
372
+ v128_t term2_f64x2 = wasm_f64x2_relaxed_madd(four_f64x2, sin_sq_f64x2, wasm_f64x2_splat(-3.0));
373
+ v128_t term3_f64x2 = wasm_f64x2_relaxed_madd(four_f64x2, cos_2sm_sq_f64x2, wasm_f64x2_splat(-3.0));
374
+ term2_f64x2 = wasm_f64x2_mul(
375
+ wasm_f64x2_mul(wasm_f64x2_div(series_b_f64x2, six_f64x2), cos_double_angular_midpoint_f64x2),
376
+ wasm_f64x2_mul(term2_f64x2, term3_f64x2));
377
+ v128_t delta_sigma_f64x2 = wasm_f64x2_mul(
378
+ series_b_f64x2, wasm_f64x2_mul(sin_angular_distance_f64x2,
379
+ wasm_f64x2_add(cos_double_angular_midpoint_f64x2,
380
+ wasm_f64x2_mul(wasm_f64x2_div(series_b_f64x2, four_f64x2),
381
+ wasm_f64x2_sub(term1_f64x2, term2_f64x2)))));
359
382
 
360
383
  // s = b * A * (s - ds)
361
- v128_t distances = wasm_f64x2_mul(wasm_f64x2_mul(polar_radius, series_a),
362
- wasm_f64x2_sub(angular_distance, delta_sigma));
384
+ v128_t distances_f64x2 = wasm_f64x2_mul(wasm_f64x2_mul(polar_radius_f64x2, series_a_f64x2),
385
+ wasm_f64x2_sub(angular_distance_f64x2, delta_sigma_f64x2));
363
386
 
364
387
  // Set coincident points to zero
365
388
  // relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
366
389
  // Safe because mask is from comparison (all-ones or all-zeros per lane).
367
- distances = wasm_i64x2_relaxed_laneselect(wasm_f64x2_splat(0.0), distances, coincident_mask);
390
+ distances_f64x2 = wasm_i64x2_relaxed_laneselect(wasm_f64x2_splat(0.0), distances_f64x2, coincident_mask_i64x2);
368
391
 
369
- return distances;
392
+ return distances_f64x2;
370
393
  }
371
394
 
372
395
  NK_PUBLIC void nk_vincenty_f64_v128relaxed( //
@@ -375,14 +398,14 @@ NK_PUBLIC void nk_vincenty_f64_v128relaxed( //
375
398
  nk_size_t n, nk_f64_t *results) {
376
399
 
377
400
  while (n >= 2) {
378
- v128_t first_latitudes = wasm_v128_load(a_lats);
379
- v128_t first_longitudes = wasm_v128_load(a_lons);
380
- v128_t second_latitudes = wasm_v128_load(b_lats);
381
- v128_t second_longitudes = wasm_v128_load(b_lons);
401
+ v128_t first_latitudes_f64x2 = wasm_v128_load(a_lats);
402
+ v128_t first_longitudes_f64x2 = wasm_v128_load(a_lons);
403
+ v128_t second_latitudes_f64x2 = wasm_v128_load(b_lats);
404
+ v128_t second_longitudes_f64x2 = wasm_v128_load(b_lons);
382
405
 
383
- v128_t distances = nk_vincenty_f64x2_v128relaxed_(first_latitudes, first_longitudes, second_latitudes,
384
- second_longitudes);
385
- wasm_v128_store(results, distances);
406
+ v128_t distances_f64x2 = nk_vincenty_f64x2_v128relaxed_(first_latitudes_f64x2, first_longitudes_f64x2,
407
+ second_latitudes_f64x2, second_longitudes_f64x2);
408
+ wasm_v128_store(results, distances_f64x2);
386
409
 
387
410
  a_lats += 2, a_lons += 2, b_lats += 2, b_lons += 2, results += 2, n -= 2;
388
411
  }
@@ -394,9 +417,9 @@ NK_PUBLIC void nk_vincenty_f64_v128relaxed( //
394
417
  nk_partial_load_b64x2_serial_(a_lons, &a_lon_vec, n);
395
418
  nk_partial_load_b64x2_serial_(b_lats, &b_lat_vec, n);
396
419
  nk_partial_load_b64x2_serial_(b_lons, &b_lon_vec, n);
397
- v128_t distances = nk_vincenty_f64x2_v128relaxed_(a_lat_vec.v128, a_lon_vec.v128, b_lat_vec.v128,
398
- b_lon_vec.v128);
399
- result_vec.v128 = distances;
420
+ v128_t distances_f64x2 = nk_vincenty_f64x2_v128relaxed_(a_lat_vec.v128, a_lon_vec.v128, b_lat_vec.v128,
421
+ b_lon_vec.v128);
422
+ result_vec.v128 = distances_f64x2;
400
423
  nk_partial_store_b64x2_serial_(&result_vec, results, n);
401
424
  }
402
425
  }
@@ -405,168 +428,184 @@ NK_PUBLIC void nk_vincenty_f64_v128relaxed( //
405
428
  * @brief WASM Relaxed SIMD helper for Vincenty's geodesic distance on 4 f32 point pairs.
406
429
  * @note This is a true SIMD implementation using masked convergence tracking via blending.
407
430
  */
408
- NK_INTERNAL v128_t nk_vincenty_f32x4_v128relaxed_( //
409
- v128_t first_latitudes, v128_t first_longitudes, //
410
- v128_t second_latitudes, v128_t second_longitudes) {
411
-
412
- v128_t const equatorial_radius = wasm_f32x4_splat((float)NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
413
- v128_t const polar_radius = wasm_f32x4_splat((float)NK_EARTH_ELLIPSOID_POLAR_RADIUS);
414
- v128_t const flattening = wasm_f32x4_splat(1.0f / (float)NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
415
- v128_t const convergence_threshold = wasm_f32x4_splat(NK_VINCENTY_CONVERGENCE_THRESHOLD_F32);
416
- v128_t const one = wasm_f32x4_splat(1.0f);
417
- v128_t const two = wasm_f32x4_splat(2.0f);
418
- v128_t const three = wasm_f32x4_splat(3.0f);
419
- v128_t const four = wasm_f32x4_splat(4.0f);
420
- v128_t const six = wasm_f32x4_splat(6.0f);
421
- v128_t const sixteen = wasm_f32x4_splat(16.0f);
422
- v128_t const epsilon = wasm_f32x4_splat(1e-7f);
431
+ NK_INTERNAL v128_t nk_vincenty_f32x4_v128relaxed_( //
432
+ v128_t first_latitudes_f32x4, v128_t first_longitudes_f32x4, //
433
+ v128_t second_latitudes_f32x4, v128_t second_longitudes_f32x4) {
434
+
435
+ v128_t const equatorial_radius_f32x4 = wasm_f32x4_splat((float)NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
436
+ v128_t const polar_radius_f32x4 = wasm_f32x4_splat((float)NK_EARTH_ELLIPSOID_POLAR_RADIUS);
437
+ v128_t const flattening_f32x4 = wasm_f32x4_splat(1.0f / (float)NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
438
+ v128_t const convergence_threshold_f32x4 = wasm_f32x4_splat(NK_VINCENTY_CONVERGENCE_THRESHOLD_F32);
439
+ v128_t const one_f32x4 = wasm_f32x4_splat(1.0f);
440
+ v128_t const two_f32x4 = wasm_f32x4_splat(2.0f);
441
+ v128_t const three_f32x4 = wasm_f32x4_splat(3.0f);
442
+ v128_t const four_f32x4 = wasm_f32x4_splat(4.0f);
443
+ v128_t const six_f32x4 = wasm_f32x4_splat(6.0f);
444
+ v128_t const sixteen_f32x4 = wasm_f32x4_splat(16.0f);
445
+ v128_t const epsilon_f32x4 = wasm_f32x4_splat(1e-7f);
423
446
 
424
447
  // Longitude difference
425
- v128_t longitude_difference = wasm_f32x4_sub(second_longitudes, first_longitudes);
448
+ v128_t longitude_difference_f32x4 = wasm_f32x4_sub(second_longitudes_f32x4, first_longitudes_f32x4);
426
449
 
427
450
  // Reduced latitudes: tan(U) = (1-f) * tan(lat)
428
- v128_t one_minus_f = wasm_f32x4_sub(one, flattening);
429
- v128_t tan_first = wasm_f32x4_div(nk_f32x4_sin_v128relaxed_(first_latitudes),
430
- nk_f32x4_cos_v128relaxed_(first_latitudes));
431
- v128_t tan_second = wasm_f32x4_div(nk_f32x4_sin_v128relaxed_(second_latitudes),
432
- nk_f32x4_cos_v128relaxed_(second_latitudes));
433
- v128_t tan_reduced_first = wasm_f32x4_mul(one_minus_f, tan_first);
434
- v128_t tan_reduced_second = wasm_f32x4_mul(one_minus_f, tan_second);
451
+ v128_t one_minus_f_f32x4 = wasm_f32x4_sub(one_f32x4, flattening_f32x4);
452
+ v128_t tan_first_f32x4 = wasm_f32x4_div(nk_f32x4_sin_v128relaxed_(first_latitudes_f32x4),
453
+ nk_f32x4_cos_v128relaxed_(first_latitudes_f32x4));
454
+ v128_t tan_second_f32x4 = wasm_f32x4_div(nk_f32x4_sin_v128relaxed_(second_latitudes_f32x4),
455
+ nk_f32x4_cos_v128relaxed_(second_latitudes_f32x4));
456
+ v128_t tan_reduced_first_f32x4 = wasm_f32x4_mul(one_minus_f_f32x4, tan_first_f32x4);
457
+ v128_t tan_reduced_second_f32x4 = wasm_f32x4_mul(one_minus_f_f32x4, tan_second_f32x4);
435
458
 
436
459
  // cos(U) = 1/sqrt(1 + tan^2(U)), sin(U) = tan(U) * cos(U)
437
- v128_t cos_reduced_first = wasm_f32x4_div(
438
- one, wasm_f32x4_sqrt(wasm_f32x4_relaxed_madd(tan_reduced_first, tan_reduced_first, one)));
439
- v128_t sin_reduced_first = wasm_f32x4_mul(tan_reduced_first, cos_reduced_first);
440
- v128_t cos_reduced_second = wasm_f32x4_div(
441
- one, wasm_f32x4_sqrt(wasm_f32x4_relaxed_madd(tan_reduced_second, tan_reduced_second, one)));
442
- v128_t sin_reduced_second = wasm_f32x4_mul(tan_reduced_second, cos_reduced_second);
443
-
444
- // Initialize lambda and tracking variables
445
- v128_t lambda = longitude_difference;
446
- v128_t sin_angular_distance, cos_angular_distance, angular_distance;
447
- v128_t sin_azimuth, cos_squared_azimuth, cos_double_angular_midpoint;
460
+ v128_t cos_reduced_first_f32x4 = wasm_f32x4_div(
461
+ one_f32x4,
462
+ wasm_f32x4_sqrt(wasm_f32x4_relaxed_madd(tan_reduced_first_f32x4, tan_reduced_first_f32x4, one_f32x4)));
463
+ v128_t sin_reduced_first_f32x4 = wasm_f32x4_mul(tan_reduced_first_f32x4, cos_reduced_first_f32x4);
464
+ v128_t cos_reduced_second_f32x4 = wasm_f32x4_div(
465
+ one_f32x4,
466
+ wasm_f32x4_sqrt(wasm_f32x4_relaxed_madd(tan_reduced_second_f32x4, tan_reduced_second_f32x4, one_f32x4)));
467
+ v128_t sin_reduced_second_f32x4 = wasm_f32x4_mul(tan_reduced_second_f32x4, cos_reduced_second_f32x4);
468
+
469
+ // Initialize lambda_f32x4 and tracking variables
470
+ v128_t lambda_f32x4 = longitude_difference_f32x4;
471
+ v128_t sin_angular_distance_f32x4, cos_angular_distance_f32x4, angular_distance_f32x4;
472
+ v128_t sin_azimuth_f32x4, cos_squared_azimuth_f32x4, cos_double_angular_midpoint_f32x4;
448
473
 
449
474
  // Track convergence and coincident points using masks
450
- v128_t converged_mask = wasm_i32x4_splat(0);
451
- v128_t coincident_mask = wasm_i32x4_splat(0);
475
+ v128_t converged_mask_i32x4 = wasm_i32x4_splat(0);
476
+ v128_t coincident_mask_i32x4 = wasm_i32x4_splat(0);
452
477
 
453
478
  for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS; ++iteration) {
454
479
  // Check if all lanes converged
455
- if (wasm_i8x16_all_true(converged_mask)) break;
480
+ if (wasm_i8x16_all_true(converged_mask_i32x4)) break;
456
481
 
457
- v128_t sin_lambda = nk_f32x4_sin_v128relaxed_(lambda);
458
- v128_t cos_lambda = nk_f32x4_cos_v128relaxed_(lambda);
482
+ v128_t sin_lambda_f32x4 = nk_f32x4_sin_v128relaxed_(lambda_f32x4);
483
+ v128_t cos_lambda_f32x4 = nk_f32x4_cos_v128relaxed_(lambda_f32x4);
459
484
 
460
- // sin^2(angular_distance) = (cos(U2) * sin(l))^2 + (cos(U1) * sin(U2) - sin(U1) * cos(U2) * cos(l))^2
461
- v128_t cross_term = wasm_f32x4_mul(cos_reduced_second, sin_lambda);
462
- v128_t mixed_term = wasm_f32x4_sub(
463
- wasm_f32x4_mul(cos_reduced_first, sin_reduced_second),
464
- wasm_f32x4_mul(wasm_f32x4_mul(sin_reduced_first, cos_reduced_second), cos_lambda));
465
- v128_t sin_angular_dist_sq = wasm_f32x4_relaxed_madd(cross_term, cross_term,
466
- wasm_f32x4_mul(mixed_term, mixed_term));
467
- sin_angular_distance = wasm_f32x4_sqrt(sin_angular_dist_sq);
485
+ // sin^2(angular_distance_f32x4) = (cos(U2) * sin(l))^2 + (cos(U1) * sin(U2) - sin(U1) * cos(U2) * cos(l))^2
486
+ v128_t cross_term_f32x4 = wasm_f32x4_mul(cos_reduced_second_f32x4, sin_lambda_f32x4);
487
+ v128_t mixed_term_f32x4 = wasm_f32x4_sub(
488
+ wasm_f32x4_mul(cos_reduced_first_f32x4, sin_reduced_second_f32x4),
489
+ wasm_f32x4_mul(wasm_f32x4_mul(sin_reduced_first_f32x4, cos_reduced_second_f32x4), cos_lambda_f32x4));
490
+ v128_t sin_angular_dist_sq_f32x4 = wasm_f32x4_relaxed_madd(cross_term_f32x4, cross_term_f32x4,
491
+ wasm_f32x4_mul(mixed_term_f32x4, mixed_term_f32x4));
492
+ sin_angular_distance_f32x4 = wasm_f32x4_sqrt(sin_angular_dist_sq_f32x4);
468
493
 
469
- // Check for coincident points (sin_angular_distance ~ 0)
470
- coincident_mask = wasm_f32x4_lt(sin_angular_distance, epsilon);
494
+ // Check for coincident points (sin_angular_distance_f32x4 ~ 0)
495
+ coincident_mask_i32x4 = wasm_f32x4_lt(sin_angular_distance_f32x4, epsilon_f32x4);
471
496
 
472
- // cos(angular_distance) = sin(U1) * sin(U2) + cos(U1) * cos(U2) * cos(l)
473
- cos_angular_distance = wasm_f32x4_relaxed_madd(wasm_f32x4_mul(cos_reduced_first, cos_reduced_second),
474
- cos_lambda,
475
- wasm_f32x4_mul(sin_reduced_first, sin_reduced_second));
497
+ // cos(angular_distance_f32x4) = sin(U1) * sin(U2) + cos(U1) * cos(U2) * cos(l)
498
+ cos_angular_distance_f32x4 = wasm_f32x4_relaxed_madd(
499
+ wasm_f32x4_mul(cos_reduced_first_f32x4, cos_reduced_second_f32x4), cos_lambda_f32x4,
500
+ wasm_f32x4_mul(sin_reduced_first_f32x4, sin_reduced_second_f32x4));
476
501
 
477
- // angular_distance = atan2(sin, cos)
478
- angular_distance = nk_f32x4_atan2_v128relaxed_(sin_angular_distance, cos_angular_distance);
502
+ // angular_distance_f32x4 = atan2(sin, cos)
503
+ angular_distance_f32x4 = nk_f32x4_atan2_v128relaxed_(sin_angular_distance_f32x4, cos_angular_distance_f32x4);
479
504
 
480
- // sin(azimuth) = cos(U1) * cos(U2) * sin(l) / sin(angular_distance)
505
+ // sin(azimuth) = cos(U1) * cos(U2) * sin(l) / sin(angular_distance_f32x4)
481
506
  // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
482
507
  // Safe because mask is from comparison (all-ones or all-zeros per lane).
483
- v128_t safe_sin_angular = wasm_i32x4_relaxed_laneselect(one, sin_angular_distance, coincident_mask);
484
- sin_azimuth = wasm_f32x4_div(wasm_f32x4_mul(wasm_f32x4_mul(cos_reduced_first, cos_reduced_second), sin_lambda),
485
- safe_sin_angular);
486
- cos_squared_azimuth = wasm_f32x4_relaxed_nmadd(sin_azimuth, sin_azimuth, one);
508
+ v128_t safe_sin_angular_i32x4 = wasm_i32x4_relaxed_laneselect(one_f32x4, sin_angular_distance_f32x4,
509
+ coincident_mask_i32x4);
510
+ sin_azimuth_f32x4 = wasm_f32x4_div(
511
+ wasm_f32x4_mul(wasm_f32x4_mul(cos_reduced_first_f32x4, cos_reduced_second_f32x4), sin_lambda_f32x4),
512
+ safe_sin_angular_i32x4);
513
+ cos_squared_azimuth_f32x4 = wasm_f32x4_relaxed_nmadd(sin_azimuth_f32x4, sin_azimuth_f32x4, one_f32x4);
487
514
 
488
515
  // Handle equatorial case: cos^2(a) ~ 0
489
- v128_t equatorial_mask = wasm_f32x4_lt(cos_squared_azimuth, epsilon);
490
- v128_t safe_cos_sq_azimuth = wasm_i32x4_relaxed_laneselect(one, cos_squared_azimuth, equatorial_mask);
516
+ v128_t equatorial_mask_f32x4 = wasm_f32x4_lt(cos_squared_azimuth_f32x4, epsilon_f32x4);
517
+ v128_t safe_cos_sq_azimuth_i32x4 = wasm_i32x4_relaxed_laneselect(one_f32x4, cos_squared_azimuth_f32x4,
518
+ equatorial_mask_f32x4);
491
519
 
492
520
  // cos(2sm) = cos(s) - 2 * sin(U1) * sin(U2) / cos^2(a)
493
- v128_t sin_product = wasm_f32x4_mul(sin_reduced_first, sin_reduced_second);
494
- cos_double_angular_midpoint = wasm_f32x4_sub(
495
- cos_angular_distance, wasm_f32x4_div(wasm_f32x4_mul(two, sin_product), safe_cos_sq_azimuth));
496
- cos_double_angular_midpoint = wasm_i32x4_relaxed_laneselect(wasm_f32x4_splat(0.0f), cos_double_angular_midpoint,
497
- equatorial_mask);
521
+ v128_t sin_product_f32x4 = wasm_f32x4_mul(sin_reduced_first_f32x4, sin_reduced_second_f32x4);
522
+ cos_double_angular_midpoint_f32x4 = wasm_f32x4_sub(
523
+ cos_angular_distance_f32x4,
524
+ wasm_f32x4_div(wasm_f32x4_mul(two_f32x4, sin_product_f32x4), safe_cos_sq_azimuth_i32x4));
525
+ cos_double_angular_midpoint_f32x4 = wasm_i32x4_relaxed_laneselect(
526
+ wasm_f32x4_splat(0.0f), cos_double_angular_midpoint_f32x4, equatorial_mask_f32x4);
498
527
 
499
528
  // C = f/16 * cos^2(a) * (4 + f*(4 - 3*cos^2(a)))
500
- v128_t correction_factor = wasm_f32x4_mul(
501
- wasm_f32x4_div(flattening, sixteen),
529
+ v128_t correction_factor_f32x4 = wasm_f32x4_mul(
530
+ wasm_f32x4_div(flattening_f32x4, sixteen_f32x4),
502
531
  wasm_f32x4_mul(
503
- cos_squared_azimuth,
504
- wasm_f32x4_relaxed_madd(flattening, wasm_f32x4_relaxed_nmadd(three, cos_squared_azimuth, four), four)));
532
+ cos_squared_azimuth_f32x4,
533
+ wasm_f32x4_relaxed_madd(flattening_f32x4,
534
+ wasm_f32x4_relaxed_nmadd(three_f32x4, cos_squared_azimuth_f32x4, four_f32x4),
535
+ four_f32x4)));
505
536
 
506
537
  // l' = L + (1-C) * f * sin(a) * (s + C * sin(s) * (cos(2sm) + C * cos(s) * (-1 + 2 * cos^2(2sm))))
507
- v128_t cos_2sm_sq = wasm_f32x4_mul(cos_double_angular_midpoint, cos_double_angular_midpoint);
508
- v128_t innermost = wasm_f32x4_relaxed_madd(two, cos_2sm_sq, wasm_f32x4_splat(-1.0f));
509
- v128_t middle = wasm_f32x4_relaxed_madd(wasm_f32x4_mul(correction_factor, cos_angular_distance), innermost,
510
- cos_double_angular_midpoint);
511
- v128_t inner = wasm_f32x4_mul(wasm_f32x4_mul(correction_factor, sin_angular_distance), middle);
512
-
513
- v128_t lambda_new = wasm_f32x4_relaxed_madd(
514
- wasm_f32x4_mul(wasm_f32x4_mul(wasm_f32x4_sub(one, correction_factor), flattening), sin_azimuth),
515
- wasm_f32x4_add(angular_distance, inner), longitude_difference);
538
+ v128_t cos_2sm_sq_f32x4 = wasm_f32x4_mul(cos_double_angular_midpoint_f32x4, cos_double_angular_midpoint_f32x4);
539
+ v128_t innermost_f32x4 = wasm_f32x4_relaxed_madd(two_f32x4, cos_2sm_sq_f32x4, wasm_f32x4_splat(-1.0f));
540
+ v128_t middle_f32x4 = wasm_f32x4_relaxed_madd(
541
+ wasm_f32x4_mul(correction_factor_f32x4, cos_angular_distance_f32x4), innermost_f32x4,
542
+ cos_double_angular_midpoint_f32x4);
543
+ v128_t inner_f32x4 = wasm_f32x4_mul(wasm_f32x4_mul(correction_factor_f32x4, sin_angular_distance_f32x4),
544
+ middle_f32x4);
545
+
546
+ v128_t lambda_new_f32x4 = wasm_f32x4_relaxed_madd(
547
+ wasm_f32x4_mul(wasm_f32x4_mul(wasm_f32x4_sub(one_f32x4, correction_factor_f32x4), flattening_f32x4),
548
+ sin_azimuth_f32x4),
549
+ wasm_f32x4_add(angular_distance_f32x4, inner_f32x4), longitude_difference_f32x4);
516
550
 
517
551
  // Check convergence: |l - l'| < threshold
518
- v128_t lambda_diff = wasm_f32x4_sub(lambda_new, lambda);
519
- v128_t lambda_diff_abs = wasm_f32x4_abs(lambda_diff);
520
- v128_t newly_converged = wasm_f32x4_lt(lambda_diff_abs, convergence_threshold);
521
- converged_mask = wasm_v128_or(converged_mask, newly_converged);
552
+ v128_t lambda_diff_f32x4 = wasm_f32x4_sub(lambda_new_f32x4, lambda_f32x4);
553
+ v128_t lambda_diff_abs_f32x4 = wasm_f32x4_abs(lambda_diff_f32x4);
554
+ v128_t newly_converged_f32x4 = wasm_f32x4_lt(lambda_diff_abs_f32x4, convergence_threshold_f32x4);
555
+ converged_mask_i32x4 = wasm_v128_or(converged_mask_i32x4, newly_converged_f32x4);
522
556
 
523
- // Only update lambda for non-converged lanes
557
+ // Only update lambda_f32x4 for non-converged lanes
524
558
  // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
525
559
  // Safe because mask is from comparison (all-ones or all-zeros per lane).
526
- lambda = wasm_i32x4_relaxed_laneselect(lambda, lambda_new, converged_mask);
560
+ lambda_f32x4 = wasm_i32x4_relaxed_laneselect(lambda_f32x4, lambda_new_f32x4, converged_mask_i32x4);
527
561
  }
528
562
 
529
563
  // Final distance calculation
530
- v128_t a_sq = wasm_f32x4_mul(equatorial_radius, equatorial_radius);
531
- v128_t b_sq = wasm_f32x4_mul(polar_radius, polar_radius);
532
- v128_t u_squared = wasm_f32x4_div(wasm_f32x4_mul(cos_squared_azimuth, wasm_f32x4_sub(a_sq, b_sq)), b_sq);
564
+ v128_t a_sq_f32x4 = wasm_f32x4_mul(equatorial_radius_f32x4, equatorial_radius_f32x4);
565
+ v128_t b_sq_f32x4 = wasm_f32x4_mul(polar_radius_f32x4, polar_radius_f32x4);
566
+ v128_t u_squared_f32x4 = wasm_f32x4_div(
567
+ wasm_f32x4_mul(cos_squared_azimuth_f32x4, wasm_f32x4_sub(a_sq_f32x4, b_sq_f32x4)), b_sq_f32x4);
533
568
 
534
569
  // A = 1 + u^2/16384 * (4096 + u^2*(-768 + u^2*(320 - 175*u^2)))
535
- v128_t series_a = wasm_f32x4_relaxed_madd(u_squared, wasm_f32x4_splat(-175.0f), wasm_f32x4_splat(320.0f));
536
- series_a = wasm_f32x4_relaxed_madd(u_squared, series_a, wasm_f32x4_splat(-768.0f));
537
- series_a = wasm_f32x4_relaxed_madd(u_squared, series_a, wasm_f32x4_splat(4096.0f));
538
- series_a = wasm_f32x4_relaxed_madd(wasm_f32x4_div(u_squared, wasm_f32x4_splat(16384.0f)), series_a, one);
570
+ v128_t series_a_f32x4 = wasm_f32x4_relaxed_madd(u_squared_f32x4, wasm_f32x4_splat(-175.0f),
571
+ wasm_f32x4_splat(320.0f));
572
+ series_a_f32x4 = wasm_f32x4_relaxed_madd(u_squared_f32x4, series_a_f32x4, wasm_f32x4_splat(-768.0f));
573
+ series_a_f32x4 = wasm_f32x4_relaxed_madd(u_squared_f32x4, series_a_f32x4, wasm_f32x4_splat(4096.0f));
574
+ series_a_f32x4 = wasm_f32x4_relaxed_madd(wasm_f32x4_div(u_squared_f32x4, wasm_f32x4_splat(16384.0f)),
575
+ series_a_f32x4, one_f32x4);
539
576
 
540
577
  // B = u^2/1024 * (256 + u^2*(-128 + u^2*(74 - 47*u^2)))
541
- v128_t series_b = wasm_f32x4_relaxed_madd(u_squared, wasm_f32x4_splat(-47.0f), wasm_f32x4_splat(74.0f));
542
- series_b = wasm_f32x4_relaxed_madd(u_squared, series_b, wasm_f32x4_splat(-128.0f));
543
- series_b = wasm_f32x4_relaxed_madd(u_squared, series_b, wasm_f32x4_splat(256.0f));
544
- series_b = wasm_f32x4_mul(wasm_f32x4_div(u_squared, wasm_f32x4_splat(1024.0f)), series_b);
578
+ v128_t series_b_f32x4 = wasm_f32x4_relaxed_madd(u_squared_f32x4, wasm_f32x4_splat(-47.0f), wasm_f32x4_splat(74.0f));
579
+ series_b_f32x4 = wasm_f32x4_relaxed_madd(u_squared_f32x4, series_b_f32x4, wasm_f32x4_splat(-128.0f));
580
+ series_b_f32x4 = wasm_f32x4_relaxed_madd(u_squared_f32x4, series_b_f32x4, wasm_f32x4_splat(256.0f));
581
+ series_b_f32x4 = wasm_f32x4_mul(wasm_f32x4_div(u_squared_f32x4, wasm_f32x4_splat(1024.0f)), series_b_f32x4);
545
582
 
546
583
  // Delta-sigma calculation
547
- v128_t cos_2sm_sq = wasm_f32x4_mul(cos_double_angular_midpoint, cos_double_angular_midpoint);
548
- v128_t sin_sq = wasm_f32x4_mul(sin_angular_distance, sin_angular_distance);
549
- v128_t term1 = wasm_f32x4_relaxed_madd(two, cos_2sm_sq, wasm_f32x4_splat(-1.0f));
550
- term1 = wasm_f32x4_mul(cos_angular_distance, term1);
551
- v128_t term2 = wasm_f32x4_relaxed_madd(four, sin_sq, wasm_f32x4_splat(-3.0f));
552
- v128_t term3 = wasm_f32x4_relaxed_madd(four, cos_2sm_sq, wasm_f32x4_splat(-3.0f));
553
- term2 = wasm_f32x4_mul(wasm_f32x4_mul(wasm_f32x4_div(series_b, six), cos_double_angular_midpoint),
554
- wasm_f32x4_mul(term2, term3));
555
- v128_t delta_sigma = wasm_f32x4_mul(
556
- series_b, wasm_f32x4_mul(sin_angular_distance, wasm_f32x4_add(cos_double_angular_midpoint,
557
- wasm_f32x4_mul(wasm_f32x4_div(series_b, four),
558
- wasm_f32x4_sub(term1, term2)))));
584
+ v128_t cos_2sm_sq_f32x4 = wasm_f32x4_mul(cos_double_angular_midpoint_f32x4, cos_double_angular_midpoint_f32x4);
585
+ v128_t sin_sq_f32x4 = wasm_f32x4_mul(sin_angular_distance_f32x4, sin_angular_distance_f32x4);
586
+ v128_t term1_f32x4 = wasm_f32x4_relaxed_madd(two_f32x4, cos_2sm_sq_f32x4, wasm_f32x4_splat(-1.0f));
587
+ term1_f32x4 = wasm_f32x4_mul(cos_angular_distance_f32x4, term1_f32x4);
588
+ v128_t term2_f32x4 = wasm_f32x4_relaxed_madd(four_f32x4, sin_sq_f32x4, wasm_f32x4_splat(-3.0f));
589
+ v128_t term3_f32x4 = wasm_f32x4_relaxed_madd(four_f32x4, cos_2sm_sq_f32x4, wasm_f32x4_splat(-3.0f));
590
+ term2_f32x4 = wasm_f32x4_mul(
591
+ wasm_f32x4_mul(wasm_f32x4_div(series_b_f32x4, six_f32x4), cos_double_angular_midpoint_f32x4),
592
+ wasm_f32x4_mul(term2_f32x4, term3_f32x4));
593
+ v128_t delta_sigma_f32x4 = wasm_f32x4_mul(
594
+ series_b_f32x4, wasm_f32x4_mul(sin_angular_distance_f32x4,
595
+ wasm_f32x4_add(cos_double_angular_midpoint_f32x4,
596
+ wasm_f32x4_mul(wasm_f32x4_div(series_b_f32x4, four_f32x4),
597
+ wasm_f32x4_sub(term1_f32x4, term2_f32x4)))));
559
598
 
560
599
  // s = b * A * (s - ds)
561
- v128_t distances = wasm_f32x4_mul(wasm_f32x4_mul(polar_radius, series_a),
562
- wasm_f32x4_sub(angular_distance, delta_sigma));
600
+ v128_t distances_f32x4 = wasm_f32x4_mul(wasm_f32x4_mul(polar_radius_f32x4, series_a_f32x4),
601
+ wasm_f32x4_sub(angular_distance_f32x4, delta_sigma_f32x4));
563
602
 
564
603
  // Set coincident points to zero
565
604
  // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
566
605
  // Safe because mask is from comparison (all-ones or all-zeros per lane).
567
- distances = wasm_i32x4_relaxed_laneselect(wasm_f32x4_splat(0.0f), distances, coincident_mask);
606
+ distances_f32x4 = wasm_i32x4_relaxed_laneselect(wasm_f32x4_splat(0.0f), distances_f32x4, coincident_mask_i32x4);
568
607
 
569
- return distances;
608
+ return distances_f32x4;
570
609
  }
571
610
 
572
611
  NK_PUBLIC void nk_vincenty_f32_v128relaxed( //
@@ -575,14 +614,14 @@ NK_PUBLIC void nk_vincenty_f32_v128relaxed( //
575
614
  nk_size_t n, nk_f32_t *results) {
576
615
 
577
616
  while (n >= 4) {
578
- v128_t first_latitudes = wasm_v128_load(a_lats);
579
- v128_t first_longitudes = wasm_v128_load(a_lons);
580
- v128_t second_latitudes = wasm_v128_load(b_lats);
581
- v128_t second_longitudes = wasm_v128_load(b_lons);
617
+ v128_t first_latitudes_f32x4 = wasm_v128_load(a_lats);
618
+ v128_t first_longitudes_f32x4 = wasm_v128_load(a_lons);
619
+ v128_t second_latitudes_f32x4 = wasm_v128_load(b_lats);
620
+ v128_t second_longitudes_f32x4 = wasm_v128_load(b_lons);
582
621
 
583
- v128_t distances = nk_vincenty_f32x4_v128relaxed_(first_latitudes, first_longitudes, second_latitudes,
584
- second_longitudes);
585
- wasm_v128_store(results, distances);
622
+ v128_t distances_f32x4 = nk_vincenty_f32x4_v128relaxed_(first_latitudes_f32x4, first_longitudes_f32x4,
623
+ second_latitudes_f32x4, second_longitudes_f32x4);
624
+ wasm_v128_store(results, distances_f32x4);
586
625
 
587
626
  a_lats += 4, a_lons += 4, b_lats += 4, b_lons += 4, results += 4, n -= 4;
588
627
  }
@@ -594,9 +633,9 @@ NK_PUBLIC void nk_vincenty_f32_v128relaxed( //
594
633
  nk_partial_load_b32x4_serial_(a_lons, &a_lon_vec, n);
595
634
  nk_partial_load_b32x4_serial_(b_lats, &b_lat_vec, n);
596
635
  nk_partial_load_b32x4_serial_(b_lons, &b_lon_vec, n);
597
- v128_t distances = nk_vincenty_f32x4_v128relaxed_(a_lat_vec.v128, a_lon_vec.v128, b_lat_vec.v128,
598
- b_lon_vec.v128);
599
- result_vec.v128 = distances;
636
+ v128_t distances_f32x4 = nk_vincenty_f32x4_v128relaxed_(a_lat_vec.v128, a_lon_vec.v128, b_lat_vec.v128,
637
+ b_lon_vec.v128);
638
+ result_vec.v128 = distances_f32x4;
600
639
  nk_partial_store_b32x4_serial_(&result_vec, results, n);
601
640
  }
602
641
  }