numkong 7.0.0 → 7.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315)
  1. package/README.md +239 -122
  2. package/binding.gyp +25 -491
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -8,13 +8,14 @@
8
8
  *
9
9
  * @section geospatial_haswell_instructions Key AVX2 Geospatial Instructions
10
10
  *
11
- * Intrinsic Instruction Ice Genoa
12
- * _mm256_sqrt_ps VSQRTPS (YMM, YMM) 12c @ p0 15c @ p01
13
- * _mm256_sqrt_pd VSQRTPD (YMM, YMM) 13c @ p0 21c @ p01
14
- * _mm256_div_ps VDIVPS (YMM, YMM, YMM) 11c @ p0 11c @ p01
15
- * _mm256_div_pd VDIVPD (YMM, YMM, YMM) 13c @ p0 13c @ p01
16
- * _mm256_fmadd_ps VFMADD231PS (YMM, YMM, YMM) 4c @ p01 4c @ p01
17
- * _mm256_fmadd_pd VFMADD231PD (YMM, YMM, YMM) 4c @ p01 4c @ p01
11
+ * Intrinsic Instruction Icelake Genoa
12
+ * _mm256_sqrt_ps VSQRTPS (YMM, YMM) 12cy @ p0 15cy @ p01
13
+ * _mm256_sqrt_pd VSQRTPD (YMM, YMM) 13cy @ p0 21cy @ p01
14
+ * _mm256_div_ps VDIVPS (YMM, YMM, YMM) 11cy @ p0 11cy @ p01
15
+ * _mm256_div_pd VDIVPD (YMM, YMM, YMM) 13cy @ p0 13cy @ p01
16
+ * _mm256_fmadd_ps VFMADD231PS (YMM, YMM, YMM) 4cy @ p01 4cy @ p01
17
+ * _mm256_fmadd_pd VFMADD231PD (YMM, YMM, YMM) 4cy @ p01 4cy @ p01
18
+ * _mm256_cmp_ps VCMPPS (YMM, YMM, YMM, I8) 3cy @ p01 3cy @ p01
18
19
  */
19
20
  #ifndef NK_GEOSPATIAL_HASWELL_H
20
21
  #define NK_GEOSPATIAL_HASWELL_H
@@ -40,44 +41,48 @@ extern "C" {
40
41
  * These require AVX2 trigonometric kernels from trigonometry.h.
41
42
  */
42
43
 
43
- NK_INTERNAL __m256d nk_haversine_f64x4_haswell_( //
44
- __m256d first_latitudes, __m256d first_longitudes, //
45
- __m256d second_latitudes, __m256d second_longitudes) {
44
+ NK_INTERNAL __m256d nk_haversine_f64x4_haswell_( //
45
+ __m256d first_latitudes_f64x4, __m256d first_longitudes_f64x4, //
46
+ __m256d second_latitudes_f64x4, __m256d second_longitudes_f64x4) {
46
47
 
47
- __m256d const earth_radius = _mm256_set1_pd(NK_EARTH_MEDIATORIAL_RADIUS);
48
- __m256d const half = _mm256_set1_pd(0.5);
49
- __m256d const one = _mm256_set1_pd(1.0);
50
- __m256d const two = _mm256_set1_pd(2.0);
48
+ __m256d const earth_radius_f64x4 = _mm256_set1_pd(NK_EARTH_MEDIATORIAL_RADIUS);
49
+ __m256d const half_f64x4 = _mm256_set1_pd(0.5);
50
+ __m256d const one_f64x4 = _mm256_set1_pd(1.0);
51
+ __m256d const two_f64x4 = _mm256_set1_pd(2.0);
51
52
 
52
- __m256d latitude_delta = _mm256_sub_pd(second_latitudes, first_latitudes);
53
- __m256d longitude_delta = _mm256_sub_pd(second_longitudes, first_longitudes);
53
+ __m256d latitude_delta_f64x4 = _mm256_sub_pd(second_latitudes_f64x4, first_latitudes_f64x4);
54
+ __m256d longitude_delta_f64x4 = _mm256_sub_pd(second_longitudes_f64x4, first_longitudes_f64x4);
54
55
 
55
56
  // Haversine terms: sin²(Δ/2)
56
- __m256d latitude_delta_half = _mm256_mul_pd(latitude_delta, half);
57
- __m256d longitude_delta_half = _mm256_mul_pd(longitude_delta, half);
58
- __m256d sin_latitude_delta_half = nk_sin_f64x4_haswell_(latitude_delta_half);
59
- __m256d sin_longitude_delta_half = nk_sin_f64x4_haswell_(longitude_delta_half);
60
- __m256d sin_squared_latitude_delta_half = _mm256_mul_pd(sin_latitude_delta_half, sin_latitude_delta_half);
61
- __m256d sin_squared_longitude_delta_half = _mm256_mul_pd(sin_longitude_delta_half, sin_longitude_delta_half);
57
+ __m256d latitude_delta_half_f64x4 = _mm256_mul_pd(latitude_delta_f64x4, half_f64x4);
58
+ __m256d longitude_delta_half_f64x4 = _mm256_mul_pd(longitude_delta_f64x4, half_f64x4);
59
+ __m256d sin_latitude_delta_half_f64x4 = nk_sin_f64x4_haswell_(latitude_delta_half_f64x4);
60
+ __m256d sin_longitude_delta_half_f64x4 = nk_sin_f64x4_haswell_(longitude_delta_half_f64x4);
61
+ __m256d sin_squared_latitude_delta_half_f64x4 = _mm256_mul_pd(sin_latitude_delta_half_f64x4,
62
+ sin_latitude_delta_half_f64x4);
63
+ __m256d sin_squared_longitude_delta_half_f64x4 = _mm256_mul_pd(sin_longitude_delta_half_f64x4,
64
+ sin_longitude_delta_half_f64x4);
62
65
 
63
66
  // Latitude cosine product
64
- __m256d cos_first_latitude = nk_cos_f64x4_haswell_(first_latitudes);
65
- __m256d cos_second_latitude = nk_cos_f64x4_haswell_(second_latitudes);
66
- __m256d cos_latitude_product = _mm256_mul_pd(cos_first_latitude, cos_second_latitude);
67
+ __m256d cos_first_latitude_f64x4 = nk_cos_f64x4_haswell_(first_latitudes_f64x4);
68
+ __m256d cos_second_latitude_f64x4 = nk_cos_f64x4_haswell_(second_latitudes_f64x4);
69
+ __m256d cos_latitude_product_f64x4 = _mm256_mul_pd(cos_first_latitude_f64x4, cos_second_latitude_f64x4);
67
70
 
68
71
  // a = sin²(Δlat/2) + cos(lat1) × cos(lat2) × sin²(Δlon/2)
69
- __m256d haversine_term = _mm256_add_pd(sin_squared_latitude_delta_half,
70
- _mm256_mul_pd(cos_latitude_product, sin_squared_longitude_delta_half));
71
- // Clamp haversine_term to [0, 1] to prevent NaN from sqrt of negative values
72
- __m256d zero = _mm256_setzero_pd();
73
- haversine_term = _mm256_max_pd(zero, _mm256_min_pd(one, haversine_term));
72
+ __m256d haversine_term_f64x4 = _mm256_add_pd(
73
+ sin_squared_latitude_delta_half_f64x4,
74
+ _mm256_mul_pd(cos_latitude_product_f64x4, sin_squared_longitude_delta_half_f64x4));
75
+ // Clamp haversine_term_f64x4 to [0, 1] to prevent NaN from sqrt of negative values
76
+ __m256d zero_f64x4 = _mm256_setzero_pd();
77
+ haversine_term_f64x4 = _mm256_max_pd(zero_f64x4, _mm256_min_pd(one_f64x4, haversine_term_f64x4));
74
78
 
75
79
  // Central angle: c = 2 × atan2(√a, √(1-a))
76
- __m256d sqrt_haversine = _mm256_sqrt_pd(haversine_term);
77
- __m256d sqrt_complement = _mm256_sqrt_pd(_mm256_sub_pd(one, haversine_term));
78
- __m256d central_angle = _mm256_mul_pd(two, nk_atan2_f64x4_haswell_(sqrt_haversine, sqrt_complement));
80
+ __m256d sqrt_haversine_f64x4 = _mm256_sqrt_pd(haversine_term_f64x4);
81
+ __m256d sqrt_complement_f64x4 = _mm256_sqrt_pd(_mm256_sub_pd(one_f64x4, haversine_term_f64x4));
82
+ __m256d central_angle_f64x4 = _mm256_mul_pd(two_f64x4,
83
+ nk_atan2_f64x4_haswell_(sqrt_haversine_f64x4, sqrt_complement_f64x4));
79
84
 
80
- return _mm256_mul_pd(earth_radius, central_angle);
85
+ return _mm256_mul_pd(earth_radius_f64x4, central_angle_f64x4);
81
86
  }
82
87
 
83
88
  NK_PUBLIC void nk_haversine_f64_haswell( //
@@ -86,14 +91,14 @@ NK_PUBLIC void nk_haversine_f64_haswell( //
86
91
  nk_size_t n, nk_f64_t *results) {
87
92
 
88
93
  while (n >= 4) {
89
- __m256d first_latitudes = _mm256_loadu_pd(a_lats);
90
- __m256d first_longitudes = _mm256_loadu_pd(a_lons);
91
- __m256d second_latitudes = _mm256_loadu_pd(b_lats);
92
- __m256d second_longitudes = _mm256_loadu_pd(b_lons);
94
+ __m256d first_latitudes_f64x4 = _mm256_loadu_pd(a_lats);
95
+ __m256d first_longitudes_f64x4 = _mm256_loadu_pd(a_lons);
96
+ __m256d second_latitudes_f64x4 = _mm256_loadu_pd(b_lats);
97
+ __m256d second_longitudes_f64x4 = _mm256_loadu_pd(b_lons);
93
98
 
94
- __m256d distances = nk_haversine_f64x4_haswell_(first_latitudes, first_longitudes, second_latitudes,
95
- second_longitudes);
96
- _mm256_storeu_pd(results, distances);
99
+ __m256d distances_f64x4 = nk_haversine_f64x4_haswell_(first_latitudes_f64x4, first_longitudes_f64x4,
100
+ second_latitudes_f64x4, second_longitudes_f64x4);
101
+ _mm256_storeu_pd(results, distances_f64x4);
97
102
 
98
103
  a_lats += 4, a_lons += 4, b_lats += 4, b_lons += 4, results += 4, n -= 4;
99
104
  }
@@ -105,52 +110,56 @@ NK_PUBLIC void nk_haversine_f64_haswell( //
105
110
  nk_partial_load_b64x4_haswell_(a_lons, &a_lon_vec, n);
106
111
  nk_partial_load_b64x4_haswell_(b_lats, &b_lat_vec, n);
107
112
  nk_partial_load_b64x4_haswell_(b_lons, &b_lon_vec, n);
108
- __m256d distances = nk_haversine_f64x4_haswell_(a_lat_vec.ymm_pd, a_lon_vec.ymm_pd, b_lat_vec.ymm_pd,
109
- b_lon_vec.ymm_pd);
110
- result_vec.ymm_pd = distances;
113
+ __m256d distances_f64x4 = nk_haversine_f64x4_haswell_(a_lat_vec.ymm_pd, a_lon_vec.ymm_pd, b_lat_vec.ymm_pd,
114
+ b_lon_vec.ymm_pd);
115
+ result_vec.ymm_pd = distances_f64x4;
111
116
  nk_partial_store_b64x4_haswell_(&result_vec, results, n);
112
117
  }
113
118
  }
114
119
 
115
- NK_INTERNAL __m256 nk_haversine_f32x8_haswell_( //
116
- __m256 first_latitudes, __m256 first_longitudes, //
117
- __m256 second_latitudes, __m256 second_longitudes) {
120
+ NK_INTERNAL __m256 nk_haversine_f32x8_haswell_( //
121
+ __m256 first_latitudes_f32x8, __m256 first_longitudes_f32x8, //
122
+ __m256 second_latitudes_f32x8, __m256 second_longitudes_f32x8) {
118
123
 
119
- __m256 const earth_radius = _mm256_set1_ps((float)NK_EARTH_MEDIATORIAL_RADIUS);
120
- __m256 const half = _mm256_set1_ps(0.5f);
121
- __m256 const one = _mm256_set1_ps(1.0f);
122
- __m256 const two = _mm256_set1_ps(2.0f);
124
+ __m256 const earth_radius_f32x8 = _mm256_set1_ps((float)NK_EARTH_MEDIATORIAL_RADIUS);
125
+ __m256 const half_f32x8 = _mm256_set1_ps(0.5f);
126
+ __m256 const one_f32x8 = _mm256_set1_ps(1.0f);
127
+ __m256 const two_f32x8 = _mm256_set1_ps(2.0f);
123
128
 
124
- __m256 latitude_delta = _mm256_sub_ps(second_latitudes, first_latitudes);
125
- __m256 longitude_delta = _mm256_sub_ps(second_longitudes, first_longitudes);
129
+ __m256 latitude_delta_f32x8 = _mm256_sub_ps(second_latitudes_f32x8, first_latitudes_f32x8);
130
+ __m256 longitude_delta_f32x8 = _mm256_sub_ps(second_longitudes_f32x8, first_longitudes_f32x8);
126
131
 
127
132
  // Haversine terms: sin²(Δ/2)
128
- __m256 latitude_delta_half = _mm256_mul_ps(latitude_delta, half);
129
- __m256 longitude_delta_half = _mm256_mul_ps(longitude_delta, half);
130
- __m256 sin_latitude_delta_half = nk_sin_f32x8_haswell_(latitude_delta_half);
131
- __m256 sin_longitude_delta_half = nk_sin_f32x8_haswell_(longitude_delta_half);
132
- __m256 sin_squared_latitude_delta_half = _mm256_mul_ps(sin_latitude_delta_half, sin_latitude_delta_half);
133
- __m256 sin_squared_longitude_delta_half = _mm256_mul_ps(sin_longitude_delta_half, sin_longitude_delta_half);
133
+ __m256 latitude_delta_half_f32x8 = _mm256_mul_ps(latitude_delta_f32x8, half_f32x8);
134
+ __m256 longitude_delta_half_f32x8 = _mm256_mul_ps(longitude_delta_f32x8, half_f32x8);
135
+ __m256 sin_latitude_delta_half_f32x8 = nk_sin_f32x8_haswell_(latitude_delta_half_f32x8);
136
+ __m256 sin_longitude_delta_half_f32x8 = nk_sin_f32x8_haswell_(longitude_delta_half_f32x8);
137
+ __m256 sin_squared_latitude_delta_half_f32x8 = _mm256_mul_ps(sin_latitude_delta_half_f32x8,
138
+ sin_latitude_delta_half_f32x8);
139
+ __m256 sin_squared_longitude_delta_half_f32x8 = _mm256_mul_ps(sin_longitude_delta_half_f32x8,
140
+ sin_longitude_delta_half_f32x8);
134
141
 
135
142
  // Latitude cosine product
136
- __m256 cos_first_latitude = nk_cos_f32x8_haswell_(first_latitudes);
137
- __m256 cos_second_latitude = nk_cos_f32x8_haswell_(second_latitudes);
138
- __m256 cos_latitude_product = _mm256_mul_ps(cos_first_latitude, cos_second_latitude);
143
+ __m256 cos_first_latitude_f32x8 = nk_cos_f32x8_haswell_(first_latitudes_f32x8);
144
+ __m256 cos_second_latitude_f32x8 = nk_cos_f32x8_haswell_(second_latitudes_f32x8);
145
+ __m256 cos_latitude_product_f32x8 = _mm256_mul_ps(cos_first_latitude_f32x8, cos_second_latitude_f32x8);
139
146
 
140
147
  // a = sin²(Δlat/2) + cos(lat1) × cos(lat2) × sin²(Δlon/2)
141
- __m256 haversine_term = _mm256_add_ps(sin_squared_latitude_delta_half,
142
- _mm256_mul_ps(cos_latitude_product, sin_squared_longitude_delta_half));
148
+ __m256 haversine_term_f32x8 = _mm256_add_ps(
149
+ sin_squared_latitude_delta_half_f32x8,
150
+ _mm256_mul_ps(cos_latitude_product_f32x8, sin_squared_longitude_delta_half_f32x8));
143
151
 
144
152
  // Clamp to [0, 1] to avoid NaN from sqrt of negative numbers (due to floating point errors)
145
- __m256 zero = _mm256_setzero_ps();
146
- haversine_term = _mm256_max_ps(zero, _mm256_min_ps(one, haversine_term));
153
+ __m256 zero_f32x8 = _mm256_setzero_ps();
154
+ haversine_term_f32x8 = _mm256_max_ps(zero_f32x8, _mm256_min_ps(one_f32x8, haversine_term_f32x8));
147
155
 
148
156
  // Central angle: c = 2 × atan2(√a, √(1-a))
149
- __m256 sqrt_haversine = _mm256_sqrt_ps(haversine_term);
150
- __m256 sqrt_complement = _mm256_sqrt_ps(_mm256_sub_ps(one, haversine_term));
151
- __m256 central_angle = _mm256_mul_ps(two, nk_atan2_f32x8_haswell_(sqrt_haversine, sqrt_complement));
157
+ __m256 sqrt_haversine_f32x8 = _mm256_sqrt_ps(haversine_term_f32x8);
158
+ __m256 sqrt_complement_f32x8 = _mm256_sqrt_ps(_mm256_sub_ps(one_f32x8, haversine_term_f32x8));
159
+ __m256 central_angle_f32x8 = _mm256_mul_ps(two_f32x8,
160
+ nk_atan2_f32x8_haswell_(sqrt_haversine_f32x8, sqrt_complement_f32x8));
152
161
 
153
- return _mm256_mul_ps(earth_radius, central_angle);
162
+ return _mm256_mul_ps(earth_radius_f32x8, central_angle_f32x8);
154
163
  }
155
164
 
156
165
  NK_PUBLIC void nk_haversine_f32_haswell( //
@@ -159,14 +168,14 @@ NK_PUBLIC void nk_haversine_f32_haswell( //
159
168
  nk_size_t n, nk_f32_t *results) {
160
169
 
161
170
  while (n >= 8) {
162
- __m256 first_latitudes = _mm256_loadu_ps(a_lats);
163
- __m256 first_longitudes = _mm256_loadu_ps(a_lons);
164
- __m256 second_latitudes = _mm256_loadu_ps(b_lats);
165
- __m256 second_longitudes = _mm256_loadu_ps(b_lons);
171
+ __m256 first_latitudes_f32x8 = _mm256_loadu_ps(a_lats);
172
+ __m256 first_longitudes_f32x8 = _mm256_loadu_ps(a_lons);
173
+ __m256 second_latitudes_f32x8 = _mm256_loadu_ps(b_lats);
174
+ __m256 second_longitudes_f32x8 = _mm256_loadu_ps(b_lons);
166
175
 
167
- __m256 distances = nk_haversine_f32x8_haswell_(first_latitudes, first_longitudes, second_latitudes,
168
- second_longitudes);
169
- _mm256_storeu_ps(results, distances);
176
+ __m256 distances_f32x8 = nk_haversine_f32x8_haswell_(first_latitudes_f32x8, first_longitudes_f32x8,
177
+ second_latitudes_f32x8, second_longitudes_f32x8);
178
+ _mm256_storeu_ps(results, distances_f32x8);
170
179
 
171
180
  a_lats += 8, a_lons += 8, b_lats += 8, b_lons += 8, results += 8, n -= 8;
172
181
  }
@@ -178,9 +187,9 @@ NK_PUBLIC void nk_haversine_f32_haswell( //
178
187
  nk_partial_load_b32x8_serial_(a_lons, &a_lon_vec, n);
179
188
  nk_partial_load_b32x8_serial_(b_lats, &b_lat_vec, n);
180
189
  nk_partial_load_b32x8_serial_(b_lons, &b_lon_vec, n);
181
- __m256 distances = nk_haversine_f32x8_haswell_(a_lat_vec.ymm_ps, a_lon_vec.ymm_ps, b_lat_vec.ymm_ps,
182
- b_lon_vec.ymm_ps);
183
- result_vec.ymm_ps = distances;
190
+ __m256 distances_f32x8 = nk_haversine_f32x8_haswell_(a_lat_vec.ymm_ps, a_lon_vec.ymm_ps, b_lat_vec.ymm_ps,
191
+ b_lon_vec.ymm_ps);
192
+ result_vec.ymm_ps = distances_f32x8;
184
193
  nk_partial_store_b32x8_serial_(&result_vec, results, n);
185
194
  }
186
195
  }
@@ -189,165 +198,180 @@ NK_PUBLIC void nk_haversine_f32_haswell( //
189
198
  * @brief AVX2 helper for Vincenty's geodesic distance on 4 f64 point pairs.
190
199
  * @note This is a true SIMD implementation using masked convergence tracking via blending.
191
200
  */
192
- NK_INTERNAL __m256d nk_vincenty_f64x4_haswell_( //
193
- __m256d first_latitudes, __m256d first_longitudes, //
194
- __m256d second_latitudes, __m256d second_longitudes) {
195
-
196
- __m256d const equatorial_radius = _mm256_set1_pd(NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
197
- __m256d const polar_radius = _mm256_set1_pd(NK_EARTH_ELLIPSOID_POLAR_RADIUS);
198
- __m256d const flattening = _mm256_set1_pd(1.0 / NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
199
- __m256d const convergence_threshold = _mm256_set1_pd(NK_VINCENTY_CONVERGENCE_THRESHOLD_F64);
200
- __m256d const one = _mm256_set1_pd(1.0);
201
- __m256d const two = _mm256_set1_pd(2.0);
202
- __m256d const three = _mm256_set1_pd(3.0);
203
- __m256d const four = _mm256_set1_pd(4.0);
204
- __m256d const six = _mm256_set1_pd(6.0);
205
- __m256d const sixteen = _mm256_set1_pd(16.0);
206
- __m256d const epsilon = _mm256_set1_pd(1e-15);
201
+ NK_INTERNAL __m256d nk_vincenty_f64x4_haswell_( //
202
+ __m256d first_latitudes_f64x4, __m256d first_longitudes_f64x4, //
203
+ __m256d second_latitudes_f64x4, __m256d second_longitudes_f64x4) {
204
+
205
+ __m256d const equatorial_radius_f64x4 = _mm256_set1_pd(NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
206
+ __m256d const polar_radius_f64x4 = _mm256_set1_pd(NK_EARTH_ELLIPSOID_POLAR_RADIUS);
207
+ __m256d const flattening_f64x4 = _mm256_set1_pd(1.0 / NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
208
+ __m256d const convergence_threshold_f64x4 = _mm256_set1_pd(NK_VINCENTY_CONVERGENCE_THRESHOLD_F64);
209
+ __m256d const one_f64x4 = _mm256_set1_pd(1.0);
210
+ __m256d const two_f64x4 = _mm256_set1_pd(2.0);
211
+ __m256d const three_f64x4 = _mm256_set1_pd(3.0);
212
+ __m256d const four_f64x4 = _mm256_set1_pd(4.0);
213
+ __m256d const six_f64x4 = _mm256_set1_pd(6.0);
214
+ __m256d const sixteen_f64x4 = _mm256_set1_pd(16.0);
215
+ __m256d const epsilon_f64x4 = _mm256_set1_pd(1e-15);
207
216
 
208
217
  // Longitude difference
209
- __m256d longitude_difference = _mm256_sub_pd(second_longitudes, first_longitudes);
218
+ __m256d longitude_difference_f64x4 = _mm256_sub_pd(second_longitudes_f64x4, first_longitudes_f64x4);
210
219
 
211
220
  // Reduced latitudes: tan(U) = (1-f) * tan(lat)
212
- __m256d one_minus_f = _mm256_sub_pd(one, flattening);
213
- __m256d tan_first = _mm256_div_pd(nk_sin_f64x4_haswell_(first_latitudes), nk_cos_f64x4_haswell_(first_latitudes));
214
- __m256d tan_second = _mm256_div_pd(nk_sin_f64x4_haswell_(second_latitudes),
215
- nk_cos_f64x4_haswell_(second_latitudes));
216
- __m256d tan_reduced_first = _mm256_mul_pd(one_minus_f, tan_first);
217
- __m256d tan_reduced_second = _mm256_mul_pd(one_minus_f, tan_second);
221
+ __m256d one_minus_f_f64x4 = _mm256_sub_pd(one_f64x4, flattening_f64x4);
222
+ __m256d tan_first_f64x4 = _mm256_div_pd(nk_sin_f64x4_haswell_(first_latitudes_f64x4),
223
+ nk_cos_f64x4_haswell_(first_latitudes_f64x4));
224
+ __m256d tan_second_f64x4 = _mm256_div_pd(nk_sin_f64x4_haswell_(second_latitudes_f64x4),
225
+ nk_cos_f64x4_haswell_(second_latitudes_f64x4));
226
+ __m256d tan_reduced_first_f64x4 = _mm256_mul_pd(one_minus_f_f64x4, tan_first_f64x4);
227
+ __m256d tan_reduced_second_f64x4 = _mm256_mul_pd(one_minus_f_f64x4, tan_second_f64x4);
218
228
 
219
229
  // cos(U) = 1/√(1 + tan²(U)), sin(U) = tan(U) × cos(U)
220
- __m256d cos_reduced_first = _mm256_div_pd(
221
- one, _mm256_sqrt_pd(_mm256_fmadd_pd(tan_reduced_first, tan_reduced_first, one)));
222
- __m256d sin_reduced_first = _mm256_mul_pd(tan_reduced_first, cos_reduced_first);
223
- __m256d cos_reduced_second = _mm256_div_pd(
224
- one, _mm256_sqrt_pd(_mm256_fmadd_pd(tan_reduced_second, tan_reduced_second, one)));
225
- __m256d sin_reduced_second = _mm256_mul_pd(tan_reduced_second, cos_reduced_second);
226
-
227
- // Initialize lambda and tracking variables
228
- __m256d lambda = longitude_difference;
229
- __m256d sin_angular_distance, cos_angular_distance, angular_distance;
230
- __m256d sin_azimuth, cos_squared_azimuth, cos_double_angular_midpoint;
230
+ __m256d cos_reduced_first_f64x4 = _mm256_div_pd(
231
+ one_f64x4, _mm256_sqrt_pd(_mm256_fmadd_pd(tan_reduced_first_f64x4, tan_reduced_first_f64x4, one_f64x4)));
232
+ __m256d sin_reduced_first_f64x4 = _mm256_mul_pd(tan_reduced_first_f64x4, cos_reduced_first_f64x4);
233
+ __m256d cos_reduced_second_f64x4 = _mm256_div_pd(
234
+ one_f64x4, _mm256_sqrt_pd(_mm256_fmadd_pd(tan_reduced_second_f64x4, tan_reduced_second_f64x4, one_f64x4)));
235
+ __m256d sin_reduced_second_f64x4 = _mm256_mul_pd(tan_reduced_second_f64x4, cos_reduced_second_f64x4);
236
+
237
+ // Initialize lambda_f64x4 and tracking variables
238
+ __m256d lambda_f64x4 = longitude_difference_f64x4;
239
+ __m256d sin_angular_distance_f64x4, cos_angular_distance_f64x4, angular_distance_f64x4;
240
+ __m256d sin_azimuth_f64x4, cos_squared_azimuth_f64x4, cos_double_angular_midpoint_f64x4;
231
241
 
232
242
  // Track convergence and coincident points using masks
233
- __m256d converged_mask = _mm256_setzero_pd();
234
- __m256d coincident_mask = _mm256_setzero_pd();
243
+ __m256d converged_mask_f64x4 = _mm256_setzero_pd();
244
+ __m256d coincident_mask_f64x4 = _mm256_setzero_pd();
235
245
 
236
246
  for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS; ++iteration) {
237
247
  // Check if all lanes converged
238
- int converged_bits = _mm256_movemask_pd(converged_mask);
248
+ int converged_bits = _mm256_movemask_pd(converged_mask_f64x4);
239
249
  if (converged_bits == 0xF) break;
240
250
 
241
- __m256d sin_lambda = nk_sin_f64x4_haswell_(lambda);
242
- __m256d cos_lambda = nk_cos_f64x4_haswell_(lambda);
251
+ __m256d sin_lambda_f64x4 = nk_sin_f64x4_haswell_(lambda_f64x4);
252
+ __m256d cos_lambda_f64x4 = nk_cos_f64x4_haswell_(lambda_f64x4);
243
253
 
244
- // sin²(angular_distance) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
245
- __m256d cross_term = _mm256_mul_pd(cos_reduced_second, sin_lambda);
246
- __m256d mixed_term = _mm256_sub_pd(
247
- _mm256_mul_pd(cos_reduced_first, sin_reduced_second),
248
- _mm256_mul_pd(_mm256_mul_pd(sin_reduced_first, cos_reduced_second), cos_lambda));
249
- __m256d sin_angular_dist_sq = _mm256_fmadd_pd(cross_term, cross_term, _mm256_mul_pd(mixed_term, mixed_term));
250
- sin_angular_distance = _mm256_sqrt_pd(sin_angular_dist_sq);
254
+ // sin²(angular_distance_f64x4) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
255
+ __m256d cross_term_f64x4 = _mm256_mul_pd(cos_reduced_second_f64x4, sin_lambda_f64x4);
256
+ __m256d mixed_term_f64x4 = _mm256_sub_pd(
257
+ _mm256_mul_pd(cos_reduced_first_f64x4, sin_reduced_second_f64x4),
258
+ _mm256_mul_pd(_mm256_mul_pd(sin_reduced_first_f64x4, cos_reduced_second_f64x4), cos_lambda_f64x4));
259
+ __m256d sin_angular_dist_sq_f64x4 = _mm256_fmadd_pd(cross_term_f64x4, cross_term_f64x4,
260
+ _mm256_mul_pd(mixed_term_f64x4, mixed_term_f64x4));
261
+ sin_angular_distance_f64x4 = _mm256_sqrt_pd(sin_angular_dist_sq_f64x4);
251
262
 
252
- // Check for coincident points (sin_angular_distance ≈ 0)
253
- coincident_mask = _mm256_cmp_pd(sin_angular_distance, epsilon, _CMP_LT_OS);
263
+ // Check for coincident points (sin_angular_distance_f64x4 ≈ 0)
264
+ coincident_mask_f64x4 = _mm256_cmp_pd(sin_angular_distance_f64x4, epsilon_f64x4, _CMP_LT_OS);
254
265
 
255
- // cos(angular_distance) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
256
- cos_angular_distance = _mm256_fmadd_pd(_mm256_mul_pd(cos_reduced_first, cos_reduced_second), cos_lambda,
257
- _mm256_mul_pd(sin_reduced_first, sin_reduced_second));
266
+ // cos(angular_distance_f64x4) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
267
+ cos_angular_distance_f64x4 = _mm256_fmadd_pd(_mm256_mul_pd(cos_reduced_first_f64x4, cos_reduced_second_f64x4),
268
+ cos_lambda_f64x4,
269
+ _mm256_mul_pd(sin_reduced_first_f64x4, sin_reduced_second_f64x4));
258
270
 
259
- // angular_distance = atan2(sin, cos)
260
- angular_distance = nk_atan2_f64x4_haswell_(sin_angular_distance, cos_angular_distance);
271
+ // angular_distance_f64x4 = atan2(sin, cos)
272
+ angular_distance_f64x4 = nk_atan2_f64x4_haswell_(sin_angular_distance_f64x4, cos_angular_distance_f64x4);
261
273
 
262
- // sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance)
274
+ // sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance_f64x4)
263
275
  // Avoid division by zero by using blending
264
- __m256d safe_sin_angular = _mm256_blendv_pd(sin_angular_distance, one, coincident_mask);
265
- sin_azimuth = _mm256_div_pd(_mm256_mul_pd(_mm256_mul_pd(cos_reduced_first, cos_reduced_second), sin_lambda),
266
- safe_sin_angular);
267
- cos_squared_azimuth = _mm256_sub_pd(one, _mm256_mul_pd(sin_azimuth, sin_azimuth));
276
+ __m256d safe_sin_angular_f64x4 = _mm256_blendv_pd(sin_angular_distance_f64x4, one_f64x4, coincident_mask_f64x4);
277
+ sin_azimuth_f64x4 = _mm256_div_pd(
278
+ _mm256_mul_pd(_mm256_mul_pd(cos_reduced_first_f64x4, cos_reduced_second_f64x4), sin_lambda_f64x4),
279
+ safe_sin_angular_f64x4);
280
+ cos_squared_azimuth_f64x4 = _mm256_sub_pd(one_f64x4, _mm256_mul_pd(sin_azimuth_f64x4, sin_azimuth_f64x4));
268
281
 
269
282
  // Handle equatorial case: cos²α ≈ 0
270
- __m256d equatorial_mask = _mm256_cmp_pd(cos_squared_azimuth, epsilon, _CMP_LT_OS);
271
- __m256d safe_cos_sq_azimuth = _mm256_blendv_pd(cos_squared_azimuth, one, equatorial_mask);
283
+ __m256d equatorial_mask_f64x4 = _mm256_cmp_pd(cos_squared_azimuth_f64x4, epsilon_f64x4, _CMP_LT_OS);
284
+ __m256d safe_cos_sq_azimuth_f64x4 = _mm256_blendv_pd(cos_squared_azimuth_f64x4, one_f64x4,
285
+ equatorial_mask_f64x4);
272
286
 
273
287
  // cos(2σₘ) = cos(σ) - 2 × sin(U₁) × sin(U₂) / cos²(α)
274
- __m256d sin_product = _mm256_mul_pd(sin_reduced_first, sin_reduced_second);
275
- cos_double_angular_midpoint = _mm256_sub_pd(
276
- cos_angular_distance, _mm256_div_pd(_mm256_mul_pd(two, sin_product), safe_cos_sq_azimuth));
277
- cos_double_angular_midpoint = _mm256_blendv_pd(cos_double_angular_midpoint, _mm256_setzero_pd(),
278
- equatorial_mask);
288
+ __m256d sin_product_f64x4 = _mm256_mul_pd(sin_reduced_first_f64x4, sin_reduced_second_f64x4);
289
+ cos_double_angular_midpoint_f64x4 = _mm256_sub_pd(
290
+ cos_angular_distance_f64x4,
291
+ _mm256_div_pd(_mm256_mul_pd(two_f64x4, sin_product_f64x4), safe_cos_sq_azimuth_f64x4));
292
+ cos_double_angular_midpoint_f64x4 = _mm256_blendv_pd(cos_double_angular_midpoint_f64x4, _mm256_setzero_pd(),
293
+ equatorial_mask_f64x4);
279
294
 
280
295
  // C = f/16 * cos²α * (4 + f*(4 - 3*cos²α))
281
- __m256d correction_factor = _mm256_mul_pd(
282
- _mm256_div_pd(flattening, sixteen),
283
- _mm256_mul_pd(cos_squared_azimuth,
284
- _mm256_fmadd_pd(flattening, _mm256_fnmadd_pd(three, cos_squared_azimuth, four), four)));
296
+ __m256d correction_factor_f64x4 = _mm256_mul_pd(
297
+ _mm256_div_pd(flattening_f64x4, sixteen_f64x4),
298
+ _mm256_mul_pd(
299
+ cos_squared_azimuth_f64x4,
300
+ _mm256_fmadd_pd(flattening_f64x4, _mm256_fnmadd_pd(three_f64x4, cos_squared_azimuth_f64x4, four_f64x4),
301
+ four_f64x4)));
285
302
 
286
303
  // λ' = L + (1-C) × f × sin(α) × (σ + C × sin(σ) × (cos(2σₘ) + C × cos(σ) × (-1 + 2 × cos²(2σₘ))))
287
- __m256d cos_2sm_sq = _mm256_mul_pd(cos_double_angular_midpoint, cos_double_angular_midpoint);
288
- // innermost = -1 + 2 × cos²(2σₘ)
289
- __m256d innermost = _mm256_fmadd_pd(two, cos_2sm_sq, _mm256_set1_pd(-1.0));
290
- // middle = cos(2σₘ) + C × cos(σ) × innermost
291
- __m256d middle = _mm256_fmadd_pd(_mm256_mul_pd(correction_factor, cos_angular_distance), innermost,
292
- cos_double_angular_midpoint);
293
- // inner = C × sin(σ) × middle
294
- __m256d inner = _mm256_mul_pd(_mm256_mul_pd(correction_factor, sin_angular_distance), middle);
295
-
296
- // λ' = L + (1-C) * f * sin_α * (σ + inner)
297
- __m256d lambda_new = _mm256_fmadd_pd(
298
- _mm256_mul_pd(_mm256_mul_pd(_mm256_sub_pd(one, correction_factor), flattening), sin_azimuth),
299
- _mm256_add_pd(angular_distance, inner), longitude_difference);
304
+ __m256d cos_2sm_sq_f64x4 = _mm256_mul_pd(cos_double_angular_midpoint_f64x4, cos_double_angular_midpoint_f64x4);
305
+ // innermost_f64x4 = -1 + 2 × cos²(2σₘ)
306
+ __m256d innermost_f64x4 = _mm256_fmadd_pd(two_f64x4, cos_2sm_sq_f64x4, _mm256_set1_pd(-1.0));
307
+ // middle_f64x4 = cos(2σₘ) + C × cos(σ) × innermost_f64x4
308
+ __m256d middle_f64x4 = _mm256_fmadd_pd(_mm256_mul_pd(correction_factor_f64x4, cos_angular_distance_f64x4),
309
+ innermost_f64x4, cos_double_angular_midpoint_f64x4);
310
+ // inner_f64x4 = C × sin(σ) × middle_f64x4
311
+ __m256d inner_f64x4 = _mm256_mul_pd(_mm256_mul_pd(correction_factor_f64x4, sin_angular_distance_f64x4),
312
+ middle_f64x4);
313
+
314
+ // λ' = L + (1-C) * f * sin_α * (σ + inner_f64x4)
315
+ __m256d lambda_new_f64x4 = _mm256_fmadd_pd(
316
+ _mm256_mul_pd(_mm256_mul_pd(_mm256_sub_pd(one_f64x4, correction_factor_f64x4), flattening_f64x4),
317
+ sin_azimuth_f64x4),
318
+ _mm256_add_pd(angular_distance_f64x4, inner_f64x4), longitude_difference_f64x4);
300
319
 
301
320
  // Check convergence: |λ - λ'| < threshold
302
- __m256d lambda_diff_abs = _mm256_andnot_pd(_mm256_set1_pd(-0.0), _mm256_sub_pd(lambda_new, lambda));
303
- __m256d newly_converged = _mm256_cmp_pd(lambda_diff_abs, convergence_threshold, _CMP_LT_OS);
304
- converged_mask = _mm256_or_pd(converged_mask, newly_converged);
321
+ __m256d lambda_diff_abs_f64x4 = _mm256_andnot_pd(_mm256_set1_pd(-0.0),
322
+ _mm256_sub_pd(lambda_new_f64x4, lambda_f64x4));
323
+ __m256d newly_converged_f64x4 = _mm256_cmp_pd(lambda_diff_abs_f64x4, convergence_threshold_f64x4, _CMP_LT_OS);
324
+ converged_mask_f64x4 = _mm256_or_pd(converged_mask_f64x4, newly_converged_f64x4);
305
325
 
306
- // Only update lambda for non-converged lanes
307
- lambda = _mm256_blendv_pd(lambda_new, lambda, converged_mask);
326
+ // Only update lambda_f64x4 for non-converged lanes
327
+ lambda_f64x4 = _mm256_blendv_pd(lambda_new_f64x4, lambda_f64x4, converged_mask_f64x4);
308
328
  }
309
329
 
310
330
  // Final distance calculation
311
331
  // u² = cos²α * (a² - b²) / b²
312
- __m256d a_sq = _mm256_mul_pd(equatorial_radius, equatorial_radius);
313
- __m256d b_sq = _mm256_mul_pd(polar_radius, polar_radius);
314
- __m256d u_squared = _mm256_div_pd(_mm256_mul_pd(cos_squared_azimuth, _mm256_sub_pd(a_sq, b_sq)), b_sq);
332
+ __m256d a_sq_f64x4 = _mm256_mul_pd(equatorial_radius_f64x4, equatorial_radius_f64x4);
333
+ __m256d b_sq_f64x4 = _mm256_mul_pd(polar_radius_f64x4, polar_radius_f64x4);
334
+ __m256d u_squared_f64x4 = _mm256_div_pd(
335
+ _mm256_mul_pd(cos_squared_azimuth_f64x4, _mm256_sub_pd(a_sq_f64x4, b_sq_f64x4)), b_sq_f64x4);
315
336
 
316
337
  // A = 1 + u²/16384 * (4096 + u²*(-768 + u²*(320 - 175*u²)))
317
- __m256d series_a = _mm256_fmadd_pd(u_squared, _mm256_set1_pd(-175.0), _mm256_set1_pd(320.0));
318
- series_a = _mm256_fmadd_pd(u_squared, series_a, _mm256_set1_pd(-768.0));
319
- series_a = _mm256_fmadd_pd(u_squared, series_a, _mm256_set1_pd(4096.0));
320
- series_a = _mm256_fmadd_pd(_mm256_div_pd(u_squared, _mm256_set1_pd(16384.0)), series_a, one);
338
+ __m256d series_a_f64x4 = _mm256_fmadd_pd(u_squared_f64x4, _mm256_set1_pd(-175.0), _mm256_set1_pd(320.0));
339
+ series_a_f64x4 = _mm256_fmadd_pd(u_squared_f64x4, series_a_f64x4, _mm256_set1_pd(-768.0));
340
+ series_a_f64x4 = _mm256_fmadd_pd(u_squared_f64x4, series_a_f64x4, _mm256_set1_pd(4096.0));
341
+ series_a_f64x4 = _mm256_fmadd_pd(_mm256_div_pd(u_squared_f64x4, _mm256_set1_pd(16384.0)), series_a_f64x4,
342
+ one_f64x4);
321
343
 
322
344
  // B = u²/1024 * (256 + u²*(-128 + u²*(74 - 47*u²)))
323
- __m256d series_b = _mm256_fmadd_pd(u_squared, _mm256_set1_pd(-47.0), _mm256_set1_pd(74.0));
324
- series_b = _mm256_fmadd_pd(u_squared, series_b, _mm256_set1_pd(-128.0));
325
- series_b = _mm256_fmadd_pd(u_squared, series_b, _mm256_set1_pd(256.0));
326
- series_b = _mm256_mul_pd(_mm256_div_pd(u_squared, _mm256_set1_pd(1024.0)), series_b);
345
+ __m256d series_b_f64x4 = _mm256_fmadd_pd(u_squared_f64x4, _mm256_set1_pd(-47.0), _mm256_set1_pd(74.0));
346
+ series_b_f64x4 = _mm256_fmadd_pd(u_squared_f64x4, series_b_f64x4, _mm256_set1_pd(-128.0));
347
+ series_b_f64x4 = _mm256_fmadd_pd(u_squared_f64x4, series_b_f64x4, _mm256_set1_pd(256.0));
348
+ series_b_f64x4 = _mm256_mul_pd(_mm256_div_pd(u_squared_f64x4, _mm256_set1_pd(1024.0)), series_b_f64x4);
327
349
 
328
350
  // Δσ = B × sin(σ) × (cos(2σₘ) +
329
351
  // B/4 × (cos(σ) × (-1 + 2 × cos²(2σₘ)) - B/6 × cos(2σₘ) × (-3 + 4 × sin²(σ)) × (-3 + 4 × cos²(2σₘ))))
330
- __m256d cos_2sm_sq = _mm256_mul_pd(cos_double_angular_midpoint, cos_double_angular_midpoint);
331
- __m256d sin_sq = _mm256_mul_pd(sin_angular_distance, sin_angular_distance);
332
- __m256d term1 = _mm256_fmadd_pd(two, cos_2sm_sq, _mm256_set1_pd(-1.0));
333
- term1 = _mm256_mul_pd(cos_angular_distance, term1);
334
- __m256d term2 = _mm256_fmadd_pd(four, sin_sq, _mm256_set1_pd(-3.0));
335
- __m256d term3 = _mm256_fmadd_pd(four, cos_2sm_sq, _mm256_set1_pd(-3.0));
336
- term2 = _mm256_mul_pd(_mm256_mul_pd(_mm256_div_pd(series_b, six), cos_double_angular_midpoint),
337
- _mm256_mul_pd(term2, term3));
338
- __m256d delta_sigma = _mm256_mul_pd(
339
- series_b, _mm256_mul_pd(sin_angular_distance, _mm256_add_pd(cos_double_angular_midpoint,
340
- _mm256_mul_pd(_mm256_div_pd(series_b, four),
341
- _mm256_sub_pd(term1, term2)))));
352
+ __m256d cos_2sm_sq_f64x4 = _mm256_mul_pd(cos_double_angular_midpoint_f64x4, cos_double_angular_midpoint_f64x4);
353
+ __m256d sin_sq_f64x4 = _mm256_mul_pd(sin_angular_distance_f64x4, sin_angular_distance_f64x4);
354
+ __m256d term1_f64x4 = _mm256_fmadd_pd(two_f64x4, cos_2sm_sq_f64x4, _mm256_set1_pd(-1.0));
355
+ term1_f64x4 = _mm256_mul_pd(cos_angular_distance_f64x4, term1_f64x4);
356
+ __m256d term2_f64x4 = _mm256_fmadd_pd(four_f64x4, sin_sq_f64x4, _mm256_set1_pd(-3.0));
357
+ __m256d term3_f64x4 = _mm256_fmadd_pd(four_f64x4, cos_2sm_sq_f64x4, _mm256_set1_pd(-3.0));
358
+ term2_f64x4 = _mm256_mul_pd(
359
+ _mm256_mul_pd(_mm256_div_pd(series_b_f64x4, six_f64x4), cos_double_angular_midpoint_f64x4),
360
+ _mm256_mul_pd(term2_f64x4, term3_f64x4));
361
+ __m256d delta_sigma_f64x4 = _mm256_mul_pd(
362
+ series_b_f64x4, _mm256_mul_pd(sin_angular_distance_f64x4,
363
+ _mm256_add_pd(cos_double_angular_midpoint_f64x4,
364
+ _mm256_mul_pd(_mm256_div_pd(series_b_f64x4, four_f64x4),
365
+ _mm256_sub_pd(term1_f64x4, term2_f64x4)))));
342
366
 
343
367
  // s = b * A * (σ - Δσ)
344
- __m256d distances = _mm256_mul_pd(_mm256_mul_pd(polar_radius, series_a),
345
- _mm256_sub_pd(angular_distance, delta_sigma));
368
+ __m256d distances_f64x4 = _mm256_mul_pd(_mm256_mul_pd(polar_radius_f64x4, series_a_f64x4),
369
+ _mm256_sub_pd(angular_distance_f64x4, delta_sigma_f64x4));
346
370
 
347
371
  // Set coincident points to zero
348
- distances = _mm256_blendv_pd(distances, _mm256_setzero_pd(), coincident_mask);
372
+ distances_f64x4 = _mm256_blendv_pd(distances_f64x4, _mm256_setzero_pd(), coincident_mask_f64x4);
349
373
 
350
- return distances;
374
+ return distances_f64x4;
351
375
  }
352
376
 
353
377
  NK_PUBLIC void nk_vincenty_f64_haswell( //
@@ -356,14 +380,14 @@ NK_PUBLIC void nk_vincenty_f64_haswell( //
356
380
  nk_size_t n, nk_f64_t *results) {
357
381
 
358
382
  while (n >= 4) {
359
- __m256d first_latitudes = _mm256_loadu_pd(a_lats);
360
- __m256d first_longitudes = _mm256_loadu_pd(a_lons);
361
- __m256d second_latitudes = _mm256_loadu_pd(b_lats);
362
- __m256d second_longitudes = _mm256_loadu_pd(b_lons);
383
+ __m256d first_latitudes_f64x4 = _mm256_loadu_pd(a_lats);
384
+ __m256d first_longitudes_f64x4 = _mm256_loadu_pd(a_lons);
385
+ __m256d second_latitudes_f64x4 = _mm256_loadu_pd(b_lats);
386
+ __m256d second_longitudes_f64x4 = _mm256_loadu_pd(b_lons);
363
387
 
364
- __m256d distances = nk_vincenty_f64x4_haswell_(first_latitudes, first_longitudes, second_latitudes,
365
- second_longitudes);
366
- _mm256_storeu_pd(results, distances);
388
+ __m256d distances_f64x4 = nk_vincenty_f64x4_haswell_(first_latitudes_f64x4, first_longitudes_f64x4,
389
+ second_latitudes_f64x4, second_longitudes_f64x4);
390
+ _mm256_storeu_pd(results, distances_f64x4);
367
391
 
368
392
  a_lats += 4, a_lons += 4, b_lats += 4, b_lons += 4, results += 4, n -= 4;
369
393
  }
@@ -375,9 +399,9 @@ NK_PUBLIC void nk_vincenty_f64_haswell( //
375
399
  nk_partial_load_b64x4_haswell_(a_lons, &a_lon_vec, n);
376
400
  nk_partial_load_b64x4_haswell_(b_lats, &b_lat_vec, n);
377
401
  nk_partial_load_b64x4_haswell_(b_lons, &b_lon_vec, n);
378
- __m256d distances = nk_vincenty_f64x4_haswell_(a_lat_vec.ymm_pd, a_lon_vec.ymm_pd, b_lat_vec.ymm_pd,
379
- b_lon_vec.ymm_pd);
380
- result_vec.ymm_pd = distances;
402
+ __m256d distances_f64x4 = nk_vincenty_f64x4_haswell_(a_lat_vec.ymm_pd, a_lon_vec.ymm_pd, b_lat_vec.ymm_pd,
403
+ b_lon_vec.ymm_pd);
404
+ result_vec.ymm_pd = distances_f64x4;
381
405
  nk_partial_store_b64x4_haswell_(&result_vec, results, n);
382
406
  }
383
407
  }
@@ -386,164 +410,180 @@ NK_PUBLIC void nk_vincenty_f64_haswell( //
386
410
  * @brief AVX2 helper for Vincenty's geodesic distance on 8 f32 point pairs.
387
411
  * @note This is a true SIMD implementation using masked convergence tracking via blending.
388
412
  */
389
- NK_INTERNAL __m256 nk_vincenty_f32x8_haswell_( //
390
- __m256 first_latitudes, __m256 first_longitudes, //
391
- __m256 second_latitudes, __m256 second_longitudes) {
392
-
393
- __m256 const equatorial_radius = _mm256_set1_ps((float)NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
394
- __m256 const polar_radius = _mm256_set1_ps((float)NK_EARTH_ELLIPSOID_POLAR_RADIUS);
395
- __m256 const flattening = _mm256_set1_ps(1.0f / (float)NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
396
- __m256 const convergence_threshold = _mm256_set1_ps(NK_VINCENTY_CONVERGENCE_THRESHOLD_F32);
397
- __m256 const one = _mm256_set1_ps(1.0f);
398
- __m256 const two = _mm256_set1_ps(2.0f);
399
- __m256 const three = _mm256_set1_ps(3.0f);
400
- __m256 const four = _mm256_set1_ps(4.0f);
401
- __m256 const six = _mm256_set1_ps(6.0f);
402
- __m256 const sixteen = _mm256_set1_ps(16.0f);
403
- __m256 const epsilon = _mm256_set1_ps(1e-7f);
413
+ NK_INTERNAL __m256 nk_vincenty_f32x8_haswell_( //
414
+ __m256 first_latitudes_f32x8, __m256 first_longitudes_f32x8, //
415
+ __m256 second_latitudes_f32x8, __m256 second_longitudes_f32x8) {
416
+
417
+ __m256 const equatorial_radius_f32x8 = _mm256_set1_ps((float)NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
418
+ __m256 const polar_radius_f32x8 = _mm256_set1_ps((float)NK_EARTH_ELLIPSOID_POLAR_RADIUS);
419
+ __m256 const flattening_f32x8 = _mm256_set1_ps(1.0f / (float)NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
420
+ __m256 const convergence_threshold_f32x8 = _mm256_set1_ps(NK_VINCENTY_CONVERGENCE_THRESHOLD_F32);
421
+ __m256 const one_f32x8 = _mm256_set1_ps(1.0f);
422
+ __m256 const two_f32x8 = _mm256_set1_ps(2.0f);
423
+ __m256 const three_f32x8 = _mm256_set1_ps(3.0f);
424
+ __m256 const four_f32x8 = _mm256_set1_ps(4.0f);
425
+ __m256 const six_f32x8 = _mm256_set1_ps(6.0f);
426
+ __m256 const sixteen_f32x8 = _mm256_set1_ps(16.0f);
427
+ __m256 const epsilon_f32x8 = _mm256_set1_ps(1e-7f);
404
428
 
405
429
  // Longitude difference
406
- __m256 longitude_difference = _mm256_sub_ps(second_longitudes, first_longitudes);
430
+ __m256 longitude_difference_f32x8 = _mm256_sub_ps(second_longitudes_f32x8, first_longitudes_f32x8);
407
431
 
408
432
  // Reduced latitudes: tan(U) = (1-f) * tan(lat)
409
- __m256 one_minus_f = _mm256_sub_ps(one, flattening);
410
- __m256 tan_first = _mm256_div_ps(nk_sin_f32x8_haswell_(first_latitudes), nk_cos_f32x8_haswell_(first_latitudes));
411
- __m256 tan_second = _mm256_div_ps(nk_sin_f32x8_haswell_(second_latitudes), nk_cos_f32x8_haswell_(second_latitudes));
412
- __m256 tan_reduced_first = _mm256_mul_ps(one_minus_f, tan_first);
413
- __m256 tan_reduced_second = _mm256_mul_ps(one_minus_f, tan_second);
433
+ __m256 one_minus_f_f32x8 = _mm256_sub_ps(one_f32x8, flattening_f32x8);
434
+ __m256 tan_first_f32x8 = _mm256_div_ps(nk_sin_f32x8_haswell_(first_latitudes_f32x8),
435
+ nk_cos_f32x8_haswell_(first_latitudes_f32x8));
436
+ __m256 tan_second_f32x8 = _mm256_div_ps(nk_sin_f32x8_haswell_(second_latitudes_f32x8),
437
+ nk_cos_f32x8_haswell_(second_latitudes_f32x8));
438
+ __m256 tan_reduced_first_f32x8 = _mm256_mul_ps(one_minus_f_f32x8, tan_first_f32x8);
439
+ __m256 tan_reduced_second_f32x8 = _mm256_mul_ps(one_minus_f_f32x8, tan_second_f32x8);
414
440
 
415
441
  // cos(U) = 1/√(1 + tan²(U)), sin(U) = tan(U) × cos(U)
416
- __m256 cos_reduced_first = _mm256_div_ps(
417
- one, _mm256_sqrt_ps(_mm256_fmadd_ps(tan_reduced_first, tan_reduced_first, one)));
418
- __m256 sin_reduced_first = _mm256_mul_ps(tan_reduced_first, cos_reduced_first);
419
- __m256 cos_reduced_second = _mm256_div_ps(
420
- one, _mm256_sqrt_ps(_mm256_fmadd_ps(tan_reduced_second, tan_reduced_second, one)));
421
- __m256 sin_reduced_second = _mm256_mul_ps(tan_reduced_second, cos_reduced_second);
422
-
423
- // Initialize lambda and tracking variables
424
- __m256 lambda = longitude_difference;
425
- __m256 sin_angular_distance, cos_angular_distance, angular_distance;
426
- __m256 sin_azimuth, cos_squared_azimuth, cos_double_angular_midpoint;
442
+ __m256 cos_reduced_first_f32x8 = _mm256_div_ps(
443
+ one_f32x8, _mm256_sqrt_ps(_mm256_fmadd_ps(tan_reduced_first_f32x8, tan_reduced_first_f32x8, one_f32x8)));
444
+ __m256 sin_reduced_first_f32x8 = _mm256_mul_ps(tan_reduced_first_f32x8, cos_reduced_first_f32x8);
445
+ __m256 cos_reduced_second_f32x8 = _mm256_div_ps(
446
+ one_f32x8, _mm256_sqrt_ps(_mm256_fmadd_ps(tan_reduced_second_f32x8, tan_reduced_second_f32x8, one_f32x8)));
447
+ __m256 sin_reduced_second_f32x8 = _mm256_mul_ps(tan_reduced_second_f32x8, cos_reduced_second_f32x8);
448
+
449
+ // Initialize lambda_f32x8 and tracking variables
450
+ __m256 lambda_f32x8 = longitude_difference_f32x8;
451
+ __m256 sin_angular_distance_f32x8, cos_angular_distance_f32x8, angular_distance_f32x8;
452
+ __m256 sin_azimuth_f32x8, cos_squared_azimuth_f32x8, cos_double_angular_midpoint_f32x8;
427
453
 
428
454
  // Track convergence and coincident points using masks
429
- __m256 converged_mask = _mm256_setzero_ps();
430
- __m256 coincident_mask = _mm256_setzero_ps();
455
+ __m256 converged_mask_f32x8 = _mm256_setzero_ps();
456
+ __m256 coincident_mask_f32x8 = _mm256_setzero_ps();
431
457
 
432
458
  for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS; ++iteration) {
433
459
  // Check if all lanes converged
434
- int converged_bits = _mm256_movemask_ps(converged_mask);
460
+ int converged_bits = _mm256_movemask_ps(converged_mask_f32x8);
435
461
  if (converged_bits == 0xFF) break;
436
462
 
437
- __m256 sin_lambda = nk_sin_f32x8_haswell_(lambda);
438
- __m256 cos_lambda = nk_cos_f32x8_haswell_(lambda);
463
+ __m256 sin_lambda_f32x8 = nk_sin_f32x8_haswell_(lambda_f32x8);
464
+ __m256 cos_lambda_f32x8 = nk_cos_f32x8_haswell_(lambda_f32x8);
439
465
 
440
- // sin²(angular_distance) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
441
- __m256 cross_term = _mm256_mul_ps(cos_reduced_second, sin_lambda);
442
- __m256 mixed_term = _mm256_sub_ps(
443
- _mm256_mul_ps(cos_reduced_first, sin_reduced_second),
444
- _mm256_mul_ps(_mm256_mul_ps(sin_reduced_first, cos_reduced_second), cos_lambda));
445
- __m256 sin_angular_dist_sq = _mm256_fmadd_ps(cross_term, cross_term, _mm256_mul_ps(mixed_term, mixed_term));
446
- sin_angular_distance = _mm256_sqrt_ps(sin_angular_dist_sq);
466
+ // sin²(angular_distance_f32x8) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
467
+ __m256 cross_term_f32x8 = _mm256_mul_ps(cos_reduced_second_f32x8, sin_lambda_f32x8);
468
+ __m256 mixed_term_f32x8 = _mm256_sub_ps(
469
+ _mm256_mul_ps(cos_reduced_first_f32x8, sin_reduced_second_f32x8),
470
+ _mm256_mul_ps(_mm256_mul_ps(sin_reduced_first_f32x8, cos_reduced_second_f32x8), cos_lambda_f32x8));
471
+ __m256 sin_angular_dist_sq_f32x8 = _mm256_fmadd_ps(cross_term_f32x8, cross_term_f32x8,
472
+ _mm256_mul_ps(mixed_term_f32x8, mixed_term_f32x8));
473
+ sin_angular_distance_f32x8 = _mm256_sqrt_ps(sin_angular_dist_sq_f32x8);
447
474
 
448
- // Check for coincident points (sin_angular_distance ≈ 0)
449
- coincident_mask = _mm256_cmp_ps(sin_angular_distance, epsilon, _CMP_LT_OS);
475
+ // Check for coincident points (sin_angular_distance_f32x8 ≈ 0)
476
+ coincident_mask_f32x8 = _mm256_cmp_ps(sin_angular_distance_f32x8, epsilon_f32x8, _CMP_LT_OS);
450
477
 
451
- // cos(angular_distance) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
452
- cos_angular_distance = _mm256_fmadd_ps(_mm256_mul_ps(cos_reduced_first, cos_reduced_second), cos_lambda,
453
- _mm256_mul_ps(sin_reduced_first, sin_reduced_second));
478
+ // cos(angular_distance_f32x8) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
479
+ cos_angular_distance_f32x8 = _mm256_fmadd_ps(_mm256_mul_ps(cos_reduced_first_f32x8, cos_reduced_second_f32x8),
480
+ cos_lambda_f32x8,
481
+ _mm256_mul_ps(sin_reduced_first_f32x8, sin_reduced_second_f32x8));
454
482
 
455
- // angular_distance = atan2(sin, cos)
456
- angular_distance = nk_atan2_f32x8_haswell_(sin_angular_distance, cos_angular_distance);
483
+ // angular_distance_f32x8 = atan2(sin, cos)
484
+ angular_distance_f32x8 = nk_atan2_f32x8_haswell_(sin_angular_distance_f32x8, cos_angular_distance_f32x8);
457
485
 
458
- // sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance)
486
+ // sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance_f32x8)
459
487
  // Avoid division by zero by using blending
460
- __m256 safe_sin_angular = _mm256_blendv_ps(sin_angular_distance, one, coincident_mask);
461
- sin_azimuth = _mm256_div_ps(_mm256_mul_ps(_mm256_mul_ps(cos_reduced_first, cos_reduced_second), sin_lambda),
462
- safe_sin_angular);
463
- cos_squared_azimuth = _mm256_sub_ps(one, _mm256_mul_ps(sin_azimuth, sin_azimuth));
488
+ __m256 safe_sin_angular_f32x8 = _mm256_blendv_ps(sin_angular_distance_f32x8, one_f32x8, coincident_mask_f32x8);
489
+ sin_azimuth_f32x8 = _mm256_div_ps(
490
+ _mm256_mul_ps(_mm256_mul_ps(cos_reduced_first_f32x8, cos_reduced_second_f32x8), sin_lambda_f32x8),
491
+ safe_sin_angular_f32x8);
492
+ cos_squared_azimuth_f32x8 = _mm256_sub_ps(one_f32x8, _mm256_mul_ps(sin_azimuth_f32x8, sin_azimuth_f32x8));
464
493
 
465
494
  // Handle equatorial case: cos²α ≈ 0
466
- __m256 equatorial_mask = _mm256_cmp_ps(cos_squared_azimuth, epsilon, _CMP_LT_OS);
467
- __m256 safe_cos_sq_azimuth = _mm256_blendv_ps(cos_squared_azimuth, one, equatorial_mask);
495
+ __m256 equatorial_mask_f32x8 = _mm256_cmp_ps(cos_squared_azimuth_f32x8, epsilon_f32x8, _CMP_LT_OS);
496
+ __m256 safe_cos_sq_azimuth_f32x8 = _mm256_blendv_ps(cos_squared_azimuth_f32x8, one_f32x8,
497
+ equatorial_mask_f32x8);
468
498
 
469
499
  // cos(2σₘ) = cos(σ) - 2 × sin(U₁) × sin(U₂) / cos²(α)
470
- __m256 sin_product = _mm256_mul_ps(sin_reduced_first, sin_reduced_second);
471
- cos_double_angular_midpoint = _mm256_sub_ps(
472
- cos_angular_distance, _mm256_div_ps(_mm256_mul_ps(two, sin_product), safe_cos_sq_azimuth));
473
- cos_double_angular_midpoint = _mm256_blendv_ps(cos_double_angular_midpoint, _mm256_setzero_ps(),
474
- equatorial_mask);
500
+ __m256 sin_product_f32x8 = _mm256_mul_ps(sin_reduced_first_f32x8, sin_reduced_second_f32x8);
501
+ cos_double_angular_midpoint_f32x8 = _mm256_sub_ps(
502
+ cos_angular_distance_f32x8,
503
+ _mm256_div_ps(_mm256_mul_ps(two_f32x8, sin_product_f32x8), safe_cos_sq_azimuth_f32x8));
504
+ cos_double_angular_midpoint_f32x8 = _mm256_blendv_ps(cos_double_angular_midpoint_f32x8, _mm256_setzero_ps(),
505
+ equatorial_mask_f32x8);
475
506
 
476
507
  // C = f/16 * cos²α * (4 + f*(4 - 3*cos²α))
477
- __m256 correction_factor = _mm256_mul_ps(
478
- _mm256_div_ps(flattening, sixteen),
479
- _mm256_mul_ps(cos_squared_azimuth,
480
- _mm256_fmadd_ps(flattening, _mm256_fnmadd_ps(three, cos_squared_azimuth, four), four)));
508
+ __m256 correction_factor_f32x8 = _mm256_mul_ps(
509
+ _mm256_div_ps(flattening_f32x8, sixteen_f32x8),
510
+ _mm256_mul_ps(
511
+ cos_squared_azimuth_f32x8,
512
+ _mm256_fmadd_ps(flattening_f32x8, _mm256_fnmadd_ps(three_f32x8, cos_squared_azimuth_f32x8, four_f32x8),
513
+ four_f32x8)));
481
514
 
482
515
  // λ' = L + (1-C) × f × sin(α) × (σ + C × sin(σ) × (cos(2σₘ) + C × cos(σ) × (-1 + 2 × cos²(2σₘ))))
483
- __m256 cos_2sm_sq = _mm256_mul_ps(cos_double_angular_midpoint, cos_double_angular_midpoint);
484
- // innermost = -1 + 2 × cos²(2σₘ)
485
- __m256 innermost = _mm256_fmadd_ps(two, cos_2sm_sq, _mm256_set1_ps(-1.0f));
486
- // middle = cos(2σₘ) + C × cos(σ) × innermost
487
- __m256 middle = _mm256_fmadd_ps(_mm256_mul_ps(correction_factor, cos_angular_distance), innermost,
488
- cos_double_angular_midpoint);
489
- // inner = C × sin(σ) × middle
490
- __m256 inner = _mm256_mul_ps(_mm256_mul_ps(correction_factor, sin_angular_distance), middle);
491
-
492
- // λ' = L + (1-C) * f * sin_α * (σ + inner)
493
- __m256 lambda_new = _mm256_fmadd_ps(
494
- _mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(one, correction_factor), flattening), sin_azimuth),
495
- _mm256_add_ps(angular_distance, inner), longitude_difference);
516
+ __m256 cos_2sm_sq_f32x8 = _mm256_mul_ps(cos_double_angular_midpoint_f32x8, cos_double_angular_midpoint_f32x8);
517
+ // innermost_f32x8 = -1 + 2 × cos²(2σₘ)
518
+ __m256 innermost_f32x8 = _mm256_fmadd_ps(two_f32x8, cos_2sm_sq_f32x8, _mm256_set1_ps(-1.0f));
519
+ // middle_f32x8 = cos(2σₘ) + C × cos(σ) × innermost_f32x8
520
+ __m256 middle_f32x8 = _mm256_fmadd_ps(_mm256_mul_ps(correction_factor_f32x8, cos_angular_distance_f32x8),
521
+ innermost_f32x8, cos_double_angular_midpoint_f32x8);
522
+ // inner_f32x8 = C × sin(σ) × middle_f32x8
523
+ __m256 inner_f32x8 = _mm256_mul_ps(_mm256_mul_ps(correction_factor_f32x8, sin_angular_distance_f32x8),
524
+ middle_f32x8);
525
+
526
+ // λ' = L + (1-C) * f * sin_α * (σ + inner_f32x8)
527
+ __m256 lambda_new_f32x8 = _mm256_fmadd_ps(
528
+ _mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(one_f32x8, correction_factor_f32x8), flattening_f32x8),
529
+ sin_azimuth_f32x8),
530
+ _mm256_add_ps(angular_distance_f32x8, inner_f32x8), longitude_difference_f32x8);
496
531
 
497
532
  // Check convergence: |λ - λ'| < threshold
498
- __m256 lambda_diff_abs = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), _mm256_sub_ps(lambda_new, lambda));
499
- __m256 newly_converged = _mm256_cmp_ps(lambda_diff_abs, convergence_threshold, _CMP_LT_OS);
500
- converged_mask = _mm256_or_ps(converged_mask, newly_converged);
533
+ __m256 lambda_diff_abs_f32x8 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f),
534
+ _mm256_sub_ps(lambda_new_f32x8, lambda_f32x8));
535
+ __m256 newly_converged_f32x8 = _mm256_cmp_ps(lambda_diff_abs_f32x8, convergence_threshold_f32x8, _CMP_LT_OS);
536
+ converged_mask_f32x8 = _mm256_or_ps(converged_mask_f32x8, newly_converged_f32x8);
501
537
 
502
- // Only update lambda for non-converged lanes
503
- lambda = _mm256_blendv_ps(lambda_new, lambda, converged_mask);
538
+ // Only update lambda_f32x8 for non-converged lanes
539
+ lambda_f32x8 = _mm256_blendv_ps(lambda_new_f32x8, lambda_f32x8, converged_mask_f32x8);
504
540
  }
505
541
 
506
542
  // Final distance calculation
507
543
  // u² = cos²α * (a² - b²) / b²
508
- __m256 a_sq = _mm256_mul_ps(equatorial_radius, equatorial_radius);
509
- __m256 b_sq = _mm256_mul_ps(polar_radius, polar_radius);
510
- __m256 u_squared = _mm256_div_ps(_mm256_mul_ps(cos_squared_azimuth, _mm256_sub_ps(a_sq, b_sq)), b_sq);
544
+ __m256 a_sq_f32x8 = _mm256_mul_ps(equatorial_radius_f32x8, equatorial_radius_f32x8);
545
+ __m256 b_sq_f32x8 = _mm256_mul_ps(polar_radius_f32x8, polar_radius_f32x8);
546
+ __m256 u_squared_f32x8 = _mm256_div_ps(
547
+ _mm256_mul_ps(cos_squared_azimuth_f32x8, _mm256_sub_ps(a_sq_f32x8, b_sq_f32x8)), b_sq_f32x8);
511
548
 
512
549
  // A = 1 + u²/16384 * (4096 + u²*(-768 + u²*(320 - 175*u²)))
513
- __m256 series_a = _mm256_fmadd_ps(u_squared, _mm256_set1_ps(-175.0f), _mm256_set1_ps(320.0f));
514
- series_a = _mm256_fmadd_ps(u_squared, series_a, _mm256_set1_ps(-768.0f));
515
- series_a = _mm256_fmadd_ps(u_squared, series_a, _mm256_set1_ps(4096.0f));
516
- series_a = _mm256_fmadd_ps(_mm256_div_ps(u_squared, _mm256_set1_ps(16384.0f)), series_a, one);
550
+ __m256 series_a_f32x8 = _mm256_fmadd_ps(u_squared_f32x8, _mm256_set1_ps(-175.0f), _mm256_set1_ps(320.0f));
551
+ series_a_f32x8 = _mm256_fmadd_ps(u_squared_f32x8, series_a_f32x8, _mm256_set1_ps(-768.0f));
552
+ series_a_f32x8 = _mm256_fmadd_ps(u_squared_f32x8, series_a_f32x8, _mm256_set1_ps(4096.0f));
553
+ series_a_f32x8 = _mm256_fmadd_ps(_mm256_div_ps(u_squared_f32x8, _mm256_set1_ps(16384.0f)), series_a_f32x8,
554
+ one_f32x8);
517
555
 
518
556
  // B = u²/1024 * (256 + u²*(-128 + u²*(74 - 47*u²)))
519
- __m256 series_b = _mm256_fmadd_ps(u_squared, _mm256_set1_ps(-47.0f), _mm256_set1_ps(74.0f));
520
- series_b = _mm256_fmadd_ps(u_squared, series_b, _mm256_set1_ps(-128.0f));
521
- series_b = _mm256_fmadd_ps(u_squared, series_b, _mm256_set1_ps(256.0f));
522
- series_b = _mm256_mul_ps(_mm256_div_ps(u_squared, _mm256_set1_ps(1024.0f)), series_b);
557
+ __m256 series_b_f32x8 = _mm256_fmadd_ps(u_squared_f32x8, _mm256_set1_ps(-47.0f), _mm256_set1_ps(74.0f));
558
+ series_b_f32x8 = _mm256_fmadd_ps(u_squared_f32x8, series_b_f32x8, _mm256_set1_ps(-128.0f));
559
+ series_b_f32x8 = _mm256_fmadd_ps(u_squared_f32x8, series_b_f32x8, _mm256_set1_ps(256.0f));
560
+ series_b_f32x8 = _mm256_mul_ps(_mm256_div_ps(u_squared_f32x8, _mm256_set1_ps(1024.0f)), series_b_f32x8);
523
561
 
524
562
  // Δσ = B × sin(σ) × (cos(2σₘ) +
525
563
  // B/4 × (cos(σ) × (-1 + 2 × cos²(2σₘ)) - B/6 × cos(2σₘ) × (-3 + 4 × sin²(σ)) × (-3 + 4 × cos²(2σₘ))))
526
- __m256 cos_2sm_sq = _mm256_mul_ps(cos_double_angular_midpoint, cos_double_angular_midpoint);
527
- __m256 sin_sq = _mm256_mul_ps(sin_angular_distance, sin_angular_distance);
528
- __m256 term1 = _mm256_fmadd_ps(two, cos_2sm_sq, _mm256_set1_ps(-1.0f));
529
- term1 = _mm256_mul_ps(cos_angular_distance, term1);
530
- __m256 term2 = _mm256_fmadd_ps(four, sin_sq, _mm256_set1_ps(-3.0f));
531
- __m256 term3 = _mm256_fmadd_ps(four, cos_2sm_sq, _mm256_set1_ps(-3.0f));
532
- term2 = _mm256_mul_ps(_mm256_mul_ps(_mm256_div_ps(series_b, six), cos_double_angular_midpoint),
533
- _mm256_mul_ps(term2, term3));
534
- __m256 delta_sigma = _mm256_mul_ps(
535
- series_b, _mm256_mul_ps(sin_angular_distance, _mm256_add_ps(cos_double_angular_midpoint,
536
- _mm256_mul_ps(_mm256_div_ps(series_b, four),
537
- _mm256_sub_ps(term1, term2)))));
564
+ __m256 cos_2sm_sq_f32x8 = _mm256_mul_ps(cos_double_angular_midpoint_f32x8, cos_double_angular_midpoint_f32x8);
565
+ __m256 sin_sq_f32x8 = _mm256_mul_ps(sin_angular_distance_f32x8, sin_angular_distance_f32x8);
566
+ __m256 term1_f32x8 = _mm256_fmadd_ps(two_f32x8, cos_2sm_sq_f32x8, _mm256_set1_ps(-1.0f));
567
+ term1_f32x8 = _mm256_mul_ps(cos_angular_distance_f32x8, term1_f32x8);
568
+ __m256 term2_f32x8 = _mm256_fmadd_ps(four_f32x8, sin_sq_f32x8, _mm256_set1_ps(-3.0f));
569
+ __m256 term3_f32x8 = _mm256_fmadd_ps(four_f32x8, cos_2sm_sq_f32x8, _mm256_set1_ps(-3.0f));
570
+ term2_f32x8 = _mm256_mul_ps(
571
+ _mm256_mul_ps(_mm256_div_ps(series_b_f32x8, six_f32x8), cos_double_angular_midpoint_f32x8),
572
+ _mm256_mul_ps(term2_f32x8, term3_f32x8));
573
+ __m256 delta_sigma_f32x8 = _mm256_mul_ps(
574
+ series_b_f32x8, _mm256_mul_ps(sin_angular_distance_f32x8,
575
+ _mm256_add_ps(cos_double_angular_midpoint_f32x8,
576
+ _mm256_mul_ps(_mm256_div_ps(series_b_f32x8, four_f32x8),
577
+ _mm256_sub_ps(term1_f32x8, term2_f32x8)))));
538
578
 
539
579
  // s = b * A * (σ - Δσ)
540
- __m256 distances = _mm256_mul_ps(_mm256_mul_ps(polar_radius, series_a),
541
- _mm256_sub_ps(angular_distance, delta_sigma));
580
+ __m256 distances_f32x8 = _mm256_mul_ps(_mm256_mul_ps(polar_radius_f32x8, series_a_f32x8),
581
+ _mm256_sub_ps(angular_distance_f32x8, delta_sigma_f32x8));
542
582
 
543
583
  // Set coincident points to zero
544
- distances = _mm256_blendv_ps(distances, _mm256_setzero_ps(), coincident_mask);
584
+ distances_f32x8 = _mm256_blendv_ps(distances_f32x8, _mm256_setzero_ps(), coincident_mask_f32x8);
545
585
 
546
- return distances;
586
+ return distances_f32x8;
547
587
  }
548
588
 
549
589
  NK_PUBLIC void nk_vincenty_f32_haswell( //
@@ -552,14 +592,14 @@ NK_PUBLIC void nk_vincenty_f32_haswell( //
552
592
  nk_size_t n, nk_f32_t *results) {
553
593
 
554
594
  while (n >= 8) {
555
- __m256 first_latitudes = _mm256_loadu_ps(a_lats);
556
- __m256 first_longitudes = _mm256_loadu_ps(a_lons);
557
- __m256 second_latitudes = _mm256_loadu_ps(b_lats);
558
- __m256 second_longitudes = _mm256_loadu_ps(b_lons);
595
+ __m256 first_latitudes_f32x8 = _mm256_loadu_ps(a_lats);
596
+ __m256 first_longitudes_f32x8 = _mm256_loadu_ps(a_lons);
597
+ __m256 second_latitudes_f32x8 = _mm256_loadu_ps(b_lats);
598
+ __m256 second_longitudes_f32x8 = _mm256_loadu_ps(b_lons);
559
599
 
560
- __m256 distances = nk_vincenty_f32x8_haswell_(first_latitudes, first_longitudes, second_latitudes,
561
- second_longitudes);
562
- _mm256_storeu_ps(results, distances);
600
+ __m256 distances_f32x8 = nk_vincenty_f32x8_haswell_(first_latitudes_f32x8, first_longitudes_f32x8,
601
+ second_latitudes_f32x8, second_longitudes_f32x8);
602
+ _mm256_storeu_ps(results, distances_f32x8);
563
603
 
564
604
  a_lats += 8, a_lons += 8, b_lats += 8, b_lons += 8, results += 8, n -= 8;
565
605
  }
@@ -571,9 +611,9 @@ NK_PUBLIC void nk_vincenty_f32_haswell( //
571
611
  nk_partial_load_b32x8_serial_(a_lons, &a_lon_vec, n);
572
612
  nk_partial_load_b32x8_serial_(b_lats, &b_lat_vec, n);
573
613
  nk_partial_load_b32x8_serial_(b_lons, &b_lon_vec, n);
574
- __m256 distances = nk_vincenty_f32x8_haswell_(a_lat_vec.ymm_ps, a_lon_vec.ymm_ps, b_lat_vec.ymm_ps,
575
- b_lon_vec.ymm_ps);
576
- result_vec.ymm_ps = distances;
614
+ __m256 distances_f32x8 = nk_vincenty_f32x8_haswell_(a_lat_vec.ymm_ps, a_lon_vec.ymm_ps, b_lat_vec.ymm_ps,
615
+ b_lon_vec.ymm_ps);
616
+ result_vec.ymm_ps = distances_f32x8;
577
617
  nk_partial_store_b32x8_serial_(&result_vec, results, n);
578
618
  }
579
619
  }