numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315)
  1. package/README.md +197 -124
  2. package/binding.gyp +34 -484
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -8,13 +8,14 @@
8
8
  *
9
9
  * @section geospatial_skylake_instructions Key AVX-512 Geospatial Instructions
10
10
  *
11
- * Intrinsic Instruction Ice Genoa
12
- * _mm512_sqrt_ps VSQRTPS (ZMM, ZMM) 19c @ p05 15c @ p01
13
- * _mm512_sqrt_pd VSQRTPD (ZMM, ZMM) 23c @ p05 21c @ p01
14
- * _mm256_div_ps VDIVPS (YMM, YMM, YMM) 11c @ p0 11c @ p01
15
- * _mm256_div_pd VDIVPD (YMM, YMM, YMM) 13c @ p0 13c @ p01
16
- * _mm256_fmadd_ps VFMADD231PS (YMM, YMM, YMM) 4c @ p01 4c @ p01
17
- * _mm256_fmadd_pd VFMADD231PD (YMM, YMM, YMM) 4c @ p01 4c @ p01
11
+ * Intrinsic Instruction Icelake Genoa
12
+ * _mm512_sqrt_ps VSQRTPS (ZMM, ZMM) 19cy @ p0+p0+p05 15cy @ p01
13
+ * _mm512_sqrt_pd VSQRTPD (ZMM, ZMM) 23cy @ p0+p0+p05 21cy @ p01
14
+ * _mm256_div_ps VDIVPS (YMM, YMM, YMM) 11cy @ p0 11cy @ p01
15
+ * _mm256_div_pd VDIVPD (YMM, YMM, YMM) 13cy @ p0 13cy @ p01
16
+ * _mm256_fmadd_ps VFMADD231PS (YMM, YMM, YMM) 4cy @ p01 4cy @ p01
17
+ * _mm256_fmadd_pd VFMADD231PD (YMM, YMM, YMM) 4cy @ p01 4cy @ p01
18
+ * _mm512_cmp_ps_mask VCMPPS (K, ZMM, ZMM, I8) 4cy @ p5 5cy @ p01
18
19
  */
19
20
  #ifndef NK_GEOSPATIAL_SKYLAKE_H
20
21
  #define NK_GEOSPATIAL_SKYLAKE_H
@@ -37,44 +38,48 @@ extern "C" {
37
38
  #pragma GCC target("avx2", "avx512f", "avx512vl", "avx512bw", "avx512dq", "f16c", "fma", "bmi", "bmi2")
38
39
  #endif
39
40
 
40
- NK_INTERNAL __m512d nk_haversine_f64x8_skylake_( //
41
- __m512d first_latitudes, __m512d first_longitudes, //
42
- __m512d second_latitudes, __m512d second_longitudes) {
41
+ NK_INTERNAL __m512d nk_haversine_f64x8_skylake_( //
42
+ __m512d first_latitudes_f64x8, __m512d first_longitudes_f64x8, //
43
+ __m512d second_latitudes_f64x8, __m512d second_longitudes_f64x8) {
43
44
 
44
- __m512d const earth_radius = _mm512_set1_pd(NK_EARTH_MEDIATORIAL_RADIUS);
45
- __m512d const half = _mm512_set1_pd(0.5);
46
- __m512d const one = _mm512_set1_pd(1.0);
47
- __m512d const two = _mm512_set1_pd(2.0);
45
+ __m512d const earth_radius_f64x8 = _mm512_set1_pd(NK_EARTH_MEDIATORIAL_RADIUS);
46
+ __m512d const half_f64x8 = _mm512_set1_pd(0.5);
47
+ __m512d const one_f64x8 = _mm512_set1_pd(1.0);
48
+ __m512d const two_f64x8 = _mm512_set1_pd(2.0);
48
49
 
49
- __m512d latitude_delta = _mm512_sub_pd(second_latitudes, first_latitudes);
50
- __m512d longitude_delta = _mm512_sub_pd(second_longitudes, first_longitudes);
50
+ __m512d latitude_delta_f64x8 = _mm512_sub_pd(second_latitudes_f64x8, first_latitudes_f64x8);
51
+ __m512d longitude_delta_f64x8 = _mm512_sub_pd(second_longitudes_f64x8, first_longitudes_f64x8);
51
52
 
52
53
  // Haversine terms: sin²(Δ/2)
53
- __m512d latitude_delta_half = _mm512_mul_pd(latitude_delta, half);
54
- __m512d longitude_delta_half = _mm512_mul_pd(longitude_delta, half);
55
- __m512d sin_latitude_delta_half = nk_sin_f64x8_skylake_(latitude_delta_half);
56
- __m512d sin_longitude_delta_half = nk_sin_f64x8_skylake_(longitude_delta_half);
57
- __m512d sin_squared_latitude_delta_half = _mm512_mul_pd(sin_latitude_delta_half, sin_latitude_delta_half);
58
- __m512d sin_squared_longitude_delta_half = _mm512_mul_pd(sin_longitude_delta_half, sin_longitude_delta_half);
54
+ __m512d latitude_delta_half_f64x8 = _mm512_mul_pd(latitude_delta_f64x8, half_f64x8);
55
+ __m512d longitude_delta_half_f64x8 = _mm512_mul_pd(longitude_delta_f64x8, half_f64x8);
56
+ __m512d sin_latitude_delta_half_f64x8 = nk_sin_f64x8_skylake_(latitude_delta_half_f64x8);
57
+ __m512d sin_longitude_delta_half_f64x8 = nk_sin_f64x8_skylake_(longitude_delta_half_f64x8);
58
+ __m512d sin_squared_latitude_delta_half_f64x8 = _mm512_mul_pd(sin_latitude_delta_half_f64x8,
59
+ sin_latitude_delta_half_f64x8);
60
+ __m512d sin_squared_longitude_delta_half_f64x8 = _mm512_mul_pd(sin_longitude_delta_half_f64x8,
61
+ sin_longitude_delta_half_f64x8);
59
62
 
60
63
  // Latitude cosine product
61
- __m512d cos_first_latitude = nk_cos_f64x8_skylake_(first_latitudes);
62
- __m512d cos_second_latitude = nk_cos_f64x8_skylake_(second_latitudes);
63
- __m512d cos_latitude_product = _mm512_mul_pd(cos_first_latitude, cos_second_latitude);
64
+ __m512d cos_first_latitude_f64x8 = nk_cos_f64x8_skylake_(first_latitudes_f64x8);
65
+ __m512d cos_second_latitude_f64x8 = nk_cos_f64x8_skylake_(second_latitudes_f64x8);
66
+ __m512d cos_latitude_product_f64x8 = _mm512_mul_pd(cos_first_latitude_f64x8, cos_second_latitude_f64x8);
64
67
 
65
68
  // a = sin²(Δlat/2) + cos(lat1) × cos(lat2) × sin²(Δlon/2)
66
- __m512d haversine_term = _mm512_add_pd(sin_squared_latitude_delta_half,
67
- _mm512_mul_pd(cos_latitude_product, sin_squared_longitude_delta_half));
68
- // Clamp haversine_term to [0, 1] to prevent NaN from sqrt of negative values
69
- __m512d zero = _mm512_setzero_pd();
70
- haversine_term = _mm512_max_pd(zero, _mm512_min_pd(one, haversine_term));
69
+ __m512d haversine_term_f64x8 = _mm512_add_pd(
70
+ sin_squared_latitude_delta_half_f64x8,
71
+ _mm512_mul_pd(cos_latitude_product_f64x8, sin_squared_longitude_delta_half_f64x8));
72
+ // Clamp haversine_term_f64x8 to [0, 1] to prevent NaN from sqrt of negative values
73
+ __m512d zero_f64x8 = _mm512_setzero_pd();
74
+ haversine_term_f64x8 = _mm512_max_pd(zero_f64x8, _mm512_min_pd(one_f64x8, haversine_term_f64x8));
71
75
 
72
76
  // Central angle: c = 2 × atan2(√a, √(1-a))
73
- __m512d sqrt_haversine = _mm512_sqrt_pd(haversine_term);
74
- __m512d sqrt_complement = _mm512_sqrt_pd(_mm512_sub_pd(one, haversine_term));
75
- __m512d central_angle = _mm512_mul_pd(two, nk_atan2_f64x8_skylake_(sqrt_haversine, sqrt_complement));
77
+ __m512d sqrt_haversine_f64x8 = _mm512_sqrt_pd(haversine_term_f64x8);
78
+ __m512d sqrt_complement_f64x8 = _mm512_sqrt_pd(_mm512_sub_pd(one_f64x8, haversine_term_f64x8));
79
+ __m512d central_angle_f64x8 = _mm512_mul_pd(two_f64x8,
80
+ nk_atan2_f64x8_skylake_(sqrt_haversine_f64x8, sqrt_complement_f64x8));
76
81
 
77
- return _mm512_mul_pd(earth_radius, central_angle);
82
+ return _mm512_mul_pd(earth_radius_f64x8, central_angle_f64x8);
78
83
  }
79
84
 
80
85
  NK_PUBLIC void nk_haversine_f64_skylake( //
@@ -83,14 +88,14 @@ NK_PUBLIC void nk_haversine_f64_skylake( //
83
88
  nk_size_t n, nk_f64_t *results) {
84
89
 
85
90
  while (n >= 8) {
86
- __m512d first_latitudes = _mm512_loadu_pd(a_lats);
87
- __m512d first_longitudes = _mm512_loadu_pd(a_lons);
88
- __m512d second_latitudes = _mm512_loadu_pd(b_lats);
89
- __m512d second_longitudes = _mm512_loadu_pd(b_lons);
91
+ __m512d first_latitudes_f64x8 = _mm512_loadu_pd(a_lats);
92
+ __m512d first_longitudes_f64x8 = _mm512_loadu_pd(a_lons);
93
+ __m512d second_latitudes_f64x8 = _mm512_loadu_pd(b_lats);
94
+ __m512d second_longitudes_f64x8 = _mm512_loadu_pd(b_lons);
90
95
 
91
- __m512d distances = nk_haversine_f64x8_skylake_(first_latitudes, first_longitudes, second_latitudes,
92
- second_longitudes);
93
- _mm512_storeu_pd(results, distances);
96
+ __m512d distances_f64x8 = nk_haversine_f64x8_skylake_(first_latitudes_f64x8, first_longitudes_f64x8,
97
+ second_latitudes_f64x8, second_longitudes_f64x8);
98
+ _mm512_storeu_pd(results, distances_f64x8);
94
99
 
95
100
  a_lats += 8, a_lons += 8, b_lats += 8, b_lons += 8, results += 8, n -= 8;
96
101
  }
@@ -98,14 +103,14 @@ NK_PUBLIC void nk_haversine_f64_skylake( //
98
103
  // Handle remaining elements with masked operations
99
104
  if (n > 0) {
100
105
  __mmask8 mask = (__mmask8)_bzhi_u32(0xFF, n);
101
- __m512d first_latitudes = _mm512_maskz_loadu_pd(mask, a_lats);
102
- __m512d first_longitudes = _mm512_maskz_loadu_pd(mask, a_lons);
103
- __m512d second_latitudes = _mm512_maskz_loadu_pd(mask, b_lats);
104
- __m512d second_longitudes = _mm512_maskz_loadu_pd(mask, b_lons);
105
-
106
- __m512d distances = nk_haversine_f64x8_skylake_(first_latitudes, first_longitudes, second_latitudes,
107
- second_longitudes);
108
- _mm512_mask_storeu_pd(results, mask, distances);
106
+ __m512d first_latitudes_f64x8 = _mm512_maskz_loadu_pd(mask, a_lats);
107
+ __m512d first_longitudes_f64x8 = _mm512_maskz_loadu_pd(mask, a_lons);
108
+ __m512d second_latitudes_f64x8 = _mm512_maskz_loadu_pd(mask, b_lats);
109
+ __m512d second_longitudes_f64x8 = _mm512_maskz_loadu_pd(mask, b_lons);
110
+
111
+ __m512d distances_f64x8 = nk_haversine_f64x8_skylake_(first_latitudes_f64x8, first_longitudes_f64x8,
112
+ second_latitudes_f64x8, second_longitudes_f64x8);
113
+ _mm512_mask_storeu_pd(results, mask, distances_f64x8);
109
114
  }
110
115
  }
111
116
 
@@ -113,158 +118,171 @@ NK_PUBLIC void nk_haversine_f64_skylake( //
113
118
  * @brief AVX-512 helper for Vincenty's geodesic distance on 8 f64 point pairs.
114
119
  * @note This is a true SIMD implementation using masked convergence tracking.
115
120
  */
116
- NK_INTERNAL __m512d nk_vincenty_f64x8_skylake_( //
117
- __m512d first_latitudes, __m512d first_longitudes, //
118
- __m512d second_latitudes, __m512d second_longitudes) {
119
-
120
- __m512d const equatorial_radius = _mm512_set1_pd(NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
121
- __m512d const polar_radius = _mm512_set1_pd(NK_EARTH_ELLIPSOID_POLAR_RADIUS);
122
- __m512d const flattening = _mm512_set1_pd(1.0 / NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
123
- __m512d const convergence_threshold = _mm512_set1_pd(NK_VINCENTY_CONVERGENCE_THRESHOLD_F64);
124
- __m512d const one = _mm512_set1_pd(1.0);
125
- __m512d const two = _mm512_set1_pd(2.0);
126
- __m512d const three = _mm512_set1_pd(3.0);
127
- __m512d const four = _mm512_set1_pd(4.0);
128
- __m512d const six = _mm512_set1_pd(6.0);
129
- __m512d const sixteen = _mm512_set1_pd(16.0);
121
+ NK_INTERNAL __m512d nk_vincenty_f64x8_skylake_( //
122
+ __m512d first_latitudes_f64x8, __m512d first_longitudes_f64x8, //
123
+ __m512d second_latitudes_f64x8, __m512d second_longitudes_f64x8) {
124
+
125
+ __m512d const equatorial_radius_f64x8 = _mm512_set1_pd(NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
126
+ __m512d const polar_radius_f64x8 = _mm512_set1_pd(NK_EARTH_ELLIPSOID_POLAR_RADIUS);
127
+ __m512d const flattening_f64x8 = _mm512_set1_pd(1.0 / NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
128
+ __m512d const convergence_threshold_f64x8 = _mm512_set1_pd(NK_VINCENTY_CONVERGENCE_THRESHOLD_F64);
129
+ __m512d const one_f64x8 = _mm512_set1_pd(1.0);
130
+ __m512d const two_f64x8 = _mm512_set1_pd(2.0);
131
+ __m512d const three_f64x8 = _mm512_set1_pd(3.0);
132
+ __m512d const four_f64x8 = _mm512_set1_pd(4.0);
133
+ __m512d const six_f64x8 = _mm512_set1_pd(6.0);
134
+ __m512d const sixteen_f64x8 = _mm512_set1_pd(16.0);
130
135
 
131
136
  // Longitude difference
132
- __m512d longitude_difference = _mm512_sub_pd(second_longitudes, first_longitudes);
137
+ __m512d longitude_difference_f64x8 = _mm512_sub_pd(second_longitudes_f64x8, first_longitudes_f64x8);
133
138
 
134
139
  // Reduced latitudes: tan(U) = (1-f) * tan(lat)
135
- __m512d one_minus_f = _mm512_sub_pd(one, flattening);
136
- __m512d tan_first = _mm512_div_pd(nk_sin_f64x8_skylake_(first_latitudes), nk_cos_f64x8_skylake_(first_latitudes));
137
- __m512d tan_second = _mm512_div_pd(nk_sin_f64x8_skylake_(second_latitudes),
138
- nk_cos_f64x8_skylake_(second_latitudes));
139
- __m512d tan_reduced_first = _mm512_mul_pd(one_minus_f, tan_first);
140
- __m512d tan_reduced_second = _mm512_mul_pd(one_minus_f, tan_second);
140
+ __m512d one_minus_f_f64x8 = _mm512_sub_pd(one_f64x8, flattening_f64x8);
141
+ __m512d tan_first_f64x8 = _mm512_div_pd(nk_sin_f64x8_skylake_(first_latitudes_f64x8),
142
+ nk_cos_f64x8_skylake_(first_latitudes_f64x8));
143
+ __m512d tan_second_f64x8 = _mm512_div_pd(nk_sin_f64x8_skylake_(second_latitudes_f64x8),
144
+ nk_cos_f64x8_skylake_(second_latitudes_f64x8));
145
+ __m512d tan_reduced_first_f64x8 = _mm512_mul_pd(one_minus_f_f64x8, tan_first_f64x8);
146
+ __m512d tan_reduced_second_f64x8 = _mm512_mul_pd(one_minus_f_f64x8, tan_second_f64x8);
141
147
 
142
148
  // cos(U) = 1/√(1 + tan²(U)), sin(U) = tan(U) × cos(U)
143
- __m512d cos_reduced_first = _mm512_div_pd(
144
- one, _mm512_sqrt_pd(_mm512_fmadd_pd(tan_reduced_first, tan_reduced_first, one)));
145
- __m512d sin_reduced_first = _mm512_mul_pd(tan_reduced_first, cos_reduced_first);
146
- __m512d cos_reduced_second = _mm512_div_pd(
147
- one, _mm512_sqrt_pd(_mm512_fmadd_pd(tan_reduced_second, tan_reduced_second, one)));
148
- __m512d sin_reduced_second = _mm512_mul_pd(tan_reduced_second, cos_reduced_second);
149
-
150
- // Initialize lambda and tracking variables
151
- __m512d lambda = longitude_difference;
152
- __m512d sin_angular_distance, cos_angular_distance, angular_distance;
153
- __m512d sin_azimuth, cos_squared_azimuth, cos_double_angular_midpoint;
149
+ __m512d cos_reduced_first_f64x8 = _mm512_div_pd(
150
+ one_f64x8, _mm512_sqrt_pd(_mm512_fmadd_pd(tan_reduced_first_f64x8, tan_reduced_first_f64x8, one_f64x8)));
151
+ __m512d sin_reduced_first_f64x8 = _mm512_mul_pd(tan_reduced_first_f64x8, cos_reduced_first_f64x8);
152
+ __m512d cos_reduced_second_f64x8 = _mm512_div_pd(
153
+ one_f64x8, _mm512_sqrt_pd(_mm512_fmadd_pd(tan_reduced_second_f64x8, tan_reduced_second_f64x8, one_f64x8)));
154
+ __m512d sin_reduced_second_f64x8 = _mm512_mul_pd(tan_reduced_second_f64x8, cos_reduced_second_f64x8);
155
+
156
+ // Initialize lambda_f64x8 and tracking variables
157
+ __m512d lambda_f64x8 = longitude_difference_f64x8;
158
+ __m512d sin_angular_distance_f64x8, cos_angular_distance_f64x8, angular_distance_f64x8;
159
+ __m512d sin_azimuth_f64x8, cos_squared_azimuth_f64x8, cos_double_angular_midpoint_f64x8;
154
160
 
155
161
  // Track convergence and coincident points
156
162
  __mmask8 converged_mask = 0;
157
163
  __mmask8 coincident_mask = 0;
158
164
 
159
165
  for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS && converged_mask != 0xFF; ++iteration) {
160
- __m512d sin_lambda = nk_sin_f64x8_skylake_(lambda);
161
- __m512d cos_lambda = nk_cos_f64x8_skylake_(lambda);
162
-
163
- // sin²(angular_distance) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
164
- __m512d cross_term = _mm512_mul_pd(cos_reduced_second, sin_lambda);
165
- __m512d mixed_term = _mm512_sub_pd(
166
- _mm512_mul_pd(cos_reduced_first, sin_reduced_second),
167
- _mm512_mul_pd(_mm512_mul_pd(sin_reduced_first, cos_reduced_second), cos_lambda));
168
- __m512d sin_angular_dist_sq = _mm512_fmadd_pd(cross_term, cross_term, _mm512_mul_pd(mixed_term, mixed_term));
169
- sin_angular_distance = _mm512_sqrt_pd(sin_angular_dist_sq);
170
-
171
- // Check for coincident points (sin_angular_distance ≈ 0)
172
- coincident_mask = _mm512_cmp_pd_mask(sin_angular_distance, _mm512_set1_pd(1e-15), _CMP_LT_OS);
173
-
174
- // cos(angular_distance) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
175
- cos_angular_distance = _mm512_fmadd_pd(_mm512_mul_pd(cos_reduced_first, cos_reduced_second), cos_lambda,
176
- _mm512_mul_pd(sin_reduced_first, sin_reduced_second));
177
-
178
- // angular_distance = atan2(sin, cos)
179
- angular_distance = nk_atan2_f64x8_skylake_(sin_angular_distance, cos_angular_distance);
180
-
181
- // sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance)
166
+ __m512d sin_lambda_f64x8 = nk_sin_f64x8_skylake_(lambda_f64x8);
167
+ __m512d cos_lambda_f64x8 = nk_cos_f64x8_skylake_(lambda_f64x8);
168
+
169
+ // sin²(angular_distance_f64x8) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
170
+ __m512d cross_term_f64x8 = _mm512_mul_pd(cos_reduced_second_f64x8, sin_lambda_f64x8);
171
+ __m512d mixed_term_f64x8 = _mm512_sub_pd(
172
+ _mm512_mul_pd(cos_reduced_first_f64x8, sin_reduced_second_f64x8),
173
+ _mm512_mul_pd(_mm512_mul_pd(sin_reduced_first_f64x8, cos_reduced_second_f64x8), cos_lambda_f64x8));
174
+ __m512d sin_angular_dist_sq_f64x8 = _mm512_fmadd_pd(cross_term_f64x8, cross_term_f64x8,
175
+ _mm512_mul_pd(mixed_term_f64x8, mixed_term_f64x8));
176
+ sin_angular_distance_f64x8 = _mm512_sqrt_pd(sin_angular_dist_sq_f64x8);
177
+
178
+ // Check for coincident points (sin_angular_distance_f64x8 ≈ 0)
179
+ coincident_mask = _mm512_cmp_pd_mask(sin_angular_distance_f64x8, _mm512_set1_pd(1e-15), _CMP_LT_OS);
180
+
181
+ // cos(angular_distance_f64x8) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
182
+ cos_angular_distance_f64x8 = _mm512_fmadd_pd(_mm512_mul_pd(cos_reduced_first_f64x8, cos_reduced_second_f64x8),
183
+ cos_lambda_f64x8,
184
+ _mm512_mul_pd(sin_reduced_first_f64x8, sin_reduced_second_f64x8));
185
+
186
+ // angular_distance_f64x8 = atan2(sin, cos)
187
+ angular_distance_f64x8 = nk_atan2_f64x8_skylake_(sin_angular_distance_f64x8, cos_angular_distance_f64x8);
188
+
189
+ // sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance_f64x8)
182
190
  // Use masked divide: zero result for coincident lanes, avoids division by zero
183
- sin_azimuth = _mm512_maskz_div_pd(
191
+ sin_azimuth_f64x8 = _mm512_maskz_div_pd(
184
192
  _knot_mask8(coincident_mask),
185
- _mm512_mul_pd(_mm512_mul_pd(cos_reduced_first, cos_reduced_second), sin_lambda), sin_angular_distance);
186
- cos_squared_azimuth = _mm512_sub_pd(one, _mm512_mul_pd(sin_azimuth, sin_azimuth));
193
+ _mm512_mul_pd(_mm512_mul_pd(cos_reduced_first_f64x8, cos_reduced_second_f64x8), sin_lambda_f64x8),
194
+ sin_angular_distance_f64x8);
195
+ cos_squared_azimuth_f64x8 = _mm512_sub_pd(one_f64x8, _mm512_mul_pd(sin_azimuth_f64x8, sin_azimuth_f64x8));
187
196
 
188
197
  // Handle equatorial case: cos²α = 0
189
- __mmask8 equatorial_mask = _mm512_cmp_pd_mask(cos_squared_azimuth, _mm512_set1_pd(1e-15), _CMP_LT_OS);
198
+ __mmask8 equatorial_mask = _mm512_cmp_pd_mask(cos_squared_azimuth_f64x8, _mm512_set1_pd(1e-15), _CMP_LT_OS);
190
199
 
191
200
  // cos(2σₘ) = cos(σ) - 2 × sin(U₁) × sin(U₂) / cos²(α)
192
- // Use masked divide: for equatorial lanes, quotient = cos_angular_distance (passthrough),
201
+ // Use masked divide: for equatorial lanes, quotient_f64x8 = cos_angular_distance_f64x8 (passthrough),
193
202
  // so subtraction yields zero. Avoids division by zero.
194
- __m512d sin_product = _mm512_mul_pd(sin_reduced_first, sin_reduced_second);
195
- __m512d quotient = _mm512_mask_div_pd(cos_angular_distance, _knot_mask8(equatorial_mask),
196
- _mm512_mul_pd(two, sin_product), cos_squared_azimuth);
197
- cos_double_angular_midpoint = _mm512_sub_pd(cos_angular_distance, quotient);
203
+ __m512d sin_product_f64x8 = _mm512_mul_pd(sin_reduced_first_f64x8, sin_reduced_second_f64x8);
204
+ __m512d quotient_f64x8 = _mm512_mask_div_pd(cos_angular_distance_f64x8, _knot_mask8(equatorial_mask),
205
+ _mm512_mul_pd(two_f64x8, sin_product_f64x8),
206
+ cos_squared_azimuth_f64x8);
207
+ cos_double_angular_midpoint_f64x8 = _mm512_sub_pd(cos_angular_distance_f64x8, quotient_f64x8);
198
208
 
199
209
  // C = f/16 * cos²α * (4 + f*(4 - 3*cos²α))
200
- __m512d correction_factor = _mm512_mul_pd(
201
- _mm512_div_pd(flattening, sixteen),
202
- _mm512_mul_pd(cos_squared_azimuth,
203
- _mm512_fmadd_pd(flattening, _mm512_fnmadd_pd(three, cos_squared_azimuth, four), four)));
210
+ __m512d correction_factor_f64x8 = _mm512_mul_pd(
211
+ _mm512_div_pd(flattening_f64x8, sixteen_f64x8),
212
+ _mm512_mul_pd(
213
+ cos_squared_azimuth_f64x8,
214
+ _mm512_fmadd_pd(flattening_f64x8, _mm512_fnmadd_pd(three_f64x8, cos_squared_azimuth_f64x8, four_f64x8),
215
+ four_f64x8)));
204
216
 
205
217
  // λ' = L + (1-C) × f × sin(α) × (σ + C × sin(σ) × (cos(2σₘ) + C × cos(σ) × (-1 + 2 × cos²(2σₘ))))
206
- __m512d cos_2sm_sq = _mm512_mul_pd(cos_double_angular_midpoint, cos_double_angular_midpoint);
207
- // innermost = -1 + 2 × cos²(2σₘ)
208
- __m512d innermost = _mm512_fmadd_pd(two, cos_2sm_sq, _mm512_set1_pd(-1.0));
209
- // middle = cos(2σₘ) + C × cos(σ) × innermost
210
- __m512d middle = _mm512_fmadd_pd(_mm512_mul_pd(correction_factor, cos_angular_distance), innermost,
211
- cos_double_angular_midpoint);
212
- // inner = C × sin(σ) × middle
213
- __m512d inner = _mm512_mul_pd(_mm512_mul_pd(correction_factor, sin_angular_distance), middle);
214
-
215
- // λ' = L + (1-C) * f * sin_α * (σ + inner)
216
- __m512d lambda_new = _mm512_fmadd_pd(
217
- _mm512_mul_pd(_mm512_mul_pd(_mm512_sub_pd(one, correction_factor), flattening), sin_azimuth),
218
- _mm512_add_pd(angular_distance, inner), longitude_difference);
218
+ __m512d cos_2sm_sq_f64x8 = _mm512_mul_pd(cos_double_angular_midpoint_f64x8, cos_double_angular_midpoint_f64x8);
219
+ // innermost_f64x8 = -1 + 2 × cos²(2σₘ)
220
+ __m512d innermost_f64x8 = _mm512_fmadd_pd(two_f64x8, cos_2sm_sq_f64x8, _mm512_set1_pd(-1.0));
221
+ // middle_f64x8 = cos(2σₘ) + C × cos(σ) × innermost_f64x8
222
+ __m512d middle_f64x8 = _mm512_fmadd_pd(_mm512_mul_pd(correction_factor_f64x8, cos_angular_distance_f64x8),
223
+ innermost_f64x8, cos_double_angular_midpoint_f64x8);
224
+ // inner_f64x8 = C × sin(σ) × middle_f64x8
225
+ __m512d inner_f64x8 = _mm512_mul_pd(_mm512_mul_pd(correction_factor_f64x8, sin_angular_distance_f64x8),
226
+ middle_f64x8);
227
+
228
+ // λ' = L + (1-C) * f * sin_α * (σ + inner_f64x8)
229
+ __m512d lambda_new_f64x8 = _mm512_fmadd_pd(
230
+ _mm512_mul_pd(_mm512_mul_pd(_mm512_sub_pd(one_f64x8, correction_factor_f64x8), flattening_f64x8),
231
+ sin_azimuth_f64x8),
232
+ _mm512_add_pd(angular_distance_f64x8, inner_f64x8), longitude_difference_f64x8);
219
233
 
220
234
  // Check convergence: |λ - λ'| < threshold
221
- __m512d lambda_diff = _mm512_abs_pd(_mm512_sub_pd(lambda_new, lambda));
222
- converged_mask = _mm512_cmp_pd_mask(lambda_diff, convergence_threshold, _CMP_LT_OS);
235
+ __m512d lambda_diff_f64x8 = _mm512_abs_pd(_mm512_sub_pd(lambda_new_f64x8, lambda_f64x8));
236
+ converged_mask = _mm512_cmp_pd_mask(lambda_diff_f64x8, convergence_threshold_f64x8, _CMP_LT_OS);
223
237
 
224
- lambda = lambda_new;
238
+ lambda_f64x8 = lambda_new_f64x8;
225
239
  }
226
240
 
227
241
  // Final distance calculation
228
242
  // u² = cos²α * (a² - b²) / b²
229
- __m512d a_sq = _mm512_mul_pd(equatorial_radius, equatorial_radius);
230
- __m512d b_sq = _mm512_mul_pd(polar_radius, polar_radius);
231
- __m512d u_squared = _mm512_div_pd(_mm512_mul_pd(cos_squared_azimuth, _mm512_sub_pd(a_sq, b_sq)), b_sq);
243
+ __m512d a_sq_f64x8 = _mm512_mul_pd(equatorial_radius_f64x8, equatorial_radius_f64x8);
244
+ __m512d b_sq_f64x8 = _mm512_mul_pd(polar_radius_f64x8, polar_radius_f64x8);
245
+ __m512d u_squared_f64x8 = _mm512_div_pd(
246
+ _mm512_mul_pd(cos_squared_azimuth_f64x8, _mm512_sub_pd(a_sq_f64x8, b_sq_f64x8)), b_sq_f64x8);
232
247
 
233
248
  // A = 1 + u²/16384 * (4096 + u²*(-768 + u²*(320 - 175*u²)))
234
- __m512d series_a = _mm512_fmadd_pd(u_squared, _mm512_set1_pd(-175.0), _mm512_set1_pd(320.0));
235
- series_a = _mm512_fmadd_pd(u_squared, series_a, _mm512_set1_pd(-768.0));
236
- series_a = _mm512_fmadd_pd(u_squared, series_a, _mm512_set1_pd(4096.0));
237
- series_a = _mm512_fmadd_pd(_mm512_div_pd(u_squared, _mm512_set1_pd(16384.0)), series_a, one);
249
+ __m512d series_a_f64x8 = _mm512_fmadd_pd(u_squared_f64x8, _mm512_set1_pd(-175.0), _mm512_set1_pd(320.0));
250
+ series_a_f64x8 = _mm512_fmadd_pd(u_squared_f64x8, series_a_f64x8, _mm512_set1_pd(-768.0));
251
+ series_a_f64x8 = _mm512_fmadd_pd(u_squared_f64x8, series_a_f64x8, _mm512_set1_pd(4096.0));
252
+ series_a_f64x8 = _mm512_fmadd_pd(_mm512_div_pd(u_squared_f64x8, _mm512_set1_pd(16384.0)), series_a_f64x8,
253
+ one_f64x8);
238
254
 
239
255
  // B = u²/1024 * (256 + u²*(-128 + u²*(74 - 47*u²)))
240
- __m512d series_b = _mm512_fmadd_pd(u_squared, _mm512_set1_pd(-47.0), _mm512_set1_pd(74.0));
241
- series_b = _mm512_fmadd_pd(u_squared, series_b, _mm512_set1_pd(-128.0));
242
- series_b = _mm512_fmadd_pd(u_squared, series_b, _mm512_set1_pd(256.0));
243
- series_b = _mm512_mul_pd(_mm512_div_pd(u_squared, _mm512_set1_pd(1024.0)), series_b);
256
+ __m512d series_b_f64x8 = _mm512_fmadd_pd(u_squared_f64x8, _mm512_set1_pd(-47.0), _mm512_set1_pd(74.0));
257
+ series_b_f64x8 = _mm512_fmadd_pd(u_squared_f64x8, series_b_f64x8, _mm512_set1_pd(-128.0));
258
+ series_b_f64x8 = _mm512_fmadd_pd(u_squared_f64x8, series_b_f64x8, _mm512_set1_pd(256.0));
259
+ series_b_f64x8 = _mm512_mul_pd(_mm512_div_pd(u_squared_f64x8, _mm512_set1_pd(1024.0)), series_b_f64x8);
244
260
 
245
261
  // Δσ = B × sin(σ) × (cos(2σₘ) +
246
262
  // B/4 × (cos(σ) × (-1 + 2 × cos²(2σₘ)) - B/6 × cos(2σₘ) × (-3 + 4 × sin²(σ)) × (-3 + 4 × cos²(2σₘ))))
247
- __m512d cos_2sm_sq = _mm512_mul_pd(cos_double_angular_midpoint, cos_double_angular_midpoint);
248
- __m512d sin_sq = _mm512_mul_pd(sin_angular_distance, sin_angular_distance);
249
- __m512d term1 = _mm512_fmadd_pd(two, cos_2sm_sq, _mm512_set1_pd(-1.0));
250
- term1 = _mm512_mul_pd(cos_angular_distance, term1);
251
- __m512d term2 = _mm512_fmadd_pd(four, sin_sq, _mm512_set1_pd(-3.0));
252
- __m512d term3 = _mm512_fmadd_pd(four, cos_2sm_sq, _mm512_set1_pd(-3.0));
253
- term2 = _mm512_mul_pd(_mm512_mul_pd(_mm512_div_pd(series_b, six), cos_double_angular_midpoint),
254
- _mm512_mul_pd(term2, term3));
255
- __m512d delta_sigma = _mm512_mul_pd(
256
- series_b, _mm512_mul_pd(sin_angular_distance, _mm512_add_pd(cos_double_angular_midpoint,
257
- _mm512_mul_pd(_mm512_div_pd(series_b, four),
258
- _mm512_sub_pd(term1, term2)))));
263
+ __m512d cos_2sm_sq_f64x8 = _mm512_mul_pd(cos_double_angular_midpoint_f64x8, cos_double_angular_midpoint_f64x8);
264
+ __m512d sin_sq_f64x8 = _mm512_mul_pd(sin_angular_distance_f64x8, sin_angular_distance_f64x8);
265
+ __m512d term1_f64x8 = _mm512_fmadd_pd(two_f64x8, cos_2sm_sq_f64x8, _mm512_set1_pd(-1.0));
266
+ term1_f64x8 = _mm512_mul_pd(cos_angular_distance_f64x8, term1_f64x8);
267
+ __m512d term2_f64x8 = _mm512_fmadd_pd(four_f64x8, sin_sq_f64x8, _mm512_set1_pd(-3.0));
268
+ __m512d term3_f64x8 = _mm512_fmadd_pd(four_f64x8, cos_2sm_sq_f64x8, _mm512_set1_pd(-3.0));
269
+ term2_f64x8 = _mm512_mul_pd(
270
+ _mm512_mul_pd(_mm512_div_pd(series_b_f64x8, six_f64x8), cos_double_angular_midpoint_f64x8),
271
+ _mm512_mul_pd(term2_f64x8, term3_f64x8));
272
+ __m512d delta_sigma_f64x8 = _mm512_mul_pd(
273
+ series_b_f64x8, _mm512_mul_pd(sin_angular_distance_f64x8,
274
+ _mm512_add_pd(cos_double_angular_midpoint_f64x8,
275
+ _mm512_mul_pd(_mm512_div_pd(series_b_f64x8, four_f64x8),
276
+ _mm512_sub_pd(term1_f64x8, term2_f64x8)))));
259
277
 
260
278
  // s = b * A * (σ - Δσ)
261
- __m512d distances = _mm512_mul_pd(_mm512_mul_pd(polar_radius, series_a),
262
- _mm512_sub_pd(angular_distance, delta_sigma));
279
+ __m512d distances_f64x8 = _mm512_mul_pd(_mm512_mul_pd(polar_radius_f64x8, series_a_f64x8),
280
+ _mm512_sub_pd(angular_distance_f64x8, delta_sigma_f64x8));
263
281
 
264
282
  // Set coincident points to zero
265
- distances = _mm512_mask_blend_pd(coincident_mask, distances, _mm512_setzero_pd());
283
+ distances_f64x8 = _mm512_mask_blend_pd(coincident_mask, distances_f64x8, _mm512_setzero_pd());
266
284
 
267
- return distances;
285
+ return distances_f64x8;
268
286
  }
269
287
 
270
288
  NK_PUBLIC void nk_vincenty_f64_skylake( //
@@ -273,14 +291,14 @@ NK_PUBLIC void nk_vincenty_f64_skylake( //
273
291
  nk_size_t n, nk_f64_t *results) {
274
292
 
275
293
  while (n >= 8) {
276
- __m512d first_latitudes = _mm512_loadu_pd(a_lats);
277
- __m512d first_longitudes = _mm512_loadu_pd(a_lons);
278
- __m512d second_latitudes = _mm512_loadu_pd(b_lats);
279
- __m512d second_longitudes = _mm512_loadu_pd(b_lons);
294
+ __m512d first_latitudes_f64x8 = _mm512_loadu_pd(a_lats);
295
+ __m512d first_longitudes_f64x8 = _mm512_loadu_pd(a_lons);
296
+ __m512d second_latitudes_f64x8 = _mm512_loadu_pd(b_lats);
297
+ __m512d second_longitudes_f64x8 = _mm512_loadu_pd(b_lons);
280
298
 
281
- __m512d distances = nk_vincenty_f64x8_skylake_(first_latitudes, first_longitudes, second_latitudes,
282
- second_longitudes);
283
- _mm512_storeu_pd(results, distances);
299
+ __m512d distances_f64x8 = nk_vincenty_f64x8_skylake_(first_latitudes_f64x8, first_longitudes_f64x8,
300
+ second_latitudes_f64x8, second_longitudes_f64x8);
301
+ _mm512_storeu_pd(results, distances_f64x8);
284
302
 
285
303
  a_lats += 8, a_lons += 8, b_lats += 8, b_lons += 8, results += 8, n -= 8;
286
304
  }
@@ -288,56 +306,60 @@ NK_PUBLIC void nk_vincenty_f64_skylake( //
288
306
  // Handle remaining elements with masked operations
289
307
  if (n > 0) {
290
308
  __mmask8 mask = (__mmask8)_bzhi_u32(0xFF, n);
291
- __m512d first_latitudes = _mm512_maskz_loadu_pd(mask, a_lats);
292
- __m512d first_longitudes = _mm512_maskz_loadu_pd(mask, a_lons);
293
- __m512d second_latitudes = _mm512_maskz_loadu_pd(mask, b_lats);
294
- __m512d second_longitudes = _mm512_maskz_loadu_pd(mask, b_lons);
295
-
296
- __m512d distances = nk_vincenty_f64x8_skylake_(first_latitudes, first_longitudes, second_latitudes,
297
- second_longitudes);
298
- _mm512_mask_storeu_pd(results, mask, distances);
309
+ __m512d first_latitudes_f64x8 = _mm512_maskz_loadu_pd(mask, a_lats);
310
+ __m512d first_longitudes_f64x8 = _mm512_maskz_loadu_pd(mask, a_lons);
311
+ __m512d second_latitudes_f64x8 = _mm512_maskz_loadu_pd(mask, b_lats);
312
+ __m512d second_longitudes_f64x8 = _mm512_maskz_loadu_pd(mask, b_lons);
313
+
314
+ __m512d distances_f64x8 = nk_vincenty_f64x8_skylake_(first_latitudes_f64x8, first_longitudes_f64x8,
315
+ second_latitudes_f64x8, second_longitudes_f64x8);
316
+ _mm512_mask_storeu_pd(results, mask, distances_f64x8);
299
317
  }
300
318
  }
301
319
 
302
- NK_INTERNAL __m512 nk_haversine_f32x16_skylake_( //
303
- __m512 first_latitudes, __m512 first_longitudes, //
304
- __m512 second_latitudes, __m512 second_longitudes) {
320
+ NK_INTERNAL __m512 nk_haversine_f32x16_skylake_( //
321
+ __m512 first_latitudes_f32x16, __m512 first_longitudes_f32x16, //
322
+ __m512 second_latitudes_f32x16, __m512 second_longitudes_f32x16) {
305
323
 
306
- __m512 const earth_radius = _mm512_set1_ps((float)NK_EARTH_MEDIATORIAL_RADIUS);
307
- __m512 const half = _mm512_set1_ps(0.5f);
308
- __m512 const one = _mm512_set1_ps(1.0f);
309
- __m512 const two = _mm512_set1_ps(2.0f);
324
+ __m512 const earth_radius_f32x16 = _mm512_set1_ps((float)NK_EARTH_MEDIATORIAL_RADIUS);
325
+ __m512 const half_f32x16 = _mm512_set1_ps(0.5f);
326
+ __m512 const one_f32x16 = _mm512_set1_ps(1.0f);
327
+ __m512 const two_f32x16 = _mm512_set1_ps(2.0f);
310
328
 
311
- __m512 latitude_delta = _mm512_sub_ps(second_latitudes, first_latitudes);
312
- __m512 longitude_delta = _mm512_sub_ps(second_longitudes, first_longitudes);
329
+ __m512 latitude_delta_f32x16 = _mm512_sub_ps(second_latitudes_f32x16, first_latitudes_f32x16);
330
+ __m512 longitude_delta_f32x16 = _mm512_sub_ps(second_longitudes_f32x16, first_longitudes_f32x16);
313
331
 
314
332
  // Haversine terms: sin²(Δ/2)
315
- __m512 latitude_delta_half = _mm512_mul_ps(latitude_delta, half);
316
- __m512 longitude_delta_half = _mm512_mul_ps(longitude_delta, half);
317
- __m512 sin_latitude_delta_half = nk_sin_f32x16_skylake_(latitude_delta_half);
318
- __m512 sin_longitude_delta_half = nk_sin_f32x16_skylake_(longitude_delta_half);
319
- __m512 sin_squared_latitude_delta_half = _mm512_mul_ps(sin_latitude_delta_half, sin_latitude_delta_half);
320
- __m512 sin_squared_longitude_delta_half = _mm512_mul_ps(sin_longitude_delta_half, sin_longitude_delta_half);
333
+ __m512 latitude_delta_half_f32x16 = _mm512_mul_ps(latitude_delta_f32x16, half_f32x16);
334
+ __m512 longitude_delta_half_f32x16 = _mm512_mul_ps(longitude_delta_f32x16, half_f32x16);
335
+ __m512 sin_latitude_delta_half_f32x16 = nk_sin_f32x16_skylake_(latitude_delta_half_f32x16);
336
+ __m512 sin_longitude_delta_half_f32x16 = nk_sin_f32x16_skylake_(longitude_delta_half_f32x16);
337
+ __m512 sin_squared_latitude_delta_half_f32x16 = _mm512_mul_ps(sin_latitude_delta_half_f32x16,
338
+ sin_latitude_delta_half_f32x16);
339
+ __m512 sin_squared_longitude_delta_half_f32x16 = _mm512_mul_ps(sin_longitude_delta_half_f32x16,
340
+ sin_longitude_delta_half_f32x16);
321
341
 
322
342
  // Latitude cosine product
323
- __m512 cos_first_latitude = nk_cos_f32x16_skylake_(first_latitudes);
324
- __m512 cos_second_latitude = nk_cos_f32x16_skylake_(second_latitudes);
325
- __m512 cos_latitude_product = _mm512_mul_ps(cos_first_latitude, cos_second_latitude);
343
+ __m512 cos_first_latitude_f32x16 = nk_cos_f32x16_skylake_(first_latitudes_f32x16);
344
+ __m512 cos_second_latitude_f32x16 = nk_cos_f32x16_skylake_(second_latitudes_f32x16);
345
+ __m512 cos_latitude_product_f32x16 = _mm512_mul_ps(cos_first_latitude_f32x16, cos_second_latitude_f32x16);
326
346
 
327
347
  // a = sin²(Δlat/2) + cos(lat1) × cos(lat2) × sin²(Δlon/2)
328
- __m512 haversine_term = _mm512_add_ps(sin_squared_latitude_delta_half,
329
- _mm512_mul_ps(cos_latitude_product, sin_squared_longitude_delta_half));
348
+ __m512 haversine_term_f32x16 = _mm512_add_ps(
349
+ sin_squared_latitude_delta_half_f32x16,
350
+ _mm512_mul_ps(cos_latitude_product_f32x16, sin_squared_longitude_delta_half_f32x16));
330
351
 
331
352
  // Clamp to [0, 1] to avoid NaN from sqrt of negative numbers (due to floating point errors)
332
- __m512 zero = _mm512_setzero_ps();
333
- haversine_term = _mm512_max_ps(zero, _mm512_min_ps(one, haversine_term));
353
+ __m512 zero_f32x16 = _mm512_setzero_ps();
354
+ haversine_term_f32x16 = _mm512_max_ps(zero_f32x16, _mm512_min_ps(one_f32x16, haversine_term_f32x16));
334
355
 
335
356
  // Central angle: c = 2 × atan2(√a, √(1-a))
336
- __m512 sqrt_haversine = _mm512_sqrt_ps(haversine_term);
337
- __m512 sqrt_complement = _mm512_sqrt_ps(_mm512_sub_ps(one, haversine_term));
338
- __m512 central_angle = _mm512_mul_ps(two, nk_atan2_f32x16_skylake_(sqrt_haversine, sqrt_complement));
357
+ __m512 sqrt_haversine_f32x16 = _mm512_sqrt_ps(haversine_term_f32x16);
358
+ __m512 sqrt_complement_f32x16 = _mm512_sqrt_ps(_mm512_sub_ps(one_f32x16, haversine_term_f32x16));
359
+ __m512 central_angle_f32x16 = _mm512_mul_ps(
360
+ two_f32x16, nk_atan2_f32x16_skylake_(sqrt_haversine_f32x16, sqrt_complement_f32x16));
339
361
 
340
- return _mm512_mul_ps(earth_radius, central_angle);
362
+ return _mm512_mul_ps(earth_radius_f32x16, central_angle_f32x16);
341
363
  }
342
364
 
343
365
  NK_PUBLIC void nk_haversine_f32_skylake( //
@@ -346,14 +368,14 @@ NK_PUBLIC void nk_haversine_f32_skylake( //
346
368
  nk_size_t n, nk_f32_t *results) {
347
369
 
348
370
  while (n >= 16) {
349
- __m512 first_latitudes = _mm512_loadu_ps(a_lats);
350
- __m512 first_longitudes = _mm512_loadu_ps(a_lons);
351
- __m512 second_latitudes = _mm512_loadu_ps(b_lats);
352
- __m512 second_longitudes = _mm512_loadu_ps(b_lons);
371
+ __m512 first_latitudes_f32x16 = _mm512_loadu_ps(a_lats);
372
+ __m512 first_longitudes_f32x16 = _mm512_loadu_ps(a_lons);
373
+ __m512 second_latitudes_f32x16 = _mm512_loadu_ps(b_lats);
374
+ __m512 second_longitudes_f32x16 = _mm512_loadu_ps(b_lons);
353
375
 
354
- __m512 distances = nk_haversine_f32x16_skylake_(first_latitudes, first_longitudes, second_latitudes,
355
- second_longitudes);
356
- _mm512_storeu_ps(results, distances);
376
+ __m512 distances_f32x16 = nk_haversine_f32x16_skylake_(first_latitudes_f32x16, first_longitudes_f32x16,
377
+ second_latitudes_f32x16, second_longitudes_f32x16);
378
+ _mm512_storeu_ps(results, distances_f32x16);
357
379
 
358
380
  a_lats += 16, a_lons += 16, b_lats += 16, b_lons += 16, results += 16, n -= 16;
359
381
  }
@@ -361,14 +383,14 @@ NK_PUBLIC void nk_haversine_f32_skylake( //
361
383
  // Handle remaining elements with masked operations
362
384
  if (n > 0) {
363
385
  __mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, n);
364
- __m512 first_latitudes = _mm512_maskz_loadu_ps(mask, a_lats);
365
- __m512 first_longitudes = _mm512_maskz_loadu_ps(mask, a_lons);
366
- __m512 second_latitudes = _mm512_maskz_loadu_ps(mask, b_lats);
367
- __m512 second_longitudes = _mm512_maskz_loadu_ps(mask, b_lons);
368
-
369
- __m512 distances = nk_haversine_f32x16_skylake_(first_latitudes, first_longitudes, second_latitudes,
370
- second_longitudes);
371
- _mm512_mask_storeu_ps(results, mask, distances);
386
+ __m512 first_latitudes_f32x16 = _mm512_maskz_loadu_ps(mask, a_lats);
387
+ __m512 first_longitudes_f32x16 = _mm512_maskz_loadu_ps(mask, a_lons);
388
+ __m512 second_latitudes_f32x16 = _mm512_maskz_loadu_ps(mask, b_lats);
389
+ __m512 second_longitudes_f32x16 = _mm512_maskz_loadu_ps(mask, b_lons);
390
+
391
+ __m512 distances_f32x16 = nk_haversine_f32x16_skylake_(first_latitudes_f32x16, first_longitudes_f32x16,
392
+ second_latitudes_f32x16, second_longitudes_f32x16);
393
+ _mm512_mask_storeu_ps(results, mask, distances_f32x16);
372
394
  }
373
395
  }
374
396
 
@@ -376,158 +398,172 @@ NK_PUBLIC void nk_haversine_f32_skylake( //
376
398
  * @brief AVX-512 helper for Vincenty's geodesic distance on 16 f32 point pairs.
377
399
  * @note This is a true SIMD implementation using masked convergence tracking.
378
400
  */
379
- NK_INTERNAL __m512 nk_vincenty_f32x16_skylake_( //
380
- __m512 first_latitudes, __m512 first_longitudes, //
381
- __m512 second_latitudes, __m512 second_longitudes) {
382
-
383
- __m512 const equatorial_radius = _mm512_set1_ps((float)NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
384
- __m512 const polar_radius = _mm512_set1_ps((float)NK_EARTH_ELLIPSOID_POLAR_RADIUS);
385
- __m512 const flattening = _mm512_set1_ps(1.0f / (float)NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
386
- __m512 const convergence_threshold = _mm512_set1_ps(NK_VINCENTY_CONVERGENCE_THRESHOLD_F32);
387
- __m512 const one = _mm512_set1_ps(1.0f);
388
- __m512 const two = _mm512_set1_ps(2.0f);
389
- __m512 const three = _mm512_set1_ps(3.0f);
390
- __m512 const four = _mm512_set1_ps(4.0f);
391
- __m512 const six = _mm512_set1_ps(6.0f);
392
- __m512 const sixteen = _mm512_set1_ps(16.0f);
401
+ NK_INTERNAL __m512 nk_vincenty_f32x16_skylake_( //
402
+ __m512 first_latitudes_f32x16, __m512 first_longitudes_f32x16, //
403
+ __m512 second_latitudes_f32x16, __m512 second_longitudes_f32x16) {
404
+
405
+ __m512 const equatorial_radius_f32x16 = _mm512_set1_ps((float)NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
406
+ __m512 const polar_radius_f32x16 = _mm512_set1_ps((float)NK_EARTH_ELLIPSOID_POLAR_RADIUS);
407
+ __m512 const flattening_f32x16 = _mm512_set1_ps(1.0f / (float)NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
408
+ __m512 const convergence_threshold_f32x16 = _mm512_set1_ps(NK_VINCENTY_CONVERGENCE_THRESHOLD_F32);
409
+ __m512 const one_f32x16 = _mm512_set1_ps(1.0f);
410
+ __m512 const two_f32x16 = _mm512_set1_ps(2.0f);
411
+ __m512 const three_f32x16 = _mm512_set1_ps(3.0f);
412
+ __m512 const four_f32x16 = _mm512_set1_ps(4.0f);
413
+ __m512 const six_f32x16 = _mm512_set1_ps(6.0f);
414
+ __m512 const sixteen_f32x16 = _mm512_set1_ps(16.0f);
393
415
 
394
416
  // Longitude difference
395
- __m512 longitude_difference = _mm512_sub_ps(second_longitudes, first_longitudes);
417
+ __m512 longitude_difference_f32x16 = _mm512_sub_ps(second_longitudes_f32x16, first_longitudes_f32x16);
396
418
 
397
419
  // Reduced latitudes: tan(U) = (1-f) * tan(lat)
398
- __m512 one_minus_f = _mm512_sub_ps(one, flattening);
399
- __m512 tan_first = _mm512_div_ps(nk_sin_f32x16_skylake_(first_latitudes), nk_cos_f32x16_skylake_(first_latitudes));
400
- __m512 tan_second = _mm512_div_ps(nk_sin_f32x16_skylake_(second_latitudes),
401
- nk_cos_f32x16_skylake_(second_latitudes));
402
- __m512 tan_reduced_first = _mm512_mul_ps(one_minus_f, tan_first);
403
- __m512 tan_reduced_second = _mm512_mul_ps(one_minus_f, tan_second);
420
+ __m512 one_minus_f_f32x16 = _mm512_sub_ps(one_f32x16, flattening_f32x16);
421
+ __m512 tan_first_f32x16 = _mm512_div_ps(nk_sin_f32x16_skylake_(first_latitudes_f32x16),
422
+ nk_cos_f32x16_skylake_(first_latitudes_f32x16));
423
+ __m512 tan_second_f32x16 = _mm512_div_ps(nk_sin_f32x16_skylake_(second_latitudes_f32x16),
424
+ nk_cos_f32x16_skylake_(second_latitudes_f32x16));
425
+ __m512 tan_reduced_first_f32x16 = _mm512_mul_ps(one_minus_f_f32x16, tan_first_f32x16);
426
+ __m512 tan_reduced_second_f32x16 = _mm512_mul_ps(one_minus_f_f32x16, tan_second_f32x16);
404
427
 
405
428
  // cos(U) = 1/√(1 + tan²(U)), sin(U) = tan(U) × cos(U)
406
- __m512 cos_reduced_first = _mm512_div_ps(
407
- one, _mm512_sqrt_ps(_mm512_fmadd_ps(tan_reduced_first, tan_reduced_first, one)));
408
- __m512 sin_reduced_first = _mm512_mul_ps(tan_reduced_first, cos_reduced_first);
409
- __m512 cos_reduced_second = _mm512_div_ps(
410
- one, _mm512_sqrt_ps(_mm512_fmadd_ps(tan_reduced_second, tan_reduced_second, one)));
411
- __m512 sin_reduced_second = _mm512_mul_ps(tan_reduced_second, cos_reduced_second);
412
-
413
- // Initialize lambda and tracking variables
414
- __m512 lambda = longitude_difference;
415
- __m512 sin_angular_distance, cos_angular_distance, angular_distance;
416
- __m512 sin_azimuth, cos_squared_azimuth, cos_double_angular_midpoint;
429
+ __m512 cos_reduced_first_f32x16 = _mm512_div_ps(
430
+ one_f32x16, _mm512_sqrt_ps(_mm512_fmadd_ps(tan_reduced_first_f32x16, tan_reduced_first_f32x16, one_f32x16)));
431
+ __m512 sin_reduced_first_f32x16 = _mm512_mul_ps(tan_reduced_first_f32x16, cos_reduced_first_f32x16);
432
+ __m512 cos_reduced_second_f32x16 = _mm512_div_ps(
433
+ one_f32x16, _mm512_sqrt_ps(_mm512_fmadd_ps(tan_reduced_second_f32x16, tan_reduced_second_f32x16, one_f32x16)));
434
+ __m512 sin_reduced_second_f32x16 = _mm512_mul_ps(tan_reduced_second_f32x16, cos_reduced_second_f32x16);
435
+
436
+ // Initialize lambda_f32x16 and tracking variables
437
+ __m512 lambda_f32x16 = longitude_difference_f32x16;
438
+ __m512 sin_angular_distance_f32x16, cos_angular_distance_f32x16, angular_distance_f32x16;
439
+ __m512 sin_azimuth_f32x16, cos_squared_azimuth_f32x16, cos_double_angular_midpoint_f32x16;
417
440
 
418
441
  // Track convergence and coincident points
419
442
  __mmask16 converged_mask = 0;
420
443
  __mmask16 coincident_mask = 0;
421
444
 
422
445
  for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS && converged_mask != 0xFFFF; ++iteration) {
423
- __m512 sin_lambda = nk_sin_f32x16_skylake_(lambda);
424
- __m512 cos_lambda = nk_cos_f32x16_skylake_(lambda);
425
-
426
- // sin²(angular_distance) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
427
- __m512 cross_term = _mm512_mul_ps(cos_reduced_second, sin_lambda);
428
- __m512 mixed_term = _mm512_sub_ps(
429
- _mm512_mul_ps(cos_reduced_first, sin_reduced_second),
430
- _mm512_mul_ps(_mm512_mul_ps(sin_reduced_first, cos_reduced_second), cos_lambda));
431
- __m512 sin_angular_dist_sq = _mm512_fmadd_ps(cross_term, cross_term, _mm512_mul_ps(mixed_term, mixed_term));
432
- sin_angular_distance = _mm512_sqrt_ps(sin_angular_dist_sq);
433
-
434
- // Check for coincident points (sin_angular_distance ≈ 0)
435
- coincident_mask = _mm512_cmp_ps_mask(sin_angular_distance, _mm512_set1_ps(1e-7f), _CMP_LT_OS);
436
-
437
- // cos(angular_distance) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
438
- cos_angular_distance = _mm512_fmadd_ps(_mm512_mul_ps(cos_reduced_first, cos_reduced_second), cos_lambda,
439
- _mm512_mul_ps(sin_reduced_first, sin_reduced_second));
440
-
441
- // angular_distance = atan2(sin, cos)
442
- angular_distance = nk_atan2_f32x16_skylake_(sin_angular_distance, cos_angular_distance);
443
-
444
- // sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance)
446
+ __m512 sin_lambda_f32x16 = nk_sin_f32x16_skylake_(lambda_f32x16);
447
+ __m512 cos_lambda_f32x16 = nk_cos_f32x16_skylake_(lambda_f32x16);
448
+
449
+ // sin²(angular_distance_f32x16) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
450
+ __m512 cross_term_f32x16 = _mm512_mul_ps(cos_reduced_second_f32x16, sin_lambda_f32x16);
451
+ __m512 mixed_term_f32x16 = _mm512_sub_ps(
452
+ _mm512_mul_ps(cos_reduced_first_f32x16, sin_reduced_second_f32x16),
453
+ _mm512_mul_ps(_mm512_mul_ps(sin_reduced_first_f32x16, cos_reduced_second_f32x16), cos_lambda_f32x16));
454
+ __m512 sin_angular_dist_sq_f32x16 = _mm512_fmadd_ps(cross_term_f32x16, cross_term_f32x16,
455
+ _mm512_mul_ps(mixed_term_f32x16, mixed_term_f32x16));
456
+ sin_angular_distance_f32x16 = _mm512_sqrt_ps(sin_angular_dist_sq_f32x16);
457
+
458
+ // Check for coincident points (sin_angular_distance_f32x16 ≈ 0)
459
+ coincident_mask = _mm512_cmp_ps_mask(sin_angular_distance_f32x16, _mm512_set1_ps(1e-7f), _CMP_LT_OS);
460
+
461
+ // cos(angular_distance_f32x16) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
462
+ cos_angular_distance_f32x16 = _mm512_fmadd_ps(
463
+ _mm512_mul_ps(cos_reduced_first_f32x16, cos_reduced_second_f32x16), cos_lambda_f32x16,
464
+ _mm512_mul_ps(sin_reduced_first_f32x16, sin_reduced_second_f32x16));
465
+
466
+ // angular_distance_f32x16 = atan2(sin, cos)
467
+ angular_distance_f32x16 = nk_atan2_f32x16_skylake_(sin_angular_distance_f32x16, cos_angular_distance_f32x16);
468
+
469
+ // sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance_f32x16)
445
470
  // Use masked divide: zero result for coincident lanes, avoids division by zero
446
- sin_azimuth = _mm512_maskz_div_ps(
471
+ sin_azimuth_f32x16 = _mm512_maskz_div_ps(
447
472
  _knot_mask16(coincident_mask),
448
- _mm512_mul_ps(_mm512_mul_ps(cos_reduced_first, cos_reduced_second), sin_lambda), sin_angular_distance);
449
- cos_squared_azimuth = _mm512_sub_ps(one, _mm512_mul_ps(sin_azimuth, sin_azimuth));
473
+ _mm512_mul_ps(_mm512_mul_ps(cos_reduced_first_f32x16, cos_reduced_second_f32x16), sin_lambda_f32x16),
474
+ sin_angular_distance_f32x16);
475
+ cos_squared_azimuth_f32x16 = _mm512_sub_ps(one_f32x16, _mm512_mul_ps(sin_azimuth_f32x16, sin_azimuth_f32x16));
450
476
 
451
477
  // Handle equatorial case: cos²α = 0
452
- __mmask16 equatorial_mask = _mm512_cmp_ps_mask(cos_squared_azimuth, _mm512_set1_ps(1e-7f), _CMP_LT_OS);
478
+ __mmask16 equatorial_mask = _mm512_cmp_ps_mask(cos_squared_azimuth_f32x16, _mm512_set1_ps(1e-7f), _CMP_LT_OS);
453
479
 
454
480
  // cos(2σₘ) = cos(σ) - 2 × sin(U₁) × sin(U₂) / cos²(α)
455
- // Use masked divide: for equatorial lanes, quotient = cos_angular_distance (passthrough),
481
+ // Use masked divide: for equatorial lanes, quotient_f32x16 = cos_angular_distance_f32x16 (passthrough),
456
482
  // so subtraction yields zero. Avoids division by zero.
457
- __m512 sin_product = _mm512_mul_ps(sin_reduced_first, sin_reduced_second);
458
- __m512 quotient = _mm512_mask_div_ps(cos_angular_distance, _knot_mask16(equatorial_mask),
459
- _mm512_mul_ps(two, sin_product), cos_squared_azimuth);
460
- cos_double_angular_midpoint = _mm512_sub_ps(cos_angular_distance, quotient);
483
+ __m512 sin_product_f32x16 = _mm512_mul_ps(sin_reduced_first_f32x16, sin_reduced_second_f32x16);
484
+ __m512 quotient_f32x16 = _mm512_mask_div_ps(cos_angular_distance_f32x16, _knot_mask16(equatorial_mask),
485
+ _mm512_mul_ps(two_f32x16, sin_product_f32x16),
486
+ cos_squared_azimuth_f32x16);
487
+ cos_double_angular_midpoint_f32x16 = _mm512_sub_ps(cos_angular_distance_f32x16, quotient_f32x16);
461
488
 
462
489
  // C = f/16 * cos²α * (4 + f*(4 - 3*cos²α))
463
- __m512 correction_factor = _mm512_mul_ps(
464
- _mm512_div_ps(flattening, sixteen),
465
- _mm512_mul_ps(cos_squared_azimuth,
466
- _mm512_fmadd_ps(flattening, _mm512_fnmadd_ps(three, cos_squared_azimuth, four), four)));
490
+ __m512 correction_factor_f32x16 = _mm512_mul_ps(
491
+ _mm512_div_ps(flattening_f32x16, sixteen_f32x16),
492
+ _mm512_mul_ps(
493
+ cos_squared_azimuth_f32x16,
494
+ _mm512_fmadd_ps(flattening_f32x16,
495
+ _mm512_fnmadd_ps(three_f32x16, cos_squared_azimuth_f32x16, four_f32x16), four_f32x16)));
467
496
 
468
497
  // λ' = L + (1-C) × f × sin(α) × (σ + C × sin(σ) × (cos(2σₘ) + C × cos(σ) × (-1 + 2 × cos²(2σₘ))))
469
- __m512 cos_2sm_sq = _mm512_mul_ps(cos_double_angular_midpoint, cos_double_angular_midpoint);
470
- // innermost = -1 + 2 × cos²(2σₘ)
471
- __m512 innermost = _mm512_fmadd_ps(two, cos_2sm_sq, _mm512_set1_ps(-1.0f));
472
- // middle = cos(2σₘ) + C × cos(σ) × innermost
473
- __m512 middle = _mm512_fmadd_ps(_mm512_mul_ps(correction_factor, cos_angular_distance), innermost,
474
- cos_double_angular_midpoint);
475
- // inner = C × sin(σ) × middle
476
- __m512 inner = _mm512_mul_ps(_mm512_mul_ps(correction_factor, sin_angular_distance), middle);
477
-
478
- // λ' = L + (1-C) * f * sin_α * (σ + inner)
479
- __m512 lambda_new = _mm512_fmadd_ps(
480
- _mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(one, correction_factor), flattening), sin_azimuth),
481
- _mm512_add_ps(angular_distance, inner), longitude_difference);
498
+ __m512 cos_2sm_sq_f32x16 = _mm512_mul_ps(cos_double_angular_midpoint_f32x16,
499
+ cos_double_angular_midpoint_f32x16);
500
+ // innermost_f32x16 = -1 + 2 × cos²(2σₘ)
501
+ __m512 innermost_f32x16 = _mm512_fmadd_ps(two_f32x16, cos_2sm_sq_f32x16, _mm512_set1_ps(-1.0f));
502
+ // middle_f32x16 = cos(2σₘ) + C × cos(σ) × innermost_f32x16
503
+ __m512 middle_f32x16 = _mm512_fmadd_ps(_mm512_mul_ps(correction_factor_f32x16, cos_angular_distance_f32x16),
504
+ innermost_f32x16, cos_double_angular_midpoint_f32x16);
505
+ // inner_f32x16 = C × sin(σ) × middle_f32x16
506
+ __m512 inner_f32x16 = _mm512_mul_ps(_mm512_mul_ps(correction_factor_f32x16, sin_angular_distance_f32x16),
507
+ middle_f32x16);
508
+
509
+ // λ' = L + (1-C) * f * sin_α * (σ + inner_f32x16)
510
+ __m512 lambda_new_f32x16 = _mm512_fmadd_ps(
511
+ _mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(one_f32x16, correction_factor_f32x16), flattening_f32x16),
512
+ sin_azimuth_f32x16),
513
+ _mm512_add_ps(angular_distance_f32x16, inner_f32x16), longitude_difference_f32x16);
482
514
 
483
515
  // Check convergence: |λ - λ'| < threshold
484
- __m512 lambda_diff = _mm512_abs_ps(_mm512_sub_ps(lambda_new, lambda));
485
- converged_mask = _mm512_cmp_ps_mask(lambda_diff, convergence_threshold, _CMP_LT_OS);
516
+ __m512 lambda_diff_f32x16 = _mm512_abs_ps(_mm512_sub_ps(lambda_new_f32x16, lambda_f32x16));
517
+ converged_mask = _mm512_cmp_ps_mask(lambda_diff_f32x16, convergence_threshold_f32x16, _CMP_LT_OS);
486
518
 
487
- lambda = lambda_new;
519
+ lambda_f32x16 = lambda_new_f32x16;
488
520
  }
489
521
 
490
522
  // Final distance calculation
491
523
  // u² = cos²α * (a² - b²) / b²
492
- __m512 a_sq = _mm512_mul_ps(equatorial_radius, equatorial_radius);
493
- __m512 b_sq = _mm512_mul_ps(polar_radius, polar_radius);
494
- __m512 u_squared = _mm512_div_ps(_mm512_mul_ps(cos_squared_azimuth, _mm512_sub_ps(a_sq, b_sq)), b_sq);
524
+ __m512 a_sq_f32x16 = _mm512_mul_ps(equatorial_radius_f32x16, equatorial_radius_f32x16);
525
+ __m512 b_sq_f32x16 = _mm512_mul_ps(polar_radius_f32x16, polar_radius_f32x16);
526
+ __m512 u_squared_f32x16 = _mm512_div_ps(
527
+ _mm512_mul_ps(cos_squared_azimuth_f32x16, _mm512_sub_ps(a_sq_f32x16, b_sq_f32x16)), b_sq_f32x16);
495
528
 
496
529
  // A = 1 + u²/16384 * (4096 + u²*(-768 + u²*(320 - 175*u²)))
497
- __m512 series_a = _mm512_fmadd_ps(u_squared, _mm512_set1_ps(-175.0f), _mm512_set1_ps(320.0f));
498
- series_a = _mm512_fmadd_ps(u_squared, series_a, _mm512_set1_ps(-768.0f));
499
- series_a = _mm512_fmadd_ps(u_squared, series_a, _mm512_set1_ps(4096.0f));
500
- series_a = _mm512_fmadd_ps(_mm512_div_ps(u_squared, _mm512_set1_ps(16384.0f)), series_a, one);
530
+ __m512 series_a_f32x16 = _mm512_fmadd_ps(u_squared_f32x16, _mm512_set1_ps(-175.0f), _mm512_set1_ps(320.0f));
531
+ series_a_f32x16 = _mm512_fmadd_ps(u_squared_f32x16, series_a_f32x16, _mm512_set1_ps(-768.0f));
532
+ series_a_f32x16 = _mm512_fmadd_ps(u_squared_f32x16, series_a_f32x16, _mm512_set1_ps(4096.0f));
533
+ series_a_f32x16 = _mm512_fmadd_ps(_mm512_div_ps(u_squared_f32x16, _mm512_set1_ps(16384.0f)), series_a_f32x16,
534
+ one_f32x16);
501
535
 
502
536
  // B = u²/1024 * (256 + u²*(-128 + u²*(74 - 47*u²)))
503
- __m512 series_b = _mm512_fmadd_ps(u_squared, _mm512_set1_ps(-47.0f), _mm512_set1_ps(74.0f));
504
- series_b = _mm512_fmadd_ps(u_squared, series_b, _mm512_set1_ps(-128.0f));
505
- series_b = _mm512_fmadd_ps(u_squared, series_b, _mm512_set1_ps(256.0f));
506
- series_b = _mm512_mul_ps(_mm512_div_ps(u_squared, _mm512_set1_ps(1024.0f)), series_b);
537
+ __m512 series_b_f32x16 = _mm512_fmadd_ps(u_squared_f32x16, _mm512_set1_ps(-47.0f), _mm512_set1_ps(74.0f));
538
+ series_b_f32x16 = _mm512_fmadd_ps(u_squared_f32x16, series_b_f32x16, _mm512_set1_ps(-128.0f));
539
+ series_b_f32x16 = _mm512_fmadd_ps(u_squared_f32x16, series_b_f32x16, _mm512_set1_ps(256.0f));
540
+ series_b_f32x16 = _mm512_mul_ps(_mm512_div_ps(u_squared_f32x16, _mm512_set1_ps(1024.0f)), series_b_f32x16);
507
541
 
508
542
  // Δσ = B × sin(σ) × (cos(2σₘ) +
509
543
  // B/4 × (cos(σ) × (-1 + 2 × cos²(2σₘ)) - B/6 × cos(2σₘ) × (-3 + 4 × sin²(σ)) × (-3 + 4 × cos²(2σₘ))))
510
- __m512 cos_2sm_sq = _mm512_mul_ps(cos_double_angular_midpoint, cos_double_angular_midpoint);
511
- __m512 sin_sq = _mm512_mul_ps(sin_angular_distance, sin_angular_distance);
512
- __m512 term1 = _mm512_fmadd_ps(two, cos_2sm_sq, _mm512_set1_ps(-1.0f));
513
- term1 = _mm512_mul_ps(cos_angular_distance, term1);
514
- __m512 term2 = _mm512_fmadd_ps(four, sin_sq, _mm512_set1_ps(-3.0f));
515
- __m512 term3 = _mm512_fmadd_ps(four, cos_2sm_sq, _mm512_set1_ps(-3.0f));
516
- term2 = _mm512_mul_ps(_mm512_mul_ps(_mm512_div_ps(series_b, six), cos_double_angular_midpoint),
517
- _mm512_mul_ps(term2, term3));
518
- __m512 delta_sigma = _mm512_mul_ps(
519
- series_b, _mm512_mul_ps(sin_angular_distance, _mm512_add_ps(cos_double_angular_midpoint,
520
- _mm512_mul_ps(_mm512_div_ps(series_b, four),
521
- _mm512_sub_ps(term1, term2)))));
544
+ __m512 cos_2sm_sq_f32x16 = _mm512_mul_ps(cos_double_angular_midpoint_f32x16, cos_double_angular_midpoint_f32x16);
545
+ __m512 sin_sq_f32x16 = _mm512_mul_ps(sin_angular_distance_f32x16, sin_angular_distance_f32x16);
546
+ __m512 term1_f32x16 = _mm512_fmadd_ps(two_f32x16, cos_2sm_sq_f32x16, _mm512_set1_ps(-1.0f));
547
+ term1_f32x16 = _mm512_mul_ps(cos_angular_distance_f32x16, term1_f32x16);
548
+ __m512 term2_f32x16 = _mm512_fmadd_ps(four_f32x16, sin_sq_f32x16, _mm512_set1_ps(-3.0f));
549
+ __m512 term3_f32x16 = _mm512_fmadd_ps(four_f32x16, cos_2sm_sq_f32x16, _mm512_set1_ps(-3.0f));
550
+ term2_f32x16 = _mm512_mul_ps(
551
+ _mm512_mul_ps(_mm512_div_ps(series_b_f32x16, six_f32x16), cos_double_angular_midpoint_f32x16),
552
+ _mm512_mul_ps(term2_f32x16, term3_f32x16));
553
+ __m512 delta_sigma_f32x16 = _mm512_mul_ps(
554
+ series_b_f32x16, _mm512_mul_ps(sin_angular_distance_f32x16,
555
+ _mm512_add_ps(cos_double_angular_midpoint_f32x16,
556
+ _mm512_mul_ps(_mm512_div_ps(series_b_f32x16, four_f32x16),
557
+ _mm512_sub_ps(term1_f32x16, term2_f32x16)))));
522
558
 
523
559
  // s = b * A * (σ - Δσ)
524
- __m512 distances = _mm512_mul_ps(_mm512_mul_ps(polar_radius, series_a),
525
- _mm512_sub_ps(angular_distance, delta_sigma));
560
+ __m512 distances_f32x16 = _mm512_mul_ps(_mm512_mul_ps(polar_radius_f32x16, series_a_f32x16),
561
+ _mm512_sub_ps(angular_distance_f32x16, delta_sigma_f32x16));
526
562
 
527
563
  // Set coincident points to zero
528
- distances = _mm512_mask_blend_ps(coincident_mask, distances, _mm512_setzero_ps());
564
+ distances_f32x16 = _mm512_mask_blend_ps(coincident_mask, distances_f32x16, _mm512_setzero_ps());
529
565
 
530
- return distances;
566
+ return distances_f32x16;
531
567
  }
532
568
 
533
569
  NK_PUBLIC void nk_vincenty_f32_skylake( //
@@ -536,14 +572,14 @@ NK_PUBLIC void nk_vincenty_f32_skylake( //
536
572
  nk_size_t n, nk_f32_t *results) {
537
573
 
538
574
  while (n >= 16) {
539
- __m512 first_latitudes = _mm512_loadu_ps(a_lats);
540
- __m512 first_longitudes = _mm512_loadu_ps(a_lons);
541
- __m512 second_latitudes = _mm512_loadu_ps(b_lats);
542
- __m512 second_longitudes = _mm512_loadu_ps(b_lons);
575
+ __m512 first_latitudes_f32x16 = _mm512_loadu_ps(a_lats);
576
+ __m512 first_longitudes_f32x16 = _mm512_loadu_ps(a_lons);
577
+ __m512 second_latitudes_f32x16 = _mm512_loadu_ps(b_lats);
578
+ __m512 second_longitudes_f32x16 = _mm512_loadu_ps(b_lons);
543
579
 
544
- __m512 distances = nk_vincenty_f32x16_skylake_(first_latitudes, first_longitudes, second_latitudes,
545
- second_longitudes);
546
- _mm512_storeu_ps(results, distances);
580
+ __m512 distances_f32x16 = nk_vincenty_f32x16_skylake_(first_latitudes_f32x16, first_longitudes_f32x16,
581
+ second_latitudes_f32x16, second_longitudes_f32x16);
582
+ _mm512_storeu_ps(results, distances_f32x16);
547
583
 
548
584
  a_lats += 16, a_lons += 16, b_lats += 16, b_lons += 16, results += 16, n -= 16;
549
585
  }
@@ -551,14 +587,14 @@ NK_PUBLIC void nk_vincenty_f32_skylake( //
551
587
  // Handle remaining elements with masked operations
552
588
  if (n > 0) {
553
589
  __mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, n);
554
- __m512 first_latitudes = _mm512_maskz_loadu_ps(mask, a_lats);
555
- __m512 first_longitudes = _mm512_maskz_loadu_ps(mask, a_lons);
556
- __m512 second_latitudes = _mm512_maskz_loadu_ps(mask, b_lats);
557
- __m512 second_longitudes = _mm512_maskz_loadu_ps(mask, b_lons);
558
-
559
- __m512 distances = nk_vincenty_f32x16_skylake_(first_latitudes, first_longitudes, second_latitudes,
560
- second_longitudes);
561
- _mm512_mask_storeu_ps(results, mask, distances);
590
+ __m512 first_latitudes_f32x16 = _mm512_maskz_loadu_ps(mask, a_lats);
591
+ __m512 first_longitudes_f32x16 = _mm512_maskz_loadu_ps(mask, a_lons);
592
+ __m512 second_latitudes_f32x16 = _mm512_maskz_loadu_ps(mask, b_lats);
593
+ __m512 second_longitudes_f32x16 = _mm512_maskz_loadu_ps(mask, b_lons);
594
+
595
+ __m512 distances_f32x16 = nk_vincenty_f32x16_skylake_(first_latitudes_f32x16, first_longitudes_f32x16,
596
+ second_latitudes_f32x16, second_longitudes_f32x16);
597
+ _mm512_mask_storeu_ps(results, mask, distances_f32x16);
562
598
  }
563
599
  }
564
600