numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,577 @@
1
+ /**
2
+ * @brief SIMD-accelerated Geospatial Distances for Skylake.
3
+ * @file include/numkong/geospatial/skylake.h
4
+ * @author Ash Vardanian
5
+ * @date February 6, 2026
6
+ *
7
+ * @sa include/numkong/geospatial.h
8
+ *
9
+ * @section geospatial_skylake_instructions Key AVX-512 Geospatial Instructions
10
+ *
11
+ * Intrinsic Instruction Ice Genoa
12
+ * _mm512_sqrt_ps VSQRTPS (ZMM, ZMM) 19c @ p05 15c @ p01
13
+ * _mm512_sqrt_pd VSQRTPD (ZMM, ZMM) 23c @ p05 21c @ p01
14
+ * _mm256_div_ps VDIVPS (YMM, YMM, YMM) 11c @ p0 11c @ p01
15
+ * _mm256_div_pd VDIVPD (YMM, YMM, YMM) 13c @ p0 13c @ p01
16
+ * _mm256_fmadd_ps VFMADD231PS (YMM, YMM, YMM) 4c @ p01 4c @ p01
17
+ * _mm256_fmadd_pd VFMADD231PD (YMM, YMM, YMM) 4c @ p01 4c @ p01
18
+ */
19
+ #ifndef NK_GEOSPATIAL_SKYLAKE_H
20
+ #define NK_GEOSPATIAL_SKYLAKE_H
21
+
22
+ #if NK_TARGET_X86_
23
+ #if NK_TARGET_SKYLAKE
24
+
25
+ #include "numkong/types.h"
26
+ #include "numkong/trigonometry/skylake.h" // `nk_sin_f64x8_skylake_`, `nk_cos_f64x8_skylake_`, `nk_atan2_f64x8_skylake_`, etc.
27
+
28
+ #if defined(__cplusplus)
29
+ extern "C" {
30
+ #endif
31
+
32
+ #if defined(__clang__)
33
+ #pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,avx512bw,avx512dq,f16c,fma,bmi,bmi2"))), \
34
+ apply_to = function)
35
+ #elif defined(__GNUC__)
36
+ #pragma GCC push_options
37
+ #pragma GCC target("avx2", "avx512f", "avx512vl", "avx512bw", "avx512dq", "f16c", "fma", "bmi", "bmi2")
38
+ #endif
39
+
40
+ NK_INTERNAL __m512d nk_haversine_f64x8_skylake_( //
41
+ __m512d first_latitudes, __m512d first_longitudes, //
42
+ __m512d second_latitudes, __m512d second_longitudes) {
43
+
44
+ __m512d const earth_radius = _mm512_set1_pd(NK_EARTH_MEDIATORIAL_RADIUS);
45
+ __m512d const half = _mm512_set1_pd(0.5);
46
+ __m512d const one = _mm512_set1_pd(1.0);
47
+ __m512d const two = _mm512_set1_pd(2.0);
48
+
49
+ __m512d latitude_delta = _mm512_sub_pd(second_latitudes, first_latitudes);
50
+ __m512d longitude_delta = _mm512_sub_pd(second_longitudes, first_longitudes);
51
+
52
+ // Haversine terms: sin²(Δ/2)
53
+ __m512d latitude_delta_half = _mm512_mul_pd(latitude_delta, half);
54
+ __m512d longitude_delta_half = _mm512_mul_pd(longitude_delta, half);
55
+ __m512d sin_latitude_delta_half = nk_sin_f64x8_skylake_(latitude_delta_half);
56
+ __m512d sin_longitude_delta_half = nk_sin_f64x8_skylake_(longitude_delta_half);
57
+ __m512d sin_squared_latitude_delta_half = _mm512_mul_pd(sin_latitude_delta_half, sin_latitude_delta_half);
58
+ __m512d sin_squared_longitude_delta_half = _mm512_mul_pd(sin_longitude_delta_half, sin_longitude_delta_half);
59
+
60
+ // Latitude cosine product
61
+ __m512d cos_first_latitude = nk_cos_f64x8_skylake_(first_latitudes);
62
+ __m512d cos_second_latitude = nk_cos_f64x8_skylake_(second_latitudes);
63
+ __m512d cos_latitude_product = _mm512_mul_pd(cos_first_latitude, cos_second_latitude);
64
+
65
+ // a = sin²(Δlat/2) + cos(lat1) × cos(lat2) × sin²(Δlon/2)
66
+ __m512d haversine_term = _mm512_add_pd(sin_squared_latitude_delta_half,
67
+ _mm512_mul_pd(cos_latitude_product, sin_squared_longitude_delta_half));
68
+ // Clamp haversine_term to [0, 1] to prevent NaN from sqrt of negative values
69
+ __m512d zero = _mm512_setzero_pd();
70
+ haversine_term = _mm512_max_pd(zero, _mm512_min_pd(one, haversine_term));
71
+
72
+ // Central angle: c = 2 × atan2(√a, √(1-a))
73
+ __m512d sqrt_haversine = _mm512_sqrt_pd(haversine_term);
74
+ __m512d sqrt_complement = _mm512_sqrt_pd(_mm512_sub_pd(one, haversine_term));
75
+ __m512d central_angle = _mm512_mul_pd(two, nk_atan2_f64x8_skylake_(sqrt_haversine, sqrt_complement));
76
+
77
+ return _mm512_mul_pd(earth_radius, central_angle);
78
+ }
79
+
80
+ NK_PUBLIC void nk_haversine_f64_skylake( //
81
+ nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
82
+ nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
83
+ nk_size_t n, nk_f64_t *results) {
84
+
85
+ while (n >= 8) {
86
+ __m512d first_latitudes = _mm512_loadu_pd(a_lats);
87
+ __m512d first_longitudes = _mm512_loadu_pd(a_lons);
88
+ __m512d second_latitudes = _mm512_loadu_pd(b_lats);
89
+ __m512d second_longitudes = _mm512_loadu_pd(b_lons);
90
+
91
+ __m512d distances = nk_haversine_f64x8_skylake_(first_latitudes, first_longitudes, second_latitudes,
92
+ second_longitudes);
93
+ _mm512_storeu_pd(results, distances);
94
+
95
+ a_lats += 8, a_lons += 8, b_lats += 8, b_lons += 8, results += 8, n -= 8;
96
+ }
97
+
98
+ // Handle remaining elements with masked operations
99
+ if (n > 0) {
100
+ __mmask8 mask = (__mmask8)_bzhi_u32(0xFF, n);
101
+ __m512d first_latitudes = _mm512_maskz_loadu_pd(mask, a_lats);
102
+ __m512d first_longitudes = _mm512_maskz_loadu_pd(mask, a_lons);
103
+ __m512d second_latitudes = _mm512_maskz_loadu_pd(mask, b_lats);
104
+ __m512d second_longitudes = _mm512_maskz_loadu_pd(mask, b_lons);
105
+
106
+ __m512d distances = nk_haversine_f64x8_skylake_(first_latitudes, first_longitudes, second_latitudes,
107
+ second_longitudes);
108
+ _mm512_mask_storeu_pd(results, mask, distances);
109
+ }
110
+ }
111
+
112
+ /**
113
+ * @brief AVX-512 helper for Vincenty's geodesic distance on 8 f64 point pairs.
114
+ * @note This is a true SIMD implementation using masked convergence tracking.
115
+ */
116
+ NK_INTERNAL __m512d nk_vincenty_f64x8_skylake_( //
117
+ __m512d first_latitudes, __m512d first_longitudes, //
118
+ __m512d second_latitudes, __m512d second_longitudes) {
119
+
120
+ __m512d const equatorial_radius = _mm512_set1_pd(NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
121
+ __m512d const polar_radius = _mm512_set1_pd(NK_EARTH_ELLIPSOID_POLAR_RADIUS);
122
+ __m512d const flattening = _mm512_set1_pd(1.0 / NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
123
+ __m512d const convergence_threshold = _mm512_set1_pd(NK_VINCENTY_CONVERGENCE_THRESHOLD_F64);
124
+ __m512d const one = _mm512_set1_pd(1.0);
125
+ __m512d const two = _mm512_set1_pd(2.0);
126
+ __m512d const three = _mm512_set1_pd(3.0);
127
+ __m512d const four = _mm512_set1_pd(4.0);
128
+ __m512d const six = _mm512_set1_pd(6.0);
129
+ __m512d const sixteen = _mm512_set1_pd(16.0);
130
+
131
+ // Longitude difference
132
+ __m512d longitude_difference = _mm512_sub_pd(second_longitudes, first_longitudes);
133
+
134
+ // Reduced latitudes: tan(U) = (1-f) * tan(lat)
135
+ __m512d one_minus_f = _mm512_sub_pd(one, flattening);
136
+ __m512d tan_first = _mm512_div_pd(nk_sin_f64x8_skylake_(first_latitudes), nk_cos_f64x8_skylake_(first_latitudes));
137
+ __m512d tan_second = _mm512_div_pd(nk_sin_f64x8_skylake_(second_latitudes),
138
+ nk_cos_f64x8_skylake_(second_latitudes));
139
+ __m512d tan_reduced_first = _mm512_mul_pd(one_minus_f, tan_first);
140
+ __m512d tan_reduced_second = _mm512_mul_pd(one_minus_f, tan_second);
141
+
142
+ // cos(U) = 1/√(1 + tan²(U)), sin(U) = tan(U) × cos(U)
143
+ __m512d cos_reduced_first = _mm512_div_pd(
144
+ one, _mm512_sqrt_pd(_mm512_fmadd_pd(tan_reduced_first, tan_reduced_first, one)));
145
+ __m512d sin_reduced_first = _mm512_mul_pd(tan_reduced_first, cos_reduced_first);
146
+ __m512d cos_reduced_second = _mm512_div_pd(
147
+ one, _mm512_sqrt_pd(_mm512_fmadd_pd(tan_reduced_second, tan_reduced_second, one)));
148
+ __m512d sin_reduced_second = _mm512_mul_pd(tan_reduced_second, cos_reduced_second);
149
+
150
+ // Initialize lambda and tracking variables
151
+ __m512d lambda = longitude_difference;
152
+ __m512d sin_angular_distance, cos_angular_distance, angular_distance;
153
+ __m512d sin_azimuth, cos_squared_azimuth, cos_double_angular_midpoint;
154
+
155
+ // Track convergence and coincident points
156
+ __mmask8 converged_mask = 0;
157
+ __mmask8 coincident_mask = 0;
158
+
159
+ for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS && converged_mask != 0xFF; ++iteration) {
160
+ __m512d sin_lambda = nk_sin_f64x8_skylake_(lambda);
161
+ __m512d cos_lambda = nk_cos_f64x8_skylake_(lambda);
162
+
163
+ // sin²(angular_distance) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
164
+ __m512d cross_term = _mm512_mul_pd(cos_reduced_second, sin_lambda);
165
+ __m512d mixed_term = _mm512_sub_pd(
166
+ _mm512_mul_pd(cos_reduced_first, sin_reduced_second),
167
+ _mm512_mul_pd(_mm512_mul_pd(sin_reduced_first, cos_reduced_second), cos_lambda));
168
+ __m512d sin_angular_dist_sq = _mm512_fmadd_pd(cross_term, cross_term, _mm512_mul_pd(mixed_term, mixed_term));
169
+ sin_angular_distance = _mm512_sqrt_pd(sin_angular_dist_sq);
170
+
171
+ // Check for coincident points (sin_angular_distance ≈ 0)
172
+ coincident_mask = _mm512_cmp_pd_mask(sin_angular_distance, _mm512_set1_pd(1e-15), _CMP_LT_OS);
173
+
174
+ // cos(angular_distance) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
175
+ cos_angular_distance = _mm512_fmadd_pd(_mm512_mul_pd(cos_reduced_first, cos_reduced_second), cos_lambda,
176
+ _mm512_mul_pd(sin_reduced_first, sin_reduced_second));
177
+
178
+ // angular_distance = atan2(sin, cos)
179
+ angular_distance = nk_atan2_f64x8_skylake_(sin_angular_distance, cos_angular_distance);
180
+
181
+ // sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance)
182
+ // Use masked divide: zero result for coincident lanes, avoids division by zero
183
+ sin_azimuth = _mm512_maskz_div_pd(
184
+ _knot_mask8(coincident_mask),
185
+ _mm512_mul_pd(_mm512_mul_pd(cos_reduced_first, cos_reduced_second), sin_lambda), sin_angular_distance);
186
+ cos_squared_azimuth = _mm512_sub_pd(one, _mm512_mul_pd(sin_azimuth, sin_azimuth));
187
+
188
+ // Handle equatorial case: cos²α = 0
189
+ __mmask8 equatorial_mask = _mm512_cmp_pd_mask(cos_squared_azimuth, _mm512_set1_pd(1e-15), _CMP_LT_OS);
190
+
191
+ // cos(2σₘ) = cos(σ) - 2 × sin(U₁) × sin(U₂) / cos²(α)
192
+ // Use masked divide: for equatorial lanes, quotient = cos_angular_distance (passthrough),
193
+ // so subtraction yields zero. Avoids division by zero.
194
+ __m512d sin_product = _mm512_mul_pd(sin_reduced_first, sin_reduced_second);
195
+ __m512d quotient = _mm512_mask_div_pd(cos_angular_distance, _knot_mask8(equatorial_mask),
196
+ _mm512_mul_pd(two, sin_product), cos_squared_azimuth);
197
+ cos_double_angular_midpoint = _mm512_sub_pd(cos_angular_distance, quotient);
198
+
199
+ // C = f/16 * cos²α * (4 + f*(4 - 3*cos²α))
200
+ __m512d correction_factor = _mm512_mul_pd(
201
+ _mm512_div_pd(flattening, sixteen),
202
+ _mm512_mul_pd(cos_squared_azimuth,
203
+ _mm512_fmadd_pd(flattening, _mm512_fnmadd_pd(three, cos_squared_azimuth, four), four)));
204
+
205
+ // λ' = L + (1-C) × f × sin(α) × (σ + C × sin(σ) × (cos(2σₘ) + C × cos(σ) × (-1 + 2 × cos²(2σₘ))))
206
+ __m512d cos_2sm_sq = _mm512_mul_pd(cos_double_angular_midpoint, cos_double_angular_midpoint);
207
+ // innermost = -1 + 2 × cos²(2σₘ)
208
+ __m512d innermost = _mm512_fmadd_pd(two, cos_2sm_sq, _mm512_set1_pd(-1.0));
209
+ // middle = cos(2σₘ) + C × cos(σ) × innermost
210
+ __m512d middle = _mm512_fmadd_pd(_mm512_mul_pd(correction_factor, cos_angular_distance), innermost,
211
+ cos_double_angular_midpoint);
212
+ // inner = C × sin(σ) × middle
213
+ __m512d inner = _mm512_mul_pd(_mm512_mul_pd(correction_factor, sin_angular_distance), middle);
214
+
215
+ // λ' = L + (1-C) * f * sin_α * (σ + inner)
216
+ __m512d lambda_new = _mm512_fmadd_pd(
217
+ _mm512_mul_pd(_mm512_mul_pd(_mm512_sub_pd(one, correction_factor), flattening), sin_azimuth),
218
+ _mm512_add_pd(angular_distance, inner), longitude_difference);
219
+
220
+ // Check convergence: |λ - λ'| < threshold
221
+ __m512d lambda_diff = _mm512_abs_pd(_mm512_sub_pd(lambda_new, lambda));
222
+ converged_mask = _mm512_cmp_pd_mask(lambda_diff, convergence_threshold, _CMP_LT_OS);
223
+
224
+ lambda = lambda_new;
225
+ }
226
+
227
+ // Final distance calculation
228
+ // u² = cos²α * (a² - b²) / b²
229
+ __m512d a_sq = _mm512_mul_pd(equatorial_radius, equatorial_radius);
230
+ __m512d b_sq = _mm512_mul_pd(polar_radius, polar_radius);
231
+ __m512d u_squared = _mm512_div_pd(_mm512_mul_pd(cos_squared_azimuth, _mm512_sub_pd(a_sq, b_sq)), b_sq);
232
+
233
+ // A = 1 + u²/16384 * (4096 + u²*(-768 + u²*(320 - 175*u²)))
234
+ __m512d series_a = _mm512_fmadd_pd(u_squared, _mm512_set1_pd(-175.0), _mm512_set1_pd(320.0));
235
+ series_a = _mm512_fmadd_pd(u_squared, series_a, _mm512_set1_pd(-768.0));
236
+ series_a = _mm512_fmadd_pd(u_squared, series_a, _mm512_set1_pd(4096.0));
237
+ series_a = _mm512_fmadd_pd(_mm512_div_pd(u_squared, _mm512_set1_pd(16384.0)), series_a, one);
238
+
239
+ // B = u²/1024 * (256 + u²*(-128 + u²*(74 - 47*u²)))
240
+ __m512d series_b = _mm512_fmadd_pd(u_squared, _mm512_set1_pd(-47.0), _mm512_set1_pd(74.0));
241
+ series_b = _mm512_fmadd_pd(u_squared, series_b, _mm512_set1_pd(-128.0));
242
+ series_b = _mm512_fmadd_pd(u_squared, series_b, _mm512_set1_pd(256.0));
243
+ series_b = _mm512_mul_pd(_mm512_div_pd(u_squared, _mm512_set1_pd(1024.0)), series_b);
244
+
245
+ // Δσ = B × sin(σ) × (cos(2σₘ) +
246
+ // B/4 × (cos(σ) × (-1 + 2 × cos²(2σₘ)) - B/6 × cos(2σₘ) × (-3 + 4 × sin²(σ)) × (-3 + 4 × cos²(2σₘ))))
247
+ __m512d cos_2sm_sq = _mm512_mul_pd(cos_double_angular_midpoint, cos_double_angular_midpoint);
248
+ __m512d sin_sq = _mm512_mul_pd(sin_angular_distance, sin_angular_distance);
249
+ __m512d term1 = _mm512_fmadd_pd(two, cos_2sm_sq, _mm512_set1_pd(-1.0));
250
+ term1 = _mm512_mul_pd(cos_angular_distance, term1);
251
+ __m512d term2 = _mm512_fmadd_pd(four, sin_sq, _mm512_set1_pd(-3.0));
252
+ __m512d term3 = _mm512_fmadd_pd(four, cos_2sm_sq, _mm512_set1_pd(-3.0));
253
+ term2 = _mm512_mul_pd(_mm512_mul_pd(_mm512_div_pd(series_b, six), cos_double_angular_midpoint),
254
+ _mm512_mul_pd(term2, term3));
255
+ __m512d delta_sigma = _mm512_mul_pd(
256
+ series_b, _mm512_mul_pd(sin_angular_distance, _mm512_add_pd(cos_double_angular_midpoint,
257
+ _mm512_mul_pd(_mm512_div_pd(series_b, four),
258
+ _mm512_sub_pd(term1, term2)))));
259
+
260
+ // s = b * A * (σ - Δσ)
261
+ __m512d distances = _mm512_mul_pd(_mm512_mul_pd(polar_radius, series_a),
262
+ _mm512_sub_pd(angular_distance, delta_sigma));
263
+
264
+ // Set coincident points to zero
265
+ distances = _mm512_mask_blend_pd(coincident_mask, distances, _mm512_setzero_pd());
266
+
267
+ return distances;
268
+ }
269
+
270
+ NK_PUBLIC void nk_vincenty_f64_skylake( //
271
+ nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
272
+ nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
273
+ nk_size_t n, nk_f64_t *results) {
274
+
275
+ while (n >= 8) {
276
+ __m512d first_latitudes = _mm512_loadu_pd(a_lats);
277
+ __m512d first_longitudes = _mm512_loadu_pd(a_lons);
278
+ __m512d second_latitudes = _mm512_loadu_pd(b_lats);
279
+ __m512d second_longitudes = _mm512_loadu_pd(b_lons);
280
+
281
+ __m512d distances = nk_vincenty_f64x8_skylake_(first_latitudes, first_longitudes, second_latitudes,
282
+ second_longitudes);
283
+ _mm512_storeu_pd(results, distances);
284
+
285
+ a_lats += 8, a_lons += 8, b_lats += 8, b_lons += 8, results += 8, n -= 8;
286
+ }
287
+
288
+ // Handle remaining elements with masked operations
289
+ if (n > 0) {
290
+ __mmask8 mask = (__mmask8)_bzhi_u32(0xFF, n);
291
+ __m512d first_latitudes = _mm512_maskz_loadu_pd(mask, a_lats);
292
+ __m512d first_longitudes = _mm512_maskz_loadu_pd(mask, a_lons);
293
+ __m512d second_latitudes = _mm512_maskz_loadu_pd(mask, b_lats);
294
+ __m512d second_longitudes = _mm512_maskz_loadu_pd(mask, b_lons);
295
+
296
+ __m512d distances = nk_vincenty_f64x8_skylake_(first_latitudes, first_longitudes, second_latitudes,
297
+ second_longitudes);
298
+ _mm512_mask_storeu_pd(results, mask, distances);
299
+ }
300
+ }
301
+
302
+ NK_INTERNAL __m512 nk_haversine_f32x16_skylake_( //
303
+ __m512 first_latitudes, __m512 first_longitudes, //
304
+ __m512 second_latitudes, __m512 second_longitudes) {
305
+
306
+ __m512 const earth_radius = _mm512_set1_ps((float)NK_EARTH_MEDIATORIAL_RADIUS);
307
+ __m512 const half = _mm512_set1_ps(0.5f);
308
+ __m512 const one = _mm512_set1_ps(1.0f);
309
+ __m512 const two = _mm512_set1_ps(2.0f);
310
+
311
+ __m512 latitude_delta = _mm512_sub_ps(second_latitudes, first_latitudes);
312
+ __m512 longitude_delta = _mm512_sub_ps(second_longitudes, first_longitudes);
313
+
314
+ // Haversine terms: sin²(Δ/2)
315
+ __m512 latitude_delta_half = _mm512_mul_ps(latitude_delta, half);
316
+ __m512 longitude_delta_half = _mm512_mul_ps(longitude_delta, half);
317
+ __m512 sin_latitude_delta_half = nk_sin_f32x16_skylake_(latitude_delta_half);
318
+ __m512 sin_longitude_delta_half = nk_sin_f32x16_skylake_(longitude_delta_half);
319
+ __m512 sin_squared_latitude_delta_half = _mm512_mul_ps(sin_latitude_delta_half, sin_latitude_delta_half);
320
+ __m512 sin_squared_longitude_delta_half = _mm512_mul_ps(sin_longitude_delta_half, sin_longitude_delta_half);
321
+
322
+ // Latitude cosine product
323
+ __m512 cos_first_latitude = nk_cos_f32x16_skylake_(first_latitudes);
324
+ __m512 cos_second_latitude = nk_cos_f32x16_skylake_(second_latitudes);
325
+ __m512 cos_latitude_product = _mm512_mul_ps(cos_first_latitude, cos_second_latitude);
326
+
327
+ // a = sin²(Δlat/2) + cos(lat1) × cos(lat2) × sin²(Δlon/2)
328
+ __m512 haversine_term = _mm512_add_ps(sin_squared_latitude_delta_half,
329
+ _mm512_mul_ps(cos_latitude_product, sin_squared_longitude_delta_half));
330
+
331
+ // Clamp to [0, 1] to avoid NaN from sqrt of negative numbers (due to floating point errors)
332
+ __m512 zero = _mm512_setzero_ps();
333
+ haversine_term = _mm512_max_ps(zero, _mm512_min_ps(one, haversine_term));
334
+
335
+ // Central angle: c = 2 × atan2(√a, √(1-a))
336
+ __m512 sqrt_haversine = _mm512_sqrt_ps(haversine_term);
337
+ __m512 sqrt_complement = _mm512_sqrt_ps(_mm512_sub_ps(one, haversine_term));
338
+ __m512 central_angle = _mm512_mul_ps(two, nk_atan2_f32x16_skylake_(sqrt_haversine, sqrt_complement));
339
+
340
+ return _mm512_mul_ps(earth_radius, central_angle);
341
+ }
342
+
343
+ NK_PUBLIC void nk_haversine_f32_skylake( //
344
+ nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
345
+ nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
346
+ nk_size_t n, nk_f32_t *results) {
347
+
348
+ while (n >= 16) {
349
+ __m512 first_latitudes = _mm512_loadu_ps(a_lats);
350
+ __m512 first_longitudes = _mm512_loadu_ps(a_lons);
351
+ __m512 second_latitudes = _mm512_loadu_ps(b_lats);
352
+ __m512 second_longitudes = _mm512_loadu_ps(b_lons);
353
+
354
+ __m512 distances = nk_haversine_f32x16_skylake_(first_latitudes, first_longitudes, second_latitudes,
355
+ second_longitudes);
356
+ _mm512_storeu_ps(results, distances);
357
+
358
+ a_lats += 16, a_lons += 16, b_lats += 16, b_lons += 16, results += 16, n -= 16;
359
+ }
360
+
361
+ // Handle remaining elements with masked operations
362
+ if (n > 0) {
363
+ __mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, n);
364
+ __m512 first_latitudes = _mm512_maskz_loadu_ps(mask, a_lats);
365
+ __m512 first_longitudes = _mm512_maskz_loadu_ps(mask, a_lons);
366
+ __m512 second_latitudes = _mm512_maskz_loadu_ps(mask, b_lats);
367
+ __m512 second_longitudes = _mm512_maskz_loadu_ps(mask, b_lons);
368
+
369
+ __m512 distances = nk_haversine_f32x16_skylake_(first_latitudes, first_longitudes, second_latitudes,
370
+ second_longitudes);
371
+ _mm512_mask_storeu_ps(results, mask, distances);
372
+ }
373
+ }
374
+
375
+ /**
376
+ * @brief AVX-512 helper for Vincenty's geodesic distance on 16 f32 point pairs.
377
+ * @note This is a true SIMD implementation using masked convergence tracking.
378
+ */
379
+ NK_INTERNAL __m512 nk_vincenty_f32x16_skylake_( //
380
+ __m512 first_latitudes, __m512 first_longitudes, //
381
+ __m512 second_latitudes, __m512 second_longitudes) {
382
+
383
+ __m512 const equatorial_radius = _mm512_set1_ps((float)NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
384
+ __m512 const polar_radius = _mm512_set1_ps((float)NK_EARTH_ELLIPSOID_POLAR_RADIUS);
385
+ __m512 const flattening = _mm512_set1_ps(1.0f / (float)NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
386
+ __m512 const convergence_threshold = _mm512_set1_ps(NK_VINCENTY_CONVERGENCE_THRESHOLD_F32);
387
+ __m512 const one = _mm512_set1_ps(1.0f);
388
+ __m512 const two = _mm512_set1_ps(2.0f);
389
+ __m512 const three = _mm512_set1_ps(3.0f);
390
+ __m512 const four = _mm512_set1_ps(4.0f);
391
+ __m512 const six = _mm512_set1_ps(6.0f);
392
+ __m512 const sixteen = _mm512_set1_ps(16.0f);
393
+
394
+ // Longitude difference
395
+ __m512 longitude_difference = _mm512_sub_ps(second_longitudes, first_longitudes);
396
+
397
+ // Reduced latitudes: tan(U) = (1-f) * tan(lat)
398
+ __m512 one_minus_f = _mm512_sub_ps(one, flattening);
399
+ __m512 tan_first = _mm512_div_ps(nk_sin_f32x16_skylake_(first_latitudes), nk_cos_f32x16_skylake_(first_latitudes));
400
+ __m512 tan_second = _mm512_div_ps(nk_sin_f32x16_skylake_(second_latitudes),
401
+ nk_cos_f32x16_skylake_(second_latitudes));
402
+ __m512 tan_reduced_first = _mm512_mul_ps(one_minus_f, tan_first);
403
+ __m512 tan_reduced_second = _mm512_mul_ps(one_minus_f, tan_second);
404
+
405
+ // cos(U) = 1/√(1 + tan²(U)), sin(U) = tan(U) × cos(U)
406
+ __m512 cos_reduced_first = _mm512_div_ps(
407
+ one, _mm512_sqrt_ps(_mm512_fmadd_ps(tan_reduced_first, tan_reduced_first, one)));
408
+ __m512 sin_reduced_first = _mm512_mul_ps(tan_reduced_first, cos_reduced_first);
409
+ __m512 cos_reduced_second = _mm512_div_ps(
410
+ one, _mm512_sqrt_ps(_mm512_fmadd_ps(tan_reduced_second, tan_reduced_second, one)));
411
+ __m512 sin_reduced_second = _mm512_mul_ps(tan_reduced_second, cos_reduced_second);
412
+
413
+ // Initialize lambda and tracking variables
414
+ __m512 lambda = longitude_difference;
415
+ __m512 sin_angular_distance, cos_angular_distance, angular_distance;
416
+ __m512 sin_azimuth, cos_squared_azimuth, cos_double_angular_midpoint;
417
+
418
+ // Track convergence and coincident points
419
+ __mmask16 converged_mask = 0;
420
+ __mmask16 coincident_mask = 0;
421
+
422
+ for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS && converged_mask != 0xFFFF; ++iteration) {
423
+ __m512 sin_lambda = nk_sin_f32x16_skylake_(lambda);
424
+ __m512 cos_lambda = nk_cos_f32x16_skylake_(lambda);
425
+
426
+ // sin²(angular_distance) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
427
+ __m512 cross_term = _mm512_mul_ps(cos_reduced_second, sin_lambda);
428
+ __m512 mixed_term = _mm512_sub_ps(
429
+ _mm512_mul_ps(cos_reduced_first, sin_reduced_second),
430
+ _mm512_mul_ps(_mm512_mul_ps(sin_reduced_first, cos_reduced_second), cos_lambda));
431
+ __m512 sin_angular_dist_sq = _mm512_fmadd_ps(cross_term, cross_term, _mm512_mul_ps(mixed_term, mixed_term));
432
+ sin_angular_distance = _mm512_sqrt_ps(sin_angular_dist_sq);
433
+
434
+ // Check for coincident points (sin_angular_distance ≈ 0)
435
+ coincident_mask = _mm512_cmp_ps_mask(sin_angular_distance, _mm512_set1_ps(1e-7f), _CMP_LT_OS);
436
+
437
+ // cos(angular_distance) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
438
+ cos_angular_distance = _mm512_fmadd_ps(_mm512_mul_ps(cos_reduced_first, cos_reduced_second), cos_lambda,
439
+ _mm512_mul_ps(sin_reduced_first, sin_reduced_second));
440
+
441
+ // angular_distance = atan2(sin, cos)
442
+ angular_distance = nk_atan2_f32x16_skylake_(sin_angular_distance, cos_angular_distance);
443
+
444
+ // sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance)
445
+ // Use masked divide: zero result for coincident lanes, avoids division by zero
446
+ sin_azimuth = _mm512_maskz_div_ps(
447
+ _knot_mask16(coincident_mask),
448
+ _mm512_mul_ps(_mm512_mul_ps(cos_reduced_first, cos_reduced_second), sin_lambda), sin_angular_distance);
449
+ cos_squared_azimuth = _mm512_sub_ps(one, _mm512_mul_ps(sin_azimuth, sin_azimuth));
450
+
451
+ // Handle equatorial case: cos²α = 0
452
+ __mmask16 equatorial_mask = _mm512_cmp_ps_mask(cos_squared_azimuth, _mm512_set1_ps(1e-7f), _CMP_LT_OS);
453
+
454
+ // cos(2σₘ) = cos(σ) - 2 × sin(U₁) × sin(U₂) / cos²(α)
455
+ // Use masked divide: for equatorial lanes, quotient = cos_angular_distance (passthrough),
456
+ // so subtraction yields zero. Avoids division by zero.
457
+ __m512 sin_product = _mm512_mul_ps(sin_reduced_first, sin_reduced_second);
458
+ __m512 quotient = _mm512_mask_div_ps(cos_angular_distance, _knot_mask16(equatorial_mask),
459
+ _mm512_mul_ps(two, sin_product), cos_squared_azimuth);
460
+ cos_double_angular_midpoint = _mm512_sub_ps(cos_angular_distance, quotient);
461
+
462
+ // C = f/16 * cos²α * (4 + f*(4 - 3*cos²α))
463
+ __m512 correction_factor = _mm512_mul_ps(
464
+ _mm512_div_ps(flattening, sixteen),
465
+ _mm512_mul_ps(cos_squared_azimuth,
466
+ _mm512_fmadd_ps(flattening, _mm512_fnmadd_ps(three, cos_squared_azimuth, four), four)));
467
+
468
+ // λ' = L + (1-C) × f × sin(α) × (σ + C × sin(σ) × (cos(2σₘ) + C × cos(σ) × (-1 + 2 × cos²(2σₘ))))
469
+ __m512 cos_2sm_sq = _mm512_mul_ps(cos_double_angular_midpoint, cos_double_angular_midpoint);
470
+ // innermost = -1 + 2 × cos²(2σₘ)
471
+ __m512 innermost = _mm512_fmadd_ps(two, cos_2sm_sq, _mm512_set1_ps(-1.0f));
472
+ // middle = cos(2σₘ) + C × cos(σ) × innermost
473
+ __m512 middle = _mm512_fmadd_ps(_mm512_mul_ps(correction_factor, cos_angular_distance), innermost,
474
+ cos_double_angular_midpoint);
475
+ // inner = C × sin(σ) × middle
476
+ __m512 inner = _mm512_mul_ps(_mm512_mul_ps(correction_factor, sin_angular_distance), middle);
477
+
478
+ // λ' = L + (1-C) * f * sin_α * (σ + inner)
479
+ __m512 lambda_new = _mm512_fmadd_ps(
480
+ _mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(one, correction_factor), flattening), sin_azimuth),
481
+ _mm512_add_ps(angular_distance, inner), longitude_difference);
482
+
483
+ // Check convergence: |λ - λ'| < threshold
484
+ __m512 lambda_diff = _mm512_abs_ps(_mm512_sub_ps(lambda_new, lambda));
485
+ converged_mask = _mm512_cmp_ps_mask(lambda_diff, convergence_threshold, _CMP_LT_OS);
486
+
487
+ lambda = lambda_new;
488
+ }
489
+
490
+ // Final distance calculation
491
+ // u² = cos²α * (a² - b²) / b²
492
+ __m512 a_sq = _mm512_mul_ps(equatorial_radius, equatorial_radius);
493
+ __m512 b_sq = _mm512_mul_ps(polar_radius, polar_radius);
494
+ __m512 u_squared = _mm512_div_ps(_mm512_mul_ps(cos_squared_azimuth, _mm512_sub_ps(a_sq, b_sq)), b_sq);
495
+
496
+ // A = 1 + u²/16384 * (4096 + u²*(-768 + u²*(320 - 175*u²)))
497
+ __m512 series_a = _mm512_fmadd_ps(u_squared, _mm512_set1_ps(-175.0f), _mm512_set1_ps(320.0f));
498
+ series_a = _mm512_fmadd_ps(u_squared, series_a, _mm512_set1_ps(-768.0f));
499
+ series_a = _mm512_fmadd_ps(u_squared, series_a, _mm512_set1_ps(4096.0f));
500
+ series_a = _mm512_fmadd_ps(_mm512_div_ps(u_squared, _mm512_set1_ps(16384.0f)), series_a, one);
501
+
502
+ // B = u²/1024 * (256 + u²*(-128 + u²*(74 - 47*u²)))
503
+ __m512 series_b = _mm512_fmadd_ps(u_squared, _mm512_set1_ps(-47.0f), _mm512_set1_ps(74.0f));
504
+ series_b = _mm512_fmadd_ps(u_squared, series_b, _mm512_set1_ps(-128.0f));
505
+ series_b = _mm512_fmadd_ps(u_squared, series_b, _mm512_set1_ps(256.0f));
506
+ series_b = _mm512_mul_ps(_mm512_div_ps(u_squared, _mm512_set1_ps(1024.0f)), series_b);
507
+
508
+ // Δσ = B × sin(σ) × (cos(2σₘ) +
509
+ // B/4 × (cos(σ) × (-1 + 2 × cos²(2σₘ)) - B/6 × cos(2σₘ) × (-3 + 4 × sin²(σ)) × (-3 + 4 × cos²(2σₘ))))
510
+ __m512 cos_2sm_sq = _mm512_mul_ps(cos_double_angular_midpoint, cos_double_angular_midpoint);
511
+ __m512 sin_sq = _mm512_mul_ps(sin_angular_distance, sin_angular_distance);
512
+ __m512 term1 = _mm512_fmadd_ps(two, cos_2sm_sq, _mm512_set1_ps(-1.0f));
513
+ term1 = _mm512_mul_ps(cos_angular_distance, term1);
514
+ __m512 term2 = _mm512_fmadd_ps(four, sin_sq, _mm512_set1_ps(-3.0f));
515
+ __m512 term3 = _mm512_fmadd_ps(four, cos_2sm_sq, _mm512_set1_ps(-3.0f));
516
+ term2 = _mm512_mul_ps(_mm512_mul_ps(_mm512_div_ps(series_b, six), cos_double_angular_midpoint),
517
+ _mm512_mul_ps(term2, term3));
518
+ __m512 delta_sigma = _mm512_mul_ps(
519
+ series_b, _mm512_mul_ps(sin_angular_distance, _mm512_add_ps(cos_double_angular_midpoint,
520
+ _mm512_mul_ps(_mm512_div_ps(series_b, four),
521
+ _mm512_sub_ps(term1, term2)))));
522
+
523
+ // s = b * A * (σ - Δσ)
524
+ __m512 distances = _mm512_mul_ps(_mm512_mul_ps(polar_radius, series_a),
525
+ _mm512_sub_ps(angular_distance, delta_sigma));
526
+
527
+ // Set coincident points to zero
528
+ distances = _mm512_mask_blend_ps(coincident_mask, distances, _mm512_setzero_ps());
529
+
530
+ return distances;
531
+ }
532
+
533
+ NK_PUBLIC void nk_vincenty_f32_skylake( //
534
+ nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
535
+ nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
536
+ nk_size_t n, nk_f32_t *results) {
537
+
538
+ while (n >= 16) {
539
+ __m512 first_latitudes = _mm512_loadu_ps(a_lats);
540
+ __m512 first_longitudes = _mm512_loadu_ps(a_lons);
541
+ __m512 second_latitudes = _mm512_loadu_ps(b_lats);
542
+ __m512 second_longitudes = _mm512_loadu_ps(b_lons);
543
+
544
+ __m512 distances = nk_vincenty_f32x16_skylake_(first_latitudes, first_longitudes, second_latitudes,
545
+ second_longitudes);
546
+ _mm512_storeu_ps(results, distances);
547
+
548
+ a_lats += 16, a_lons += 16, b_lats += 16, b_lons += 16, results += 16, n -= 16;
549
+ }
550
+
551
+ // Handle remaining elements with masked operations
552
+ if (n > 0) {
553
+ __mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, n);
554
+ __m512 first_latitudes = _mm512_maskz_loadu_ps(mask, a_lats);
555
+ __m512 first_longitudes = _mm512_maskz_loadu_ps(mask, a_lons);
556
+ __m512 second_latitudes = _mm512_maskz_loadu_ps(mask, b_lats);
557
+ __m512 second_longitudes = _mm512_maskz_loadu_ps(mask, b_lons);
558
+
559
+ __m512 distances = nk_vincenty_f32x16_skylake_(first_latitudes, first_longitudes, second_latitudes,
560
+ second_longitudes);
561
+ _mm512_mask_storeu_ps(results, mask, distances);
562
+ }
563
+ }
564
+
565
+ #if defined(__clang__)
566
+ #pragma clang attribute pop
567
+ #elif defined(__GNUC__)
568
+ #pragma GCC pop_options
569
+ #endif
570
+
571
+ #if defined(__cplusplus)
572
+ } // extern "C"
573
+ #endif
574
+
575
+ #endif // NK_TARGET_SKYLAKE
576
+ #endif // NK_TARGET_X86_
577
+ #endif // NK_GEOSPATIAL_SKYLAKE_H