numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,593 @@
1
+ /**
2
+ * @brief SIMD-accelerated Geospatial Distances for Haswell.
3
+ * @file include/numkong/geospatial/haswell.h
4
+ * @author Ash Vardanian
5
+ * @date February 6, 2026
6
+ *
7
+ * @sa include/numkong/geospatial.h
8
+ *
9
+ * @section geospatial_haswell_instructions Key AVX2 Geospatial Instructions
10
+ *
11
+ * Intrinsic Instruction Ice Genoa
12
+ * _mm256_sqrt_ps VSQRTPS (YMM, YMM) 12c @ p0 15c @ p01
13
+ * _mm256_sqrt_pd VSQRTPD (YMM, YMM) 13c @ p0 21c @ p01
14
+ * _mm256_div_ps VDIVPS (YMM, YMM, YMM) 11c @ p0 11c @ p01
15
+ * _mm256_div_pd VDIVPD (YMM, YMM, YMM) 13c @ p0 13c @ p01
16
+ * _mm256_fmadd_ps VFMADD231PS (YMM, YMM, YMM) 4c @ p01 4c @ p01
17
+ * _mm256_fmadd_pd VFMADD231PD (YMM, YMM, YMM) 4c @ p01 4c @ p01
18
+ */
19
+ #ifndef NK_GEOSPATIAL_HASWELL_H
20
+ #define NK_GEOSPATIAL_HASWELL_H
21
+
22
+ #if NK_TARGET_X86_
23
+ #if NK_TARGET_HASWELL
24
+
25
+ #include "numkong/types.h"
26
+ #include "numkong/trigonometry/haswell.h" // `nk_sin_f64x4_haswell_`, `nk_cos_f64x4_haswell_`, `nk_atan2_f64x4_haswell_`, etc.
27
+
28
+ #if defined(__cplusplus)
29
+ extern "C" {
30
+ #endif
31
+
32
+ #if defined(__clang__)
33
+ #pragma clang attribute push(__attribute__((target("avx2,f16c,fma,bmi,bmi2"))), apply_to = function)
34
+ #elif defined(__GNUC__)
35
+ #pragma GCC push_options
36
+ #pragma GCC target("avx2", "f16c", "fma", "bmi", "bmi2")
37
+ #endif
38
+
39
+ /* Haswell AVX2 implementations using 4-wide f64 and 8-wide f32 SIMD.
40
+ * These require the AVX2 trigonometric kernels from numkong/trigonometry/haswell.h.
41
+ */
42
+
43
+ NK_INTERNAL __m256d nk_haversine_f64x4_haswell_( //
44
+ __m256d first_latitudes, __m256d first_longitudes, //
45
+ __m256d second_latitudes, __m256d second_longitudes) {
46
+
47
+ __m256d const earth_radius = _mm256_set1_pd(NK_EARTH_MEDIATORIAL_RADIUS);
48
+ __m256d const half = _mm256_set1_pd(0.5);
49
+ __m256d const one = _mm256_set1_pd(1.0);
50
+ __m256d const two = _mm256_set1_pd(2.0);
51
+
52
+ __m256d latitude_delta = _mm256_sub_pd(second_latitudes, first_latitudes);
53
+ __m256d longitude_delta = _mm256_sub_pd(second_longitudes, first_longitudes);
54
+
55
+ // Haversine terms: sin²(Δ/2)
56
+ __m256d latitude_delta_half = _mm256_mul_pd(latitude_delta, half);
57
+ __m256d longitude_delta_half = _mm256_mul_pd(longitude_delta, half);
58
+ __m256d sin_latitude_delta_half = nk_sin_f64x4_haswell_(latitude_delta_half);
59
+ __m256d sin_longitude_delta_half = nk_sin_f64x4_haswell_(longitude_delta_half);
60
+ __m256d sin_squared_latitude_delta_half = _mm256_mul_pd(sin_latitude_delta_half, sin_latitude_delta_half);
61
+ __m256d sin_squared_longitude_delta_half = _mm256_mul_pd(sin_longitude_delta_half, sin_longitude_delta_half);
62
+
63
+ // Latitude cosine product
64
+ __m256d cos_first_latitude = nk_cos_f64x4_haswell_(first_latitudes);
65
+ __m256d cos_second_latitude = nk_cos_f64x4_haswell_(second_latitudes);
66
+ __m256d cos_latitude_product = _mm256_mul_pd(cos_first_latitude, cos_second_latitude);
67
+
68
+ // a = sin²(Δlat/2) + cos(lat1) × cos(lat2) × sin²(Δlon/2)
69
+ __m256d haversine_term = _mm256_add_pd(sin_squared_latitude_delta_half,
70
+ _mm256_mul_pd(cos_latitude_product, sin_squared_longitude_delta_half));
71
+ // Clamp haversine_term to [0, 1] to prevent NaN from sqrt of negative values
72
+ __m256d zero = _mm256_setzero_pd();
73
+ haversine_term = _mm256_max_pd(zero, _mm256_min_pd(one, haversine_term));
74
+
75
+ // Central angle: c = 2 × atan2(√a, √(1-a))
76
+ __m256d sqrt_haversine = _mm256_sqrt_pd(haversine_term);
77
+ __m256d sqrt_complement = _mm256_sqrt_pd(_mm256_sub_pd(one, haversine_term));
78
+ __m256d central_angle = _mm256_mul_pd(two, nk_atan2_f64x4_haswell_(sqrt_haversine, sqrt_complement));
79
+
80
+ return _mm256_mul_pd(earth_radius, central_angle);
81
+ }
82
+
83
+ NK_PUBLIC void nk_haversine_f64_haswell( //
84
+ nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
85
+ nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
86
+ nk_size_t n, nk_f64_t *results) {
87
+
88
+ while (n >= 4) {
89
+ __m256d first_latitudes = _mm256_loadu_pd(a_lats);
90
+ __m256d first_longitudes = _mm256_loadu_pd(a_lons);
91
+ __m256d second_latitudes = _mm256_loadu_pd(b_lats);
92
+ __m256d second_longitudes = _mm256_loadu_pd(b_lons);
93
+
94
+ __m256d distances = nk_haversine_f64x4_haswell_(first_latitudes, first_longitudes, second_latitudes,
95
+ second_longitudes);
96
+ _mm256_storeu_pd(results, distances);
97
+
98
+ a_lats += 4, a_lons += 4, b_lats += 4, b_lons += 4, results += 4, n -= 4;
99
+ }
100
+
101
+ // Handle remaining elements with partial loads (n can be 1-3 here)
102
+ if (n > 0) {
103
+ nk_b256_vec_t a_lat_vec, a_lon_vec, b_lat_vec, b_lon_vec, result_vec;
104
+ nk_partial_load_b64x4_haswell_(a_lats, &a_lat_vec, n);
105
+ nk_partial_load_b64x4_haswell_(a_lons, &a_lon_vec, n);
106
+ nk_partial_load_b64x4_haswell_(b_lats, &b_lat_vec, n);
107
+ nk_partial_load_b64x4_haswell_(b_lons, &b_lon_vec, n);
108
+ __m256d distances = nk_haversine_f64x4_haswell_(a_lat_vec.ymm_pd, a_lon_vec.ymm_pd, b_lat_vec.ymm_pd,
109
+ b_lon_vec.ymm_pd);
110
+ result_vec.ymm_pd = distances;
111
+ nk_partial_store_b64x4_haswell_(&result_vec, results, n);
112
+ }
113
+ }
114
+
115
+ NK_INTERNAL __m256 nk_haversine_f32x8_haswell_( //
116
+ __m256 first_latitudes, __m256 first_longitudes, //
117
+ __m256 second_latitudes, __m256 second_longitudes) {
118
+
119
+ __m256 const earth_radius = _mm256_set1_ps((float)NK_EARTH_MEDIATORIAL_RADIUS);
120
+ __m256 const half = _mm256_set1_ps(0.5f);
121
+ __m256 const one = _mm256_set1_ps(1.0f);
122
+ __m256 const two = _mm256_set1_ps(2.0f);
123
+
124
+ __m256 latitude_delta = _mm256_sub_ps(second_latitudes, first_latitudes);
125
+ __m256 longitude_delta = _mm256_sub_ps(second_longitudes, first_longitudes);
126
+
127
+ // Haversine terms: sin²(Δ/2)
128
+ __m256 latitude_delta_half = _mm256_mul_ps(latitude_delta, half);
129
+ __m256 longitude_delta_half = _mm256_mul_ps(longitude_delta, half);
130
+ __m256 sin_latitude_delta_half = nk_sin_f32x8_haswell_(latitude_delta_half);
131
+ __m256 sin_longitude_delta_half = nk_sin_f32x8_haswell_(longitude_delta_half);
132
+ __m256 sin_squared_latitude_delta_half = _mm256_mul_ps(sin_latitude_delta_half, sin_latitude_delta_half);
133
+ __m256 sin_squared_longitude_delta_half = _mm256_mul_ps(sin_longitude_delta_half, sin_longitude_delta_half);
134
+
135
+ // Latitude cosine product
136
+ __m256 cos_first_latitude = nk_cos_f32x8_haswell_(first_latitudes);
137
+ __m256 cos_second_latitude = nk_cos_f32x8_haswell_(second_latitudes);
138
+ __m256 cos_latitude_product = _mm256_mul_ps(cos_first_latitude, cos_second_latitude);
139
+
140
+ // a = sin²(Δlat/2) + cos(lat1) × cos(lat2) × sin²(Δlon/2)
141
+ __m256 haversine_term = _mm256_add_ps(sin_squared_latitude_delta_half,
142
+ _mm256_mul_ps(cos_latitude_product, sin_squared_longitude_delta_half));
143
+
144
+ // Clamp to [0, 1] to avoid NaN from sqrt of negative numbers (due to floating point errors)
145
+ __m256 zero = _mm256_setzero_ps();
146
+ haversine_term = _mm256_max_ps(zero, _mm256_min_ps(one, haversine_term));
147
+
148
+ // Central angle: c = 2 × atan2(√a, √(1-a))
149
+ __m256 sqrt_haversine = _mm256_sqrt_ps(haversine_term);
150
+ __m256 sqrt_complement = _mm256_sqrt_ps(_mm256_sub_ps(one, haversine_term));
151
+ __m256 central_angle = _mm256_mul_ps(two, nk_atan2_f32x8_haswell_(sqrt_haversine, sqrt_complement));
152
+
153
+ return _mm256_mul_ps(earth_radius, central_angle);
154
+ }
155
+
156
+ NK_PUBLIC void nk_haversine_f32_haswell( //
157
+ nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
158
+ nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
159
+ nk_size_t n, nk_f32_t *results) {
160
+
161
+ while (n >= 8) {
162
+ __m256 first_latitudes = _mm256_loadu_ps(a_lats);
163
+ __m256 first_longitudes = _mm256_loadu_ps(a_lons);
164
+ __m256 second_latitudes = _mm256_loadu_ps(b_lats);
165
+ __m256 second_longitudes = _mm256_loadu_ps(b_lons);
166
+
167
+ __m256 distances = nk_haversine_f32x8_haswell_(first_latitudes, first_longitudes, second_latitudes,
168
+ second_longitudes);
169
+ _mm256_storeu_ps(results, distances);
170
+
171
+ a_lats += 8, a_lons += 8, b_lats += 8, b_lons += 8, results += 8, n -= 8;
172
+ }
173
+
174
+ // Handle remaining elements with partial loads (n can be 1-7 here)
175
+ if (n > 0) {
176
+ nk_b256_vec_t a_lat_vec, a_lon_vec, b_lat_vec, b_lon_vec, result_vec;
177
+ nk_partial_load_b32x8_serial_(a_lats, &a_lat_vec, n);
178
+ nk_partial_load_b32x8_serial_(a_lons, &a_lon_vec, n);
179
+ nk_partial_load_b32x8_serial_(b_lats, &b_lat_vec, n);
180
+ nk_partial_load_b32x8_serial_(b_lons, &b_lon_vec, n);
181
+ __m256 distances = nk_haversine_f32x8_haswell_(a_lat_vec.ymm_ps, a_lon_vec.ymm_ps, b_lat_vec.ymm_ps,
182
+ b_lon_vec.ymm_ps);
183
+ result_vec.ymm_ps = distances;
184
+ nk_partial_store_b32x8_serial_(&result_vec, results, n);
185
+ }
186
+ }
187
+
188
+ /**
189
+ * @brief AVX2 helper for Vincenty's geodesic distance on 4 f64 point pairs.
190
+ * @note This is a true SIMD implementation using masked convergence tracking via blending.
191
+ */
192
+ NK_INTERNAL __m256d nk_vincenty_f64x4_haswell_( //
193
+ __m256d first_latitudes, __m256d first_longitudes, //
194
+ __m256d second_latitudes, __m256d second_longitudes) {
195
+
196
+ __m256d const equatorial_radius = _mm256_set1_pd(NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
197
+ __m256d const polar_radius = _mm256_set1_pd(NK_EARTH_ELLIPSOID_POLAR_RADIUS);
198
+ __m256d const flattening = _mm256_set1_pd(1.0 / NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
199
+ __m256d const convergence_threshold = _mm256_set1_pd(NK_VINCENTY_CONVERGENCE_THRESHOLD_F64);
200
+ __m256d const one = _mm256_set1_pd(1.0);
201
+ __m256d const two = _mm256_set1_pd(2.0);
202
+ __m256d const three = _mm256_set1_pd(3.0);
203
+ __m256d const four = _mm256_set1_pd(4.0);
204
+ __m256d const six = _mm256_set1_pd(6.0);
205
+ __m256d const sixteen = _mm256_set1_pd(16.0);
206
+ __m256d const epsilon = _mm256_set1_pd(1e-15);
207
+
208
+ // Longitude difference
209
+ __m256d longitude_difference = _mm256_sub_pd(second_longitudes, first_longitudes);
210
+
211
+ // Reduced latitudes: tan(U) = (1-f) * tan(lat)
212
+ __m256d one_minus_f = _mm256_sub_pd(one, flattening);
213
+ __m256d tan_first = _mm256_div_pd(nk_sin_f64x4_haswell_(first_latitudes), nk_cos_f64x4_haswell_(first_latitudes));
214
+ __m256d tan_second = _mm256_div_pd(nk_sin_f64x4_haswell_(second_latitudes),
215
+ nk_cos_f64x4_haswell_(second_latitudes));
216
+ __m256d tan_reduced_first = _mm256_mul_pd(one_minus_f, tan_first);
217
+ __m256d tan_reduced_second = _mm256_mul_pd(one_minus_f, tan_second);
218
+
219
+ // cos(U) = 1/√(1 + tan²(U)), sin(U) = tan(U) × cos(U)
220
+ __m256d cos_reduced_first = _mm256_div_pd(
221
+ one, _mm256_sqrt_pd(_mm256_fmadd_pd(tan_reduced_first, tan_reduced_first, one)));
222
+ __m256d sin_reduced_first = _mm256_mul_pd(tan_reduced_first, cos_reduced_first);
223
+ __m256d cos_reduced_second = _mm256_div_pd(
224
+ one, _mm256_sqrt_pd(_mm256_fmadd_pd(tan_reduced_second, tan_reduced_second, one)));
225
+ __m256d sin_reduced_second = _mm256_mul_pd(tan_reduced_second, cos_reduced_second);
226
+
227
+ // Initialize lambda and tracking variables
228
+ __m256d lambda = longitude_difference;
229
+ __m256d sin_angular_distance, cos_angular_distance, angular_distance;
230
+ __m256d sin_azimuth, cos_squared_azimuth, cos_double_angular_midpoint;
231
+
232
+ // Track convergence and coincident points using masks
233
+ __m256d converged_mask = _mm256_setzero_pd();
234
+ __m256d coincident_mask = _mm256_setzero_pd();
235
+
236
+ for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS; ++iteration) {
237
+ // Check if all lanes converged
238
+ int converged_bits = _mm256_movemask_pd(converged_mask);
239
+ if (converged_bits == 0xF) break;
240
+
241
+ __m256d sin_lambda = nk_sin_f64x4_haswell_(lambda);
242
+ __m256d cos_lambda = nk_cos_f64x4_haswell_(lambda);
243
+
244
+ // sin²(angular_distance) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
245
+ __m256d cross_term = _mm256_mul_pd(cos_reduced_second, sin_lambda);
246
+ __m256d mixed_term = _mm256_sub_pd(
247
+ _mm256_mul_pd(cos_reduced_first, sin_reduced_second),
248
+ _mm256_mul_pd(_mm256_mul_pd(sin_reduced_first, cos_reduced_second), cos_lambda));
249
+ __m256d sin_angular_dist_sq = _mm256_fmadd_pd(cross_term, cross_term, _mm256_mul_pd(mixed_term, mixed_term));
250
+ sin_angular_distance = _mm256_sqrt_pd(sin_angular_dist_sq);
251
+
252
+ // Check for coincident points (sin_angular_distance ≈ 0)
253
+ coincident_mask = _mm256_cmp_pd(sin_angular_distance, epsilon, _CMP_LT_OS);
254
+
255
+ // cos(angular_distance) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
256
+ cos_angular_distance = _mm256_fmadd_pd(_mm256_mul_pd(cos_reduced_first, cos_reduced_second), cos_lambda,
257
+ _mm256_mul_pd(sin_reduced_first, sin_reduced_second));
258
+
259
+ // angular_distance = atan2(sin, cos)
260
+ angular_distance = nk_atan2_f64x4_haswell_(sin_angular_distance, cos_angular_distance);
261
+
262
+ // sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance)
263
+ // Avoid division by zero by using blending
264
+ __m256d safe_sin_angular = _mm256_blendv_pd(sin_angular_distance, one, coincident_mask);
265
+ sin_azimuth = _mm256_div_pd(_mm256_mul_pd(_mm256_mul_pd(cos_reduced_first, cos_reduced_second), sin_lambda),
266
+ safe_sin_angular);
267
+ cos_squared_azimuth = _mm256_sub_pd(one, _mm256_mul_pd(sin_azimuth, sin_azimuth));
268
+
269
+ // Handle equatorial case: cos²α ≈ 0
270
+ __m256d equatorial_mask = _mm256_cmp_pd(cos_squared_azimuth, epsilon, _CMP_LT_OS);
271
+ __m256d safe_cos_sq_azimuth = _mm256_blendv_pd(cos_squared_azimuth, one, equatorial_mask);
272
+
273
+ // cos(2σₘ) = cos(σ) - 2 × sin(U₁) × sin(U₂) / cos²(α)
274
+ __m256d sin_product = _mm256_mul_pd(sin_reduced_first, sin_reduced_second);
275
+ cos_double_angular_midpoint = _mm256_sub_pd(
276
+ cos_angular_distance, _mm256_div_pd(_mm256_mul_pd(two, sin_product), safe_cos_sq_azimuth));
277
+ cos_double_angular_midpoint = _mm256_blendv_pd(cos_double_angular_midpoint, _mm256_setzero_pd(),
278
+ equatorial_mask);
279
+
280
+ // C = f/16 * cos²α * (4 + f*(4 - 3*cos²α))
281
+ __m256d correction_factor = _mm256_mul_pd(
282
+ _mm256_div_pd(flattening, sixteen),
283
+ _mm256_mul_pd(cos_squared_azimuth,
284
+ _mm256_fmadd_pd(flattening, _mm256_fnmadd_pd(three, cos_squared_azimuth, four), four)));
285
+
286
+ // λ' = L + (1-C) × f × sin(α) × (σ + C × sin(σ) × (cos(2σₘ) + C × cos(σ) × (-1 + 2 × cos²(2σₘ))))
287
+ __m256d cos_2sm_sq = _mm256_mul_pd(cos_double_angular_midpoint, cos_double_angular_midpoint);
288
+ // innermost = -1 + 2 × cos²(2σₘ)
289
+ __m256d innermost = _mm256_fmadd_pd(two, cos_2sm_sq, _mm256_set1_pd(-1.0));
290
+ // middle = cos(2σₘ) + C × cos(σ) × innermost
291
+ __m256d middle = _mm256_fmadd_pd(_mm256_mul_pd(correction_factor, cos_angular_distance), innermost,
292
+ cos_double_angular_midpoint);
293
+ // inner = C × sin(σ) × middle
294
+ __m256d inner = _mm256_mul_pd(_mm256_mul_pd(correction_factor, sin_angular_distance), middle);
295
+
296
+ // λ' = L + (1-C) * f * sin_α * (σ + inner)
297
+ __m256d lambda_new = _mm256_fmadd_pd(
298
+ _mm256_mul_pd(_mm256_mul_pd(_mm256_sub_pd(one, correction_factor), flattening), sin_azimuth),
299
+ _mm256_add_pd(angular_distance, inner), longitude_difference);
300
+
301
+ // Check convergence: |λ - λ'| < threshold
302
+ __m256d lambda_diff_abs = _mm256_andnot_pd(_mm256_set1_pd(-0.0), _mm256_sub_pd(lambda_new, lambda));
303
+ __m256d newly_converged = _mm256_cmp_pd(lambda_diff_abs, convergence_threshold, _CMP_LT_OS);
304
+ converged_mask = _mm256_or_pd(converged_mask, newly_converged);
305
+
306
+ // Only update lambda for non-converged lanes
307
+ lambda = _mm256_blendv_pd(lambda_new, lambda, converged_mask);
308
+ }
309
+
310
+ // Final distance calculation
311
+ // u² = cos²α * (a² - b²) / b²
312
+ __m256d a_sq = _mm256_mul_pd(equatorial_radius, equatorial_radius);
313
+ __m256d b_sq = _mm256_mul_pd(polar_radius, polar_radius);
314
+ __m256d u_squared = _mm256_div_pd(_mm256_mul_pd(cos_squared_azimuth, _mm256_sub_pd(a_sq, b_sq)), b_sq);
315
+
316
+ // A = 1 + u²/16384 * (4096 + u²*(-768 + u²*(320 - 175*u²)))
317
+ __m256d series_a = _mm256_fmadd_pd(u_squared, _mm256_set1_pd(-175.0), _mm256_set1_pd(320.0));
318
+ series_a = _mm256_fmadd_pd(u_squared, series_a, _mm256_set1_pd(-768.0));
319
+ series_a = _mm256_fmadd_pd(u_squared, series_a, _mm256_set1_pd(4096.0));
320
+ series_a = _mm256_fmadd_pd(_mm256_div_pd(u_squared, _mm256_set1_pd(16384.0)), series_a, one);
321
+
322
+ // B = u²/1024 * (256 + u²*(-128 + u²*(74 - 47*u²)))
323
+ __m256d series_b = _mm256_fmadd_pd(u_squared, _mm256_set1_pd(-47.0), _mm256_set1_pd(74.0));
324
+ series_b = _mm256_fmadd_pd(u_squared, series_b, _mm256_set1_pd(-128.0));
325
+ series_b = _mm256_fmadd_pd(u_squared, series_b, _mm256_set1_pd(256.0));
326
+ series_b = _mm256_mul_pd(_mm256_div_pd(u_squared, _mm256_set1_pd(1024.0)), series_b);
327
+
328
+ // Δσ = B × sin(σ) × (cos(2σₘ) +
329
+ // B/4 × (cos(σ) × (-1 + 2 × cos²(2σₘ)) - B/6 × cos(2σₘ) × (-3 + 4 × sin²(σ)) × (-3 + 4 × cos²(2σₘ))))
330
+ __m256d cos_2sm_sq = _mm256_mul_pd(cos_double_angular_midpoint, cos_double_angular_midpoint);
331
+ __m256d sin_sq = _mm256_mul_pd(sin_angular_distance, sin_angular_distance);
332
+ __m256d term1 = _mm256_fmadd_pd(two, cos_2sm_sq, _mm256_set1_pd(-1.0));
333
+ term1 = _mm256_mul_pd(cos_angular_distance, term1);
334
+ __m256d term2 = _mm256_fmadd_pd(four, sin_sq, _mm256_set1_pd(-3.0));
335
+ __m256d term3 = _mm256_fmadd_pd(four, cos_2sm_sq, _mm256_set1_pd(-3.0));
336
+ term2 = _mm256_mul_pd(_mm256_mul_pd(_mm256_div_pd(series_b, six), cos_double_angular_midpoint),
337
+ _mm256_mul_pd(term2, term3));
338
+ __m256d delta_sigma = _mm256_mul_pd(
339
+ series_b, _mm256_mul_pd(sin_angular_distance, _mm256_add_pd(cos_double_angular_midpoint,
340
+ _mm256_mul_pd(_mm256_div_pd(series_b, four),
341
+ _mm256_sub_pd(term1, term2)))));
342
+
343
+ // s = b * A * (σ - Δσ)
344
+ __m256d distances = _mm256_mul_pd(_mm256_mul_pd(polar_radius, series_a),
345
+ _mm256_sub_pd(angular_distance, delta_sigma));
346
+
347
+ // Set coincident points to zero
348
+ distances = _mm256_blendv_pd(distances, _mm256_setzero_pd(), coincident_mask);
349
+
350
+ return distances;
351
+ }
352
+
353
+ NK_PUBLIC void nk_vincenty_f64_haswell( //
354
+ nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
355
+ nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
356
+ nk_size_t n, nk_f64_t *results) {
357
+
358
+ while (n >= 4) {
359
+ __m256d first_latitudes = _mm256_loadu_pd(a_lats);
360
+ __m256d first_longitudes = _mm256_loadu_pd(a_lons);
361
+ __m256d second_latitudes = _mm256_loadu_pd(b_lats);
362
+ __m256d second_longitudes = _mm256_loadu_pd(b_lons);
363
+
364
+ __m256d distances = nk_vincenty_f64x4_haswell_(first_latitudes, first_longitudes, second_latitudes,
365
+ second_longitudes);
366
+ _mm256_storeu_pd(results, distances);
367
+
368
+ a_lats += 4, a_lons += 4, b_lats += 4, b_lons += 4, results += 4, n -= 4;
369
+ }
370
+
371
+ // Handle remaining elements with partial loads (n can be 1-3 here)
372
+ if (n > 0) {
373
+ nk_b256_vec_t a_lat_vec, a_lon_vec, b_lat_vec, b_lon_vec, result_vec;
374
+ nk_partial_load_b64x4_haswell_(a_lats, &a_lat_vec, n);
375
+ nk_partial_load_b64x4_haswell_(a_lons, &a_lon_vec, n);
376
+ nk_partial_load_b64x4_haswell_(b_lats, &b_lat_vec, n);
377
+ nk_partial_load_b64x4_haswell_(b_lons, &b_lon_vec, n);
378
+ __m256d distances = nk_vincenty_f64x4_haswell_(a_lat_vec.ymm_pd, a_lon_vec.ymm_pd, b_lat_vec.ymm_pd,
379
+ b_lon_vec.ymm_pd);
380
+ result_vec.ymm_pd = distances;
381
+ nk_partial_store_b64x4_haswell_(&result_vec, results, n);
382
+ }
383
+ }
384
+
385
+ /**
386
+ * @brief AVX2 helper for Vincenty's geodesic distance on 8 f32 point pairs.
387
+ * @note This is a true SIMD implementation using masked convergence tracking via blending.
388
+ */
389
+ NK_INTERNAL __m256 nk_vincenty_f32x8_haswell_( //
390
+ __m256 first_latitudes, __m256 first_longitudes, //
391
+ __m256 second_latitudes, __m256 second_longitudes) {
392
+
393
+ __m256 const equatorial_radius = _mm256_set1_ps((float)NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
394
+ __m256 const polar_radius = _mm256_set1_ps((float)NK_EARTH_ELLIPSOID_POLAR_RADIUS);
395
+ __m256 const flattening = _mm256_set1_ps(1.0f / (float)NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
396
+ __m256 const convergence_threshold = _mm256_set1_ps(NK_VINCENTY_CONVERGENCE_THRESHOLD_F32);
397
+ __m256 const one = _mm256_set1_ps(1.0f);
398
+ __m256 const two = _mm256_set1_ps(2.0f);
399
+ __m256 const three = _mm256_set1_ps(3.0f);
400
+ __m256 const four = _mm256_set1_ps(4.0f);
401
+ __m256 const six = _mm256_set1_ps(6.0f);
402
+ __m256 const sixteen = _mm256_set1_ps(16.0f);
403
+ __m256 const epsilon = _mm256_set1_ps(1e-7f);
404
+
405
+ // Longitude difference
406
+ __m256 longitude_difference = _mm256_sub_ps(second_longitudes, first_longitudes);
407
+
408
+ // Reduced latitudes: tan(U) = (1-f) * tan(lat)
409
+ __m256 one_minus_f = _mm256_sub_ps(one, flattening);
410
+ __m256 tan_first = _mm256_div_ps(nk_sin_f32x8_haswell_(first_latitudes), nk_cos_f32x8_haswell_(first_latitudes));
411
+ __m256 tan_second = _mm256_div_ps(nk_sin_f32x8_haswell_(second_latitudes), nk_cos_f32x8_haswell_(second_latitudes));
412
+ __m256 tan_reduced_first = _mm256_mul_ps(one_minus_f, tan_first);
413
+ __m256 tan_reduced_second = _mm256_mul_ps(one_minus_f, tan_second);
414
+
415
+ // cos(U) = 1/√(1 + tan²(U)), sin(U) = tan(U) × cos(U)
416
+ __m256 cos_reduced_first = _mm256_div_ps(
417
+ one, _mm256_sqrt_ps(_mm256_fmadd_ps(tan_reduced_first, tan_reduced_first, one)));
418
+ __m256 sin_reduced_first = _mm256_mul_ps(tan_reduced_first, cos_reduced_first);
419
+ __m256 cos_reduced_second = _mm256_div_ps(
420
+ one, _mm256_sqrt_ps(_mm256_fmadd_ps(tan_reduced_second, tan_reduced_second, one)));
421
+ __m256 sin_reduced_second = _mm256_mul_ps(tan_reduced_second, cos_reduced_second);
422
+
423
+ // Initialize lambda and tracking variables
424
+ __m256 lambda = longitude_difference;
425
+ __m256 sin_angular_distance, cos_angular_distance, angular_distance;
426
+ __m256 sin_azimuth, cos_squared_azimuth, cos_double_angular_midpoint;
427
+
428
+ // Track convergence and coincident points using masks
429
+ __m256 converged_mask = _mm256_setzero_ps();
430
+ __m256 coincident_mask = _mm256_setzero_ps();
431
+
432
+ for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS; ++iteration) {
433
+ // Check if all lanes converged
434
+ int converged_bits = _mm256_movemask_ps(converged_mask);
435
+ if (converged_bits == 0xFF) break;
436
+
437
+ __m256 sin_lambda = nk_sin_f32x8_haswell_(lambda);
438
+ __m256 cos_lambda = nk_cos_f32x8_haswell_(lambda);
439
+
440
+ // sin²(angular_distance) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
441
+ __m256 cross_term = _mm256_mul_ps(cos_reduced_second, sin_lambda);
442
+ __m256 mixed_term = _mm256_sub_ps(
443
+ _mm256_mul_ps(cos_reduced_first, sin_reduced_second),
444
+ _mm256_mul_ps(_mm256_mul_ps(sin_reduced_first, cos_reduced_second), cos_lambda));
445
+ __m256 sin_angular_dist_sq = _mm256_fmadd_ps(cross_term, cross_term, _mm256_mul_ps(mixed_term, mixed_term));
446
+ sin_angular_distance = _mm256_sqrt_ps(sin_angular_dist_sq);
447
+
448
+ // Check for coincident points (sin_angular_distance ≈ 0)
449
+ coincident_mask = _mm256_cmp_ps(sin_angular_distance, epsilon, _CMP_LT_OS);
450
+
451
+ // cos(angular_distance) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
452
+ cos_angular_distance = _mm256_fmadd_ps(_mm256_mul_ps(cos_reduced_first, cos_reduced_second), cos_lambda,
453
+ _mm256_mul_ps(sin_reduced_first, sin_reduced_second));
454
+
455
+ // angular_distance = atan2(sin, cos)
456
+ angular_distance = nk_atan2_f32x8_haswell_(sin_angular_distance, cos_angular_distance);
457
+
458
+ // sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance)
459
+ // Avoid division by zero by using blending
460
+ __m256 safe_sin_angular = _mm256_blendv_ps(sin_angular_distance, one, coincident_mask);
461
+ sin_azimuth = _mm256_div_ps(_mm256_mul_ps(_mm256_mul_ps(cos_reduced_first, cos_reduced_second), sin_lambda),
462
+ safe_sin_angular);
463
+ cos_squared_azimuth = _mm256_sub_ps(one, _mm256_mul_ps(sin_azimuth, sin_azimuth));
464
+
465
+ // Handle equatorial case: cos²α ≈ 0
466
+ __m256 equatorial_mask = _mm256_cmp_ps(cos_squared_azimuth, epsilon, _CMP_LT_OS);
467
+ __m256 safe_cos_sq_azimuth = _mm256_blendv_ps(cos_squared_azimuth, one, equatorial_mask);
468
+
469
+ // cos(2σₘ) = cos(σ) - 2 × sin(U₁) × sin(U₂) / cos²(α)
470
+ __m256 sin_product = _mm256_mul_ps(sin_reduced_first, sin_reduced_second);
471
+ cos_double_angular_midpoint = _mm256_sub_ps(
472
+ cos_angular_distance, _mm256_div_ps(_mm256_mul_ps(two, sin_product), safe_cos_sq_azimuth));
473
+ cos_double_angular_midpoint = _mm256_blendv_ps(cos_double_angular_midpoint, _mm256_setzero_ps(),
474
+ equatorial_mask);
475
+
476
+ // C = f/16 * cos²α * (4 + f*(4 - 3*cos²α))
477
+ __m256 correction_factor = _mm256_mul_ps(
478
+ _mm256_div_ps(flattening, sixteen),
479
+ _mm256_mul_ps(cos_squared_azimuth,
480
+ _mm256_fmadd_ps(flattening, _mm256_fnmadd_ps(three, cos_squared_azimuth, four), four)));
481
+
482
+ // λ' = L + (1-C) × f × sin(α) × (σ + C × sin(σ) × (cos(2σₘ) + C × cos(σ) × (-1 + 2 × cos²(2σₘ))))
483
+ __m256 cos_2sm_sq = _mm256_mul_ps(cos_double_angular_midpoint, cos_double_angular_midpoint);
484
+ // innermost = -1 + 2 × cos²(2σₘ)
485
+ __m256 innermost = _mm256_fmadd_ps(two, cos_2sm_sq, _mm256_set1_ps(-1.0f));
486
+ // middle = cos(2σₘ) + C × cos(σ) × innermost
487
+ __m256 middle = _mm256_fmadd_ps(_mm256_mul_ps(correction_factor, cos_angular_distance), innermost,
488
+ cos_double_angular_midpoint);
489
+ // inner = C × sin(σ) × middle
490
+ __m256 inner = _mm256_mul_ps(_mm256_mul_ps(correction_factor, sin_angular_distance), middle);
491
+
492
+ // λ' = L + (1-C) * f * sin_α * (σ + inner)
493
+ __m256 lambda_new = _mm256_fmadd_ps(
494
+ _mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(one, correction_factor), flattening), sin_azimuth),
495
+ _mm256_add_ps(angular_distance, inner), longitude_difference);
496
+
497
+ // Check convergence: |λ - λ'| < threshold
498
+ __m256 lambda_diff_abs = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), _mm256_sub_ps(lambda_new, lambda));
499
+ __m256 newly_converged = _mm256_cmp_ps(lambda_diff_abs, convergence_threshold, _CMP_LT_OS);
500
+ converged_mask = _mm256_or_ps(converged_mask, newly_converged);
501
+
502
+ // Only update lambda for non-converged lanes
503
+ lambda = _mm256_blendv_ps(lambda_new, lambda, converged_mask);
504
+ }
505
+
506
+ // Final distance calculation
507
+ // u² = cos²α * (a² - b²) / b²
508
+ __m256 a_sq = _mm256_mul_ps(equatorial_radius, equatorial_radius);
509
+ __m256 b_sq = _mm256_mul_ps(polar_radius, polar_radius);
510
+ __m256 u_squared = _mm256_div_ps(_mm256_mul_ps(cos_squared_azimuth, _mm256_sub_ps(a_sq, b_sq)), b_sq);
511
+
512
+ // A = 1 + u²/16384 * (4096 + u²*(-768 + u²*(320 - 175*u²)))
513
+ __m256 series_a = _mm256_fmadd_ps(u_squared, _mm256_set1_ps(-175.0f), _mm256_set1_ps(320.0f));
514
+ series_a = _mm256_fmadd_ps(u_squared, series_a, _mm256_set1_ps(-768.0f));
515
+ series_a = _mm256_fmadd_ps(u_squared, series_a, _mm256_set1_ps(4096.0f));
516
+ series_a = _mm256_fmadd_ps(_mm256_div_ps(u_squared, _mm256_set1_ps(16384.0f)), series_a, one);
517
+
518
+ // B = u²/1024 * (256 + u²*(-128 + u²*(74 - 47*u²)))
519
+ __m256 series_b = _mm256_fmadd_ps(u_squared, _mm256_set1_ps(-47.0f), _mm256_set1_ps(74.0f));
520
+ series_b = _mm256_fmadd_ps(u_squared, series_b, _mm256_set1_ps(-128.0f));
521
+ series_b = _mm256_fmadd_ps(u_squared, series_b, _mm256_set1_ps(256.0f));
522
+ series_b = _mm256_mul_ps(_mm256_div_ps(u_squared, _mm256_set1_ps(1024.0f)), series_b);
523
+
524
+ // Δσ = B × sin(σ) × (cos(2σₘ) +
525
+ // B/4 × (cos(σ) × (-1 + 2 × cos²(2σₘ)) - B/6 × cos(2σₘ) × (-3 + 4 × sin²(σ)) × (-3 + 4 × cos²(2σₘ))))
526
+ __m256 cos_2sm_sq = _mm256_mul_ps(cos_double_angular_midpoint, cos_double_angular_midpoint);
527
+ __m256 sin_sq = _mm256_mul_ps(sin_angular_distance, sin_angular_distance);
528
+ __m256 term1 = _mm256_fmadd_ps(two, cos_2sm_sq, _mm256_set1_ps(-1.0f));
529
+ term1 = _mm256_mul_ps(cos_angular_distance, term1);
530
+ __m256 term2 = _mm256_fmadd_ps(four, sin_sq, _mm256_set1_ps(-3.0f));
531
+ __m256 term3 = _mm256_fmadd_ps(four, cos_2sm_sq, _mm256_set1_ps(-3.0f));
532
+ term2 = _mm256_mul_ps(_mm256_mul_ps(_mm256_div_ps(series_b, six), cos_double_angular_midpoint),
533
+ _mm256_mul_ps(term2, term3));
534
+ __m256 delta_sigma = _mm256_mul_ps(
535
+ series_b, _mm256_mul_ps(sin_angular_distance, _mm256_add_ps(cos_double_angular_midpoint,
536
+ _mm256_mul_ps(_mm256_div_ps(series_b, four),
537
+ _mm256_sub_ps(term1, term2)))));
538
+
539
+ // s = b * A * (σ - Δσ)
540
+ __m256 distances = _mm256_mul_ps(_mm256_mul_ps(polar_radius, series_a),
541
+ _mm256_sub_ps(angular_distance, delta_sigma));
542
+
543
+ // Set coincident points to zero
544
+ distances = _mm256_blendv_ps(distances, _mm256_setzero_ps(), coincident_mask);
545
+
546
+ return distances;
547
+ }
548
+
549
+ NK_PUBLIC void nk_vincenty_f32_haswell( //
550
+ nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
551
+ nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
552
+ nk_size_t n, nk_f32_t *results) {
553
+
554
+ while (n >= 8) {
555
+ __m256 first_latitudes = _mm256_loadu_ps(a_lats);
556
+ __m256 first_longitudes = _mm256_loadu_ps(a_lons);
557
+ __m256 second_latitudes = _mm256_loadu_ps(b_lats);
558
+ __m256 second_longitudes = _mm256_loadu_ps(b_lons);
559
+
560
+ __m256 distances = nk_vincenty_f32x8_haswell_(first_latitudes, first_longitudes, second_latitudes,
561
+ second_longitudes);
562
+ _mm256_storeu_ps(results, distances);
563
+
564
+ a_lats += 8, a_lons += 8, b_lats += 8, b_lons += 8, results += 8, n -= 8;
565
+ }
566
+
567
+ // Handle remaining elements with partial loads (n can be 1-7 here)
568
+ if (n > 0) {
569
+ nk_b256_vec_t a_lat_vec, a_lon_vec, b_lat_vec, b_lon_vec, result_vec;
570
+ nk_partial_load_b32x8_serial_(a_lats, &a_lat_vec, n);
571
+ nk_partial_load_b32x8_serial_(a_lons, &a_lon_vec, n);
572
+ nk_partial_load_b32x8_serial_(b_lats, &b_lat_vec, n);
573
+ nk_partial_load_b32x8_serial_(b_lons, &b_lon_vec, n);
574
+ __m256 distances = nk_vincenty_f32x8_haswell_(a_lat_vec.ymm_ps, a_lon_vec.ymm_ps, b_lat_vec.ymm_ps,
575
+ b_lon_vec.ymm_ps);
576
+ result_vec.ymm_ps = distances;
577
+ nk_partial_store_b32x8_serial_(&result_vec, results, n);
578
+ }
579
+ }
580
+
581
+ #if defined(__clang__)
582
+ #pragma clang attribute pop
583
+ #elif defined(__GNUC__)
584
+ #pragma GCC pop_options
585
+ #endif
586
+
587
+ #if defined(__cplusplus)
588
+ } // extern "C"
589
+ #endif
590
+
591
+ #endif // NK_TARGET_HASWELL
592
+ #endif // NK_TARGET_X86_
593
+ #endif // NK_GEOSPATIAL_HASWELL_H