numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,453 @@
1
+ /**
2
+ * @brief SIMD-accelerated Geospatial Distances.
3
+ * @file include/numkong/geospatial.h
4
+ * @author Ash Vardanian
5
+ * @date July 1, 2023
6
+ *
7
+ * Contains the following distance functions:
8
+ *
9
+ * - Haversine (Great Circle) distance for 2 points
10
+ * - Haversine (Great Circle) distance for 2 arrays of points
11
+ * - Vincenty's distance function for Oblate Spheroid Geodesics
12
+ *
13
+ * All outputs are in meters, and the input coordinates are in radians.
14
+ *
15
+ * For dtypes:
16
+ *
17
+ * - 64-bit IEEE-754 floating point → 64-bit
18
+ * - 32-bit IEEE-754 floating point → 32-bit
19
+ *
20
+ * Precision policy:
21
+ *
22
+ * - `f32` remains the throughput-oriented lane and intentionally stays narrow end-to-end.
23
+ * - `f64` is the higher-accuracy lane for the same formulas.
24
+ * - We do not widen `f32` outputs here because the dominant error comes from the geodesic model
25
+ * and transcendental approximations, not from long horizontal reductions.
26
+ *
27
+ * For hardware architectures:
28
+ *
29
+ * - Arm: NEON
30
+ * - x86: Haswell, Skylake
+ * - RISC-V: RVV
+ * - WebAssembly: Relaxed SIMD (V128)
31
+ *
32
+ * @section haversine_similarity Low-Accuracy High-Performance Haversine Similarity
33
+ *
34
+ * In most cases, for distance computations, we don't need the exact Haversine formula.
35
+ * The very last part of the computation applies `asin(√x)` non-linear transformation.
36
+ * Both `asin` and `sqrt` are monotonically increasing functions, so their composition is also
37
+ * monotonically increasing. This means, for relative similarity/closeness computation we
38
+ * can avoid that expensive last step.
39
+ *
40
+ * @section trig_approximations Trigonometric Approximations & SIMD Vectorization
41
+ *
42
+ * The trigonometric functions (sin, cos, atan2) use polynomial approximations with SLEEF-level
43
+ * error bounds (~3.5 ULP). For f64, this translates to ~1e-15 absolute error; for f32, ~1e-7.
44
+ *
45
+ * @section accuracy_comparison Accuracy Comparison: Haversine vs Vincenty
46
+ *
47
+ * Both algorithms compute geodesic distances, but with different Earth models:
48
+ *
49
+ * - Haversine: Sphere (R=6335km), 0.3% - 0.6% vs WGS-84, fast approximation, ranking
50
+ * - Vincenty: WGS-84 Ellipsoid, 0.01% - 0.2% vs WGS-84, high-precision navigation
51
+ *
52
+ * Vincenty is ~3-20x more accurate than Haversine for most routes. The improvement is most
53
+ * significant for long-distance routes and near-polar paths where Earth's oblateness matters.
54
+ *
55
+ * @note SIMD implementations may have slightly different results than serial due to
56
+ * floating-point ordering in iterative algorithms. For Vincenty, expect <0.001%
57
+ * difference between SIMD and serial implementations.
58
+ *
59
+ * @section vincenty_precision High-Precision Vincenty's Formulae & Earth Ellipsoid
60
+ *
61
+ * Several approximations of the Earth Ellipsoid exist, each defined by the Equatorial radius (m),
62
+ * Polar radius (m), and Inverse flattening. The earliest ones date back to 1738, when Pierre Louis
63
+ * Maupertuis in France suggested a shape, that is only 0.3% different from the most accurate modern
64
+ * estimates by the International Earth Rotation and Reference Systems Service (IERS).
65
+ * The Global Positioning System (GPS) uses the World Geodetic System's (WGS) WGS-84 standard.
66
+ * NumKong uses the newer & more accurate @b IERS-2003 standard, but allows overriding default parameters:
67
+ *
68
+ * #define NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS (6378136.6)
69
+ * #define NK_EARTH_ELLIPSOID_POLAR_RADIUS (6356751.9)
70
+ * #define NK_EARTH_ELLIPSOID_INVERSE_FLATTENING (298.25642)
71
+ *
72
+ * To revert from oblate spheroids to spheres, use `NK_EARTH_MEDIATORIAL_RADIUS`.
73
+ *
74
+ * @section x86_instructions Relevant x86 Instructions
75
+ *
76
+ * Haversine and Vincenty formulas require sqrt for the final distance calculation and division
77
+ * for Vincenty's iterative convergence. These are the most expensive operations (12-23 cycles)
78
+ * but only execute once per point-pair. The polynomial trig approximations use FMA chains.
79
+ * Note: ZMM sqrt is faster on Genoa (15c) than Ice Lake (19c) due to better 512-bit support.
80
+ *
81
+ * Intrinsic Instruction Ice Genoa
82
+ * _mm256_sqrt_ps VSQRTPS (YMM, YMM) 12c @ p0 15c @ p01
83
+ * _mm256_sqrt_pd VSQRTPD (YMM, YMM) 13c @ p0 21c @ p01
84
+ * _mm512_sqrt_ps VSQRTPS (ZMM, ZMM) 19c @ p05 15c @ p01
85
+ * _mm512_sqrt_pd VSQRTPD (ZMM, ZMM) 23c @ p05 21c @ p01
86
+ * _mm256_div_ps VDIVPS (YMM, YMM, YMM) 11c @ p0 11c @ p01
87
+ * _mm256_div_pd VDIVPD (YMM, YMM, YMM) 13c @ p0 13c @ p01
88
+ * _mm256_fmadd_ps VFMADD231PS (YMM, YMM, YMM) 4c @ p01 4c @ p01
89
+ * _mm256_fmadd_pd VFMADD231PD (YMM, YMM, YMM) 4c @ p01 4c @ p01
90
+ *
91
+ * @section arm_instructions Relevant ARM NEON/SVE Instructions
92
+ *
93
+ * ARM sqrt (FSQRT) has low throughput as it uses a dedicated V02 execution unit. This is
94
+ * acceptable since sqrt only appears once per distance calculation. FMA chains for trig
95
+ * polynomial evaluation pipeline well across all 4 V-units.
96
+ *
97
+ * Intrinsic Instruction M1 Firestorm Graviton 3 Graviton 4
98
+ * vfmaq_f32 FMLA.S (vec) 4c @ V0123 4c @ V0123 4c @ V0123
99
+ * vfmaq_f64 FMLA.D (vec) 4c @ V0123 4c @ V0123 4c @ V0123
100
+ * vsqrtq_f32 FSQRT.S (vec) 10c @ V02 10c @ V02 9c @ V02
101
+ * vsqrtq_f64 FSQRT.D (vec) 13c @ V02 16c @ V02 16c @ V02
102
+ *
103
+ * @section references References
104
+ *
105
+ * - x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/
106
+ * - Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics/
107
+ * - Earth Ellipsoid: https://en.wikipedia.org/wiki/Earth_ellipsoid
108
+ * - Oblate Spheroid Geodesic: https://mathworld.wolfram.com/OblateSpheroidGeodesic.html
109
+ * - Staging experiments: https://github.com/ashvardanian/HaversineMathKong
110
+ * - Speeding up atan2f by 50x: https://mazzo.li/posts/vectorized-atan2.html
111
+ * - Simplifying the GNU C Sine Function: https://www.awelm.com/posts/simplifying-the-gnu-c-sine-function/
112
+ *
113
+ */
114
+ #ifndef NK_GEOSPATIAL_H
115
+ #define NK_GEOSPATIAL_H
116
+
117
+ #include "numkong/types.h"
118
+ #include "numkong/trigonometry.h"
119
+
120
+ /* Earth Ellipsoid Constants
121
+ * The default values use the IERS-2003 standard, but can be overridden before including this header.
122
+ */
123
+ #ifndef NK_EARTH_MEDIATORIAL_RADIUS
124
+ #define NK_EARTH_MEDIATORIAL_RADIUS (6335439.0)
125
+ #endif
126
+ #ifndef NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS
127
+ #define NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS (6378136.6)
128
+ #endif
129
+ #ifndef NK_EARTH_ELLIPSOID_POLAR_RADIUS
130
+ #define NK_EARTH_ELLIPSOID_POLAR_RADIUS (6356751.9)
131
+ #endif
132
+ #ifndef NK_EARTH_ELLIPSOID_INVERSE_FLATTENING
133
+ #define NK_EARTH_ELLIPSOID_INVERSE_FLATTENING (298.25642)
134
+ #endif
135
+ #ifndef NK_VINCENTY_MAX_ITERATIONS
136
+ #define NK_VINCENTY_MAX_ITERATIONS 100
137
+ #endif
138
+ #ifndef NK_VINCENTY_CONVERGENCE_THRESHOLD_F64
139
+ #define NK_VINCENTY_CONVERGENCE_THRESHOLD_F64 1e-12
140
+ #endif
141
+ #ifndef NK_VINCENTY_CONVERGENCE_THRESHOLD_F32
142
+ #define NK_VINCENTY_CONVERGENCE_THRESHOLD_F32 1e-7f
143
+ #endif
144
+
145
+ #if defined(__cplusplus)
146
+ extern "C" {
147
+ #endif
148
+
149
+ /**
150
+ * @brief Haversine distance between two arrays of points on a sphere.
151
+ *
152
+ * @param[in] a_lats Latitudes of the first points, in radians.
153
+ * @param[in] a_lons Longitudes of the first points, in radians.
154
+ * @param[in] b_lats Latitudes of the second points, in radians.
155
+ * @param[in] b_lons Longitudes of the second points, in radians.
156
+ * @param[in] n The number of point pairs.
157
+ * @param[out] results Output distances in meters, length `n`.
158
+ *
159
+ * @note Inputs are in radians and outputs are in meters.
160
+ */
161
+ NK_DYNAMIC void nk_haversine_f64( //
162
+ nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
163
+ nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
164
+ nk_size_t n, nk_f64_t *results);
165
+
166
+ /** @copydoc nk_haversine_f64 */
167
+ NK_DYNAMIC void nk_haversine_f32( //
168
+ nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
169
+ nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
170
+ nk_size_t n, nk_f32_t *results);
171
+
172
+ /**
173
+ * @brief Vincenty distance between two arrays of points on an oblate spheroid.
174
+ *
175
+ * @param[in] a_lats Latitudes of the first points, in radians.
176
+ * @param[in] a_lons Longitudes of the first points, in radians.
177
+ * @param[in] b_lats Latitudes of the second points, in radians.
178
+ * @param[in] b_lons Longitudes of the second points, in radians.
179
+ * @param[in] n The number of point pairs.
180
+ * @param[out] results Output distances in meters, length `n`.
181
+ *
182
+ * @note Inputs are in radians and outputs are in meters.
183
+ * @note Uses the Earth ellipsoid parameters configured via `NK_EARTH_ELLIPSOID_*`.
184
+ */
185
+ NK_DYNAMIC void nk_vincenty_f64( //
186
+ nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
187
+ nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
188
+ nk_size_t n, nk_f64_t *results);
189
+
190
+ /** @copydoc nk_vincenty_f64 */
191
+ NK_DYNAMIC void nk_vincenty_f32( //
192
+ nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
193
+ nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
194
+ nk_size_t n, nk_f32_t *results);
195
+
196
+ /** @copydoc nk_haversine_f64 */
197
+ NK_PUBLIC void nk_haversine_f64_serial( //
198
+ nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
199
+ nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
200
+ nk_size_t n, nk_f64_t *results);
201
+ /** @copydoc nk_vincenty_f64 */
202
+ NK_PUBLIC void nk_vincenty_f64_serial( //
203
+ nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
204
+ nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
205
+ nk_size_t n, nk_f64_t *results);
206
+ /** @copydoc nk_haversine_f32 */
207
+ NK_PUBLIC void nk_haversine_f32_serial( //
208
+ nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
209
+ nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
210
+ nk_size_t n, nk_f32_t *results);
211
+ /** @copydoc nk_vincenty_f32 */
212
+ NK_PUBLIC void nk_vincenty_f32_serial( //
213
+ nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
214
+ nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
215
+ nk_size_t n, nk_f32_t *results);
216
+
217
+ #if NK_TARGET_NEON
218
+ /** @copydoc nk_haversine_f64 */
219
+ NK_PUBLIC void nk_haversine_f64_neon( //
220
+ nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
221
+ nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
222
+ nk_size_t n, nk_f64_t *results);
223
+ /** @copydoc nk_vincenty_f64 */
224
+ NK_PUBLIC void nk_vincenty_f64_neon( //
225
+ nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
226
+ nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
227
+ nk_size_t n, nk_f64_t *results);
228
+ /** @copydoc nk_haversine_f32 */
229
+ NK_PUBLIC void nk_haversine_f32_neon( //
230
+ nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
231
+ nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
232
+ nk_size_t n, nk_f32_t *results);
233
+ /** @copydoc nk_vincenty_f32 */
234
+ NK_PUBLIC void nk_vincenty_f32_neon( //
235
+ nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
236
+ nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
237
+ nk_size_t n, nk_f32_t *results);
238
+ #endif // NK_TARGET_NEON
239
+
240
+ #if NK_TARGET_HASWELL
241
+ /** @copydoc nk_haversine_f64 */
242
+ NK_PUBLIC void nk_haversine_f64_haswell( //
243
+ nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
244
+ nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
245
+ nk_size_t n, nk_f64_t *results);
246
+ /** @copydoc nk_vincenty_f64 */
247
+ NK_PUBLIC void nk_vincenty_f64_haswell( //
248
+ nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
249
+ nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
250
+ nk_size_t n, nk_f64_t *results);
251
+ /** @copydoc nk_haversine_f32 */
252
+ NK_PUBLIC void nk_haversine_f32_haswell( //
253
+ nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
254
+ nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
255
+ nk_size_t n, nk_f32_t *results);
256
+ /** @copydoc nk_vincenty_f32 */
257
+ NK_PUBLIC void nk_vincenty_f32_haswell( //
258
+ nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
259
+ nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
260
+ nk_size_t n, nk_f32_t *results);
261
+ #endif // NK_TARGET_HASWELL
262
+
263
+ #if NK_TARGET_SKYLAKE
264
+ /** @copydoc nk_haversine_f64 */
265
+ NK_PUBLIC void nk_haversine_f64_skylake( //
266
+ nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
267
+ nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
268
+ nk_size_t n, nk_f64_t *results);
269
+ /** @copydoc nk_vincenty_f64 */
270
+ NK_PUBLIC void nk_vincenty_f64_skylake( //
271
+ nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
272
+ nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
273
+ nk_size_t n, nk_f64_t *results);
274
+ /** @copydoc nk_haversine_f32 */
275
+ NK_PUBLIC void nk_haversine_f32_skylake( //
276
+ nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
277
+ nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
278
+ nk_size_t n, nk_f32_t *results);
279
+ /** @copydoc nk_vincenty_f32 */
280
+ NK_PUBLIC void nk_vincenty_f32_skylake( //
281
+ nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
282
+ nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
283
+ nk_size_t n, nk_f32_t *results);
284
+ #endif // NK_TARGET_SKYLAKE
285
+
286
+ #if NK_TARGET_V128RELAXED
287
+ /** @copydoc nk_haversine_f64 */
288
+ NK_PUBLIC void nk_haversine_f64_v128relaxed( //
289
+ nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
290
+ nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
291
+ nk_size_t n, nk_f64_t *results);
292
+ /** @copydoc nk_vincenty_f64 */
293
+ NK_PUBLIC void nk_vincenty_f64_v128relaxed( //
294
+ nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
295
+ nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
296
+ nk_size_t n, nk_f64_t *results);
297
+ /** @copydoc nk_haversine_f32 */
298
+ NK_PUBLIC void nk_haversine_f32_v128relaxed( //
299
+ nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
300
+ nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
301
+ nk_size_t n, nk_f32_t *results);
302
+ /** @copydoc nk_vincenty_f32 */
303
+ NK_PUBLIC void nk_vincenty_f32_v128relaxed( //
304
+ nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
305
+ nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
306
+ nk_size_t n, nk_f32_t *results);
307
+ #endif // NK_TARGET_V128RELAXED
308
+
309
+ #if NK_TARGET_RVV
310
+ /** @copydoc nk_haversine_f64 */
311
+ NK_PUBLIC void nk_haversine_f64_rvv( //
312
+ nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
313
+ nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
314
+ nk_size_t n, nk_f64_t *results);
315
+ /** @copydoc nk_vincenty_f64 */
316
+ NK_PUBLIC void nk_vincenty_f64_rvv( //
317
+ nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
318
+ nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
319
+ nk_size_t n, nk_f64_t *results);
320
+ /** @copydoc nk_haversine_f32 */
321
+ NK_PUBLIC void nk_haversine_f32_rvv( //
322
+ nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
323
+ nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
324
+ nk_size_t n, nk_f32_t *results);
325
+ /** @copydoc nk_vincenty_f32 */
326
+ NK_PUBLIC void nk_vincenty_f32_rvv( //
327
+ nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
328
+ nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
329
+ nk_size_t n, nk_f32_t *results);
330
+ #endif // NK_TARGET_RVV
331
+
332
+ /**
333
+ * @brief Returns the output dtype for Haversine distance, which matches the input dtype for `f64` and `f32`.
334
+ */
335
+ NK_INTERNAL nk_dtype_t nk_haversine_output_dtype(nk_dtype_t dtype) { // `dtype` is the input coordinate dtype
336
+ switch (dtype) {
337
+ case nk_f64_k: return nk_f64_k; // `f64` coordinates yield `f64` distances
338
+ case nk_f32_k: return nk_f32_k; // `f32` coordinates yield `f32` distances, no widening
339
+ default: return nk_dtype_unknown_k; // only `f64` and `f32` are mapped; everything else is reported as unknown
340
+ }
341
+ }
342
+
343
+ /**
344
+ * @brief Returns the output dtype for Vincenty distance, which matches the input dtype for `f64` and `f32`.
345
+ */
346
+ NK_INTERNAL nk_dtype_t nk_vincenty_output_dtype(nk_dtype_t dtype) { // `dtype` is the input coordinate dtype
347
+ switch (dtype) {
348
+ case nk_f64_k: return nk_f64_k; // `f64` coordinates yield `f64` distances
349
+ case nk_f32_k: return nk_f32_k; // `f32` coordinates yield `f32` distances, no widening
350
+ default: return nk_dtype_unknown_k; // only `f64` and `f32` are mapped; everything else is reported as unknown
351
+ }
352
+ }
353
+
354
+ #if defined(__cplusplus)
355
+ } // extern "C"
356
+ #endif
357
+
358
+ #include "numkong/geospatial/serial.h"
359
+ #include "numkong/geospatial/neon.h"
360
+ #include "numkong/geospatial/haswell.h"
361
+ #include "numkong/geospatial/skylake.h"
362
+ #include "numkong/geospatial/v128relaxed.h"
363
+ #include "numkong/geospatial/rvv.h"
364
+
365
+ #if defined(__cplusplus)
366
+ extern "C" {
367
+ #endif
368
+
369
+ #if !NK_DYNAMIC_DISPATCH
370
+
371
+ NK_PUBLIC void nk_haversine_f64( // Compile-time dispatch; only defined when `NK_DYNAMIC_DISPATCH` is zero
372
+ nk_f64_t const *a_lats, nk_f64_t const *a_lons, // First points: latitudes and longitudes, in radians
373
+ nk_f64_t const *b_lats, nk_f64_t const *b_lons, // Second points: latitudes and longitudes, in radians
374
+ nk_size_t n, nk_f64_t *results) { // `n` point pairs in, `n` distances in meters out
375
+ #if NK_TARGET_SKYLAKE // First enabled target in this chain wins: Skylake, Haswell, NEON, V128RELAXED, RVV
376
+ nk_haversine_f64_skylake(a_lats, a_lons, b_lats, b_lons, n, results);
377
+ #elif NK_TARGET_HASWELL
378
+ nk_haversine_f64_haswell(a_lats, a_lons, b_lats, b_lons, n, results);
379
+ #elif NK_TARGET_NEON
380
+ nk_haversine_f64_neon(a_lats, a_lons, b_lats, b_lons, n, results);
381
+ #elif NK_TARGET_V128RELAXED
382
+ nk_haversine_f64_v128relaxed(a_lats, a_lons, b_lats, b_lons, n, results);
383
+ #elif NK_TARGET_RVV
384
+ nk_haversine_f64_rvv(a_lats, a_lons, b_lats, b_lons, n, results);
385
+ #else // None of the SIMD targets above is enabled, so use the serial kernel
386
+ nk_haversine_f64_serial(a_lats, a_lons, b_lats, b_lons, n, results);
387
+ #endif
388
+ }
389
+
390
+ NK_PUBLIC void nk_haversine_f32( // Compile-time dispatch; only defined when `NK_DYNAMIC_DISPATCH` is zero
391
+ nk_f32_t const *a_lats, nk_f32_t const *a_lons, // First points: latitudes and longitudes, in radians
392
+ nk_f32_t const *b_lats, nk_f32_t const *b_lons, // Second points: latitudes and longitudes, in radians
393
+ nk_size_t n, nk_f32_t *results) { // `n` point pairs in, `n` distances in meters out
394
+ #if NK_TARGET_SKYLAKE // First enabled target in this chain wins: Skylake, Haswell, NEON, V128RELAXED, RVV
395
+ nk_haversine_f32_skylake(a_lats, a_lons, b_lats, b_lons, n, results);
396
+ #elif NK_TARGET_HASWELL
397
+ nk_haversine_f32_haswell(a_lats, a_lons, b_lats, b_lons, n, results);
398
+ #elif NK_TARGET_NEON
399
+ nk_haversine_f32_neon(a_lats, a_lons, b_lats, b_lons, n, results);
400
+ #elif NK_TARGET_V128RELAXED
401
+ nk_haversine_f32_v128relaxed(a_lats, a_lons, b_lats, b_lons, n, results);
402
+ #elif NK_TARGET_RVV
403
+ nk_haversine_f32_rvv(a_lats, a_lons, b_lats, b_lons, n, results);
404
+ #else // None of the SIMD targets above is enabled, so use the serial kernel
405
+ nk_haversine_f32_serial(a_lats, a_lons, b_lats, b_lons, n, results);
406
+ #endif
407
+ }
408
+
409
+ NK_PUBLIC void nk_vincenty_f64( // Compile-time dispatch; only defined when `NK_DYNAMIC_DISPATCH` is zero
410
+ nk_f64_t const *a_lats, nk_f64_t const *a_lons, // First points: latitudes and longitudes, in radians
411
+ nk_f64_t const *b_lats, nk_f64_t const *b_lons, // Second points: latitudes and longitudes, in radians
412
+ nk_size_t n, nk_f64_t *results) { // `n` point pairs in, `n` distances in meters out
413
+ #if NK_TARGET_SKYLAKE // First enabled target in this chain wins: Skylake, Haswell, NEON, V128RELAXED, RVV
414
+ nk_vincenty_f64_skylake(a_lats, a_lons, b_lats, b_lons, n, results);
415
+ #elif NK_TARGET_HASWELL
416
+ nk_vincenty_f64_haswell(a_lats, a_lons, b_lats, b_lons, n, results);
417
+ #elif NK_TARGET_NEON
418
+ nk_vincenty_f64_neon(a_lats, a_lons, b_lats, b_lons, n, results);
419
+ #elif NK_TARGET_V128RELAXED
420
+ nk_vincenty_f64_v128relaxed(a_lats, a_lons, b_lats, b_lons, n, results);
421
+ #elif NK_TARGET_RVV
422
+ nk_vincenty_f64_rvv(a_lats, a_lons, b_lats, b_lons, n, results);
423
+ #else // None of the SIMD targets above is enabled, so use the serial kernel
424
+ nk_vincenty_f64_serial(a_lats, a_lons, b_lats, b_lons, n, results);
425
+ #endif
426
+ }
427
+
428
+ NK_PUBLIC void nk_vincenty_f32( // Compile-time dispatch; only defined when `NK_DYNAMIC_DISPATCH` is zero
429
+ nk_f32_t const *a_lats, nk_f32_t const *a_lons, // First points: latitudes and longitudes, in radians
430
+ nk_f32_t const *b_lats, nk_f32_t const *b_lons, // Second points: latitudes and longitudes, in radians
431
+ nk_size_t n, nk_f32_t *results) { // `n` point pairs in, `n` distances in meters out
432
+ #if NK_TARGET_SKYLAKE // First enabled target in this chain wins: Skylake, Haswell, NEON, V128RELAXED, RVV
433
+ nk_vincenty_f32_skylake(a_lats, a_lons, b_lats, b_lons, n, results);
434
+ #elif NK_TARGET_HASWELL
435
+ nk_vincenty_f32_haswell(a_lats, a_lons, b_lats, b_lons, n, results);
436
+ #elif NK_TARGET_NEON
437
+ nk_vincenty_f32_neon(a_lats, a_lons, b_lats, b_lons, n, results);
438
+ #elif NK_TARGET_V128RELAXED
439
+ nk_vincenty_f32_v128relaxed(a_lats, a_lons, b_lats, b_lons, n, results);
440
+ #elif NK_TARGET_RVV
441
+ nk_vincenty_f32_rvv(a_lats, a_lons, b_lats, b_lons, n, results);
442
+ #else // None of the SIMD targets above is enabled, so use the serial kernel
443
+ nk_vincenty_f32_serial(a_lats, a_lons, b_lats, b_lons, n, results);
444
+ #endif
445
+ }
446
+
447
+ #endif // !NK_DYNAMIC_DISPATCH
448
+
449
+ #if defined(__cplusplus)
450
+ } // extern "C"
451
+ #endif
452
+
453
+ #endif
@@ -0,0 +1,235 @@
1
+ /**
2
+ * @brief Geospatial kernels: haversine, vincenty.
3
+ * @file include/numkong/geospatial.hpp
4
+ * @author Ash Vardanian
5
+ * @date February 5, 2026
6
+ */
7
+ #ifndef NK_GEOSPATIAL_HPP
8
+ #define NK_GEOSPATIAL_HPP
9
+
10
+ #include <cstdint> // `std::uint32_t`
11
+ #include <type_traits> // `std::is_same_v`
12
+
13
+ #include "numkong/geospatial.h"
14
+
15
+ #include "numkong/types.hpp"
16
+
17
+ namespace ashvardanian::numkong {
18
+
19
+ /**
20
+ * @brief Batched Haversine: 2R × arcsin(√(sin²(Δφ/2) + cos φ₁ × cos φ₂ × sin²(Δλ/2)))
21
+ * @param[in] a_lats,a_lons Arrays of latitudes/longitudes for first points (radians)
22
+ * @param[in] b_lats,b_lons Arrays of latitudes/longitudes for second points (radians)
23
+ * @param[in] d Number of point pairs
24
+ * @param[out] results Output array of distances (meters)
25
+ *
26
+ * @tparam in_type_ Input coordinate type (f32_t, f64_t)
27
+ * @tparam precision_type_ Precision type for scalar fallback computations, defaults to `in_type_`
28
+ * @tparam allow_simd_ Enable SIMD kernel dispatch when `prefer_simd_k`
29
+ *
30
+ * @note Uses spherical Earth model with mediatorial radius (6335439.0 m)
31
+ * @note Accuracy: 0.3-0.6% vs WGS-84, suitable for ranking/similarity
32
+ */
33
+ template <numeric_dtype in_type_, numeric_dtype precision_type_ = in_type_, allow_simd_t allow_simd_ = prefer_simd_k>
34
+ void haversine(in_type_ const *a_lats, in_type_ const *a_lons, in_type_ const *b_lats, in_type_ const *b_lons,
35
+ std::size_t d, in_type_ *results) noexcept {
36
+ constexpr bool simd = allow_simd_ == prefer_simd_k && std::is_same_v<in_type_, precision_type_>;
37
+
38
+ if constexpr (std::is_same_v<in_type_, f64_t> && simd)
39
+ nk_haversine_f64(&a_lats->raw_, &a_lons->raw_, &b_lats->raw_, &b_lons->raw_, d, &results->raw_);
40
+ else if constexpr (std::is_same_v<in_type_, f32_t> && simd)
41
+ nk_haversine_f32(&a_lats->raw_, &a_lons->raw_, &b_lats->raw_, &b_lons->raw_, d, &results->raw_);
42
+ // Scalar fallback
43
+ else {
44
+ precision_type_ const earth_radius = precision_type_(6335439.0); // mediatorial radius in meters
45
+
46
+ for (std::size_t i = 0; i < d; i++) {
47
+ precision_type_ first_latitude = precision_type_(a_lats[i]);
48
+ precision_type_ first_longitude = precision_type_(a_lons[i]);
49
+ precision_type_ second_latitude = precision_type_(b_lats[i]);
50
+ precision_type_ second_longitude = precision_type_(b_lons[i]);
51
+
52
+ precision_type_ latitude_delta = second_latitude - first_latitude;
53
+ precision_type_ longitude_delta = second_longitude - first_longitude;
54
+
55
+ // Haversine formula: a = sin²(Δlat/2) + cos(lat1)×cos(lat2)×sin²(Δlon/2)
56
+ precision_type_ sin_latitude_delta_half = (latitude_delta * precision_type_(0.5)).sin();
57
+ precision_type_ sin_longitude_delta_half = (longitude_delta * precision_type_(0.5)).sin();
58
+ precision_type_ cos_first_latitude = first_latitude.cos();
59
+ precision_type_ cos_second_latitude = second_latitude.cos();
60
+
61
+ precision_type_ haversine_term = sin_latitude_delta_half * sin_latitude_delta_half +
62
+ cos_first_latitude * cos_second_latitude * sin_longitude_delta_half *
63
+ sin_longitude_delta_half;
64
+
65
+ // Central angle: c = 2 * atan2(sqrt(a), sqrt(1-a))
66
+ precision_type_ sqrt_haversine = haversine_term.sqrt();
67
+ precision_type_ sqrt_complement = (precision_type_(1.0) - haversine_term).sqrt();
68
+ precision_type_ central_angle = precision_type_(2.0) * sqrt_haversine.atan2(sqrt_complement);
69
+
70
+ results[i] = in_type_(static_cast<double>(earth_radius * central_angle));
71
+ }
72
+ }
73
+ }
74
+
75
+ /**
76
+ * @brief Batched Vincenty distance (geodesic on WGS-84 ellipsoid)
77
+ * @param[in] a_lats,a_lons Arrays of latitudes/longitudes for first points (radians)
78
+ * @param[in] b_lats,b_lons Arrays of latitudes/longitudes for second points (radians)
79
+ * @param[in] d Number of point pairs
80
+ * @param[out] results Output array of distances (meters)
81
+ *
82
+ * @tparam in_type_ Input coordinate type (f32_t, f64_t)
83
+ * @tparam precision_type_ Precision type for scalar fallback computations, defaults to `in_type_`
84
+ * @tparam allow_simd_ Enable SIMD kernel dispatch when `prefer_simd_k`
85
+ *
86
+ * @note Uses WGS-84/IERS-2003 ellipsoid model
87
+ * @note Accuracy: 0.01-0.2% vs WGS-84, 3-20x more accurate than Haversine
88
+ * @note Iterative algorithm with max 100 iterations
89
+ */
90
+ template <numeric_dtype in_type_, numeric_dtype precision_type_ = in_type_, allow_simd_t allow_simd_ = prefer_simd_k>
91
+ void vincenty(in_type_ const *a_lats, in_type_ const *a_lons, in_type_ const *b_lats, in_type_ const *b_lons,
92
+ std::size_t d, in_type_ *results) noexcept {
93
+ constexpr bool simd = allow_simd_ == prefer_simd_k && std::is_same_v<in_type_, precision_type_>;
94
+
95
+ if constexpr (std::is_same_v<in_type_, f64_t> && simd)
96
+ nk_vincenty_f64(&a_lats->raw_, &a_lons->raw_, &b_lats->raw_, &b_lons->raw_, d, &results->raw_);
97
+ else if constexpr (std::is_same_v<in_type_, f32_t> && simd)
98
+ nk_vincenty_f32(&a_lats->raw_, &a_lons->raw_, &b_lats->raw_, &b_lons->raw_, d, &results->raw_);
99
+ // Scalar fallback
100
+ else {
101
+ precision_type_ const equatorial_radius = precision_type_(6378136.6);
102
+ precision_type_ const polar_radius = precision_type_(6356751.9);
103
+ precision_type_ const flattening = precision_type_(1.0) / precision_type_(298.25642);
104
+ precision_type_ const convergence_threshold = precision_type_(1e-12);
105
+ constexpr int max_iterations = 100;
106
+
107
+ for (std::size_t i = 0; i < d; i++) {
108
+ precision_type_ first_latitude = precision_type_(a_lats[i]);
109
+ precision_type_ second_latitude = precision_type_(b_lats[i]);
110
+ precision_type_ longitude_difference = precision_type_(b_lons[i]) - precision_type_(a_lons[i]);
111
+
112
+ // Reduced latitudes on the auxiliary sphere
113
+ precision_type_ tan_reduced_first = (precision_type_(1.0) - flattening) * first_latitude.tan();
114
+ precision_type_ tan_reduced_second = (precision_type_(1.0) - flattening) * second_latitude.tan();
115
+ precision_type_ cos_reduced_first = precision_type_(1.0) /
116
+ (precision_type_(1.0) + tan_reduced_first * tan_reduced_first).sqrt();
117
+ precision_type_ sin_reduced_first = tan_reduced_first * cos_reduced_first;
118
+ precision_type_ cos_reduced_second =
119
+ precision_type_(1.0) / (precision_type_(1.0) + tan_reduced_second * tan_reduced_second).sqrt();
120
+ precision_type_ sin_reduced_second = tan_reduced_second * cos_reduced_second;
121
+
122
+ // Iterative convergence of lambda (difference in longitude on auxiliary sphere)
123
+ precision_type_ lambda = longitude_difference;
124
+ precision_type_ lambda_previous = longitude_difference;
125
+ precision_type_ sin_angular_distance, cos_angular_distance, angular_distance;
126
+ precision_type_ sin_azimuth, cos_squared_azimuth, cos_double_angular_midpoint;
127
+ bool coincident = false;
128
+
129
+ for (unsigned int iteration = 0; iteration < max_iterations; iteration++) {
130
+ precision_type_ sin_lambda = lambda.sin();
131
+ precision_type_ cos_lambda = lambda.cos();
132
+
133
+ precision_type_ cross_term = cos_reduced_second * sin_lambda;
134
+ precision_type_ mixed_term = cos_reduced_first * sin_reduced_second -
135
+ sin_reduced_first * cos_reduced_second * cos_lambda;
136
+ sin_angular_distance = (cross_term * cross_term + mixed_term * mixed_term).sqrt();
137
+
138
+ if (sin_angular_distance == precision_type_(0.0)) {
139
+ coincident = true;
140
+ break;
141
+ }
142
+
143
+ cos_angular_distance = sin_reduced_first * sin_reduced_second +
144
+ cos_reduced_first * cos_reduced_second * cos_lambda;
145
+ angular_distance = sin_angular_distance.atan2(cos_angular_distance);
146
+
147
+ sin_azimuth = cos_reduced_first * cos_reduced_second * sin_lambda / sin_angular_distance;
148
+ cos_squared_azimuth = precision_type_(1.0) - sin_azimuth * sin_azimuth;
149
+
150
+ // Handle equatorial geodesic case
151
+ cos_double_angular_midpoint = (cos_squared_azimuth != precision_type_(0.0))
152
+ ? cos_angular_distance - precision_type_(2.0) * sin_reduced_first *
153
+ sin_reduced_second / cos_squared_azimuth
154
+ : precision_type_(0.0);
155
+
156
+ precision_type_ correction_factor =
157
+ flattening / precision_type_(16.0) * cos_squared_azimuth *
158
+ (precision_type_(4.0) +
159
+ flattening * (precision_type_(4.0) - precision_type_(3.0) * cos_squared_azimuth));
160
+
161
+ lambda_previous = lambda;
162
+ lambda = longitude_difference +
163
+ (precision_type_(1.0) - correction_factor) * flattening * sin_azimuth *
164
+ (angular_distance +
165
+ correction_factor * sin_angular_distance *
166
+ (cos_double_angular_midpoint +
167
+ correction_factor * cos_angular_distance *
168
+ (precision_type_(-1.0) + precision_type_(2.0) * cos_double_angular_midpoint *
169
+ cos_double_angular_midpoint)));
170
+
171
+ if ((lambda - lambda_previous).abs() < convergence_threshold) break;
172
+ }
173
+
174
+ if (coincident) {
175
+ results[i] = in_type_(0.0);
176
+ continue;
177
+ }
178
+
179
+ // Final distance calculation
180
+ precision_type_ u_squared = cos_squared_azimuth *
181
+ (equatorial_radius * equatorial_radius - polar_radius * polar_radius) /
182
+ (polar_radius * polar_radius);
183
+ precision_type_ series_a =
184
+ precision_type_(1.0) +
185
+ u_squared / precision_type_(16384.0) *
186
+ (precision_type_(4096.0) +
187
+ u_squared * (precision_type_(-768.0) +
188
+ u_squared * (precision_type_(320.0) - precision_type_(175.0) * u_squared)));
189
+ precision_type_ series_b = u_squared / precision_type_(1024.0) *
190
+ (precision_type_(256.0) +
191
+ u_squared *
192
+ (precision_type_(-128.0) +
193
+ u_squared * (precision_type_(74.0) - precision_type_(47.0) * u_squared)));
194
+
195
+ precision_type_ angular_correction =
196
+ series_b * sin_angular_distance *
197
+ (cos_double_angular_midpoint +
198
+ series_b / precision_type_(4.0) *
199
+ (cos_angular_distance *
200
+ (precision_type_(-1.0) +
201
+ precision_type_(2.0) * cos_double_angular_midpoint * cos_double_angular_midpoint) -
202
+ series_b / precision_type_(6.0) * cos_double_angular_midpoint *
203
+ (precision_type_(-3.0) + precision_type_(4.0) * sin_angular_distance * sin_angular_distance) *
204
+ (precision_type_(-3.0) +
205
+ precision_type_(4.0) * cos_double_angular_midpoint * cos_double_angular_midpoint)));
206
+
207
+ results[i] = in_type_(
208
+ static_cast<double>(polar_radius * series_a * (angular_distance - angular_correction)));
209
+ }
210
+ }
211
+ }
212
+
213
+ } // namespace ashvardanian::numkong
214
+
215
+ #include "numkong/tensor.hpp"
216
+
217
+ namespace ashvardanian::numkong {
218
+
219
+ template <numeric_dtype in_type_, numeric_dtype precision_type_ = in_type_, allow_simd_t allow_simd_ = prefer_simd_k>
220
+ void haversine(vector_view<in_type_> a_lats, vector_view<in_type_> a_lons, vector_view<in_type_> b_lats,
221
+ vector_view<in_type_> b_lons, in_type_ *results) noexcept {
222
+ haversine<in_type_, precision_type_, allow_simd_>(a_lats.data(), a_lons.data(), b_lats.data(), b_lons.data(),
223
+ a_lats.size(), results);
224
+ }
225
+
226
+ template <numeric_dtype in_type_, numeric_dtype precision_type_ = in_type_, allow_simd_t allow_simd_ = prefer_simd_k>
227
+ void vincenty(vector_view<in_type_> a_lats, vector_view<in_type_> a_lons, vector_view<in_type_> b_lats,
228
+ vector_view<in_type_> b_lons, in_type_ *results) noexcept {
229
+ vincenty<in_type_, precision_type_, allow_simd_>(a_lats.data(), a_lons.data(), b_lats.data(), b_lons.data(),
230
+ a_lats.size(), results);
231
+ }
232
+
233
+ } // namespace ashvardanian::numkong
234
+
235
+ #endif // NK_GEOSPATIAL_HPP