numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,571 @@
1
+ /**
2
+ * @brief SIMD-accelerated Geospatial Distances for NEON.
3
+ * @file include/numkong/geospatial/neon.h
4
+ * @author Ash Vardanian
5
+ * @date February 6, 2026
6
+ *
7
+ * @sa include/numkong/geospatial.h
8
+ *
9
+ * @section geospatial_neon_instructions Key NEON Geospatial Instructions
10
+ *
11
+ * Intrinsic Instruction M1 Firestorm Graviton 3 Graviton 4
12
+ * vfmaq_f32 FMLA.S (vec) 4c @ V0123 4c @ V0123 4c @ V0123
13
+ * vfmaq_f64 FMLA.D (vec) 4c @ V0123 4c @ V0123 4c @ V0123
14
+ * vsqrtq_f32 FSQRT.S (vec) 10c @ V02 10c @ V02 9c @ V02
15
+ * vsqrtq_f64 FSQRT.D (vec) 13c @ V02 16c @ V02 16c @ V02
16
+ */
17
+ #ifndef NK_GEOSPATIAL_NEON_H
18
+ #define NK_GEOSPATIAL_NEON_H
19
+
20
+ #if NK_TARGET_ARM_
21
+ #if NK_TARGET_NEON
22
+
23
+ #include "numkong/types.h"
24
+ #include "numkong/trigonometry/neon.h" // `nk_sin_f64x2_neon_`, `nk_cos_f64x2_neon_`, `nk_atan2_f64x2_neon_`, etc.
25
+
26
+ #if defined(__cplusplus)
27
+ extern "C" {
28
+ #endif
29
+
30
+ #if defined(__clang__)
31
+ #pragma clang attribute push(__attribute__((target("arch=armv8-a+simd"))), apply_to = function)
32
+ #elif defined(__GNUC__)
33
+ #pragma GCC push_options
34
+ #pragma GCC target("arch=armv8-a+simd")
35
+ #endif
36
+
37
+ /* NEON implementations using 2-wide f64 and 4-wide f32 SIMD.
38
+ * These require NEON trigonometric kernels from trigonometry/neon.h.
39
+ */
40
+
41
+ NK_INTERNAL float64x2_t nk_haversine_f64x2_neon_( //
42
+ float64x2_t first_latitudes, float64x2_t first_longitudes, //
43
+ float64x2_t second_latitudes, float64x2_t second_longitudes) {
44
+
45
+ float64x2_t const earth_radius = vdupq_n_f64(NK_EARTH_MEDIATORIAL_RADIUS);
46
+ float64x2_t const half = vdupq_n_f64(0.5);
47
+ float64x2_t const one = vdupq_n_f64(1.0);
48
+ float64x2_t const two = vdupq_n_f64(2.0);
49
+
50
+ float64x2_t latitude_delta = vsubq_f64(second_latitudes, first_latitudes);
51
+ float64x2_t longitude_delta = vsubq_f64(second_longitudes, first_longitudes);
52
+
53
+ // Haversine terms: sin²(Δ/2)
54
+ float64x2_t latitude_delta_half = vmulq_f64(latitude_delta, half);
55
+ float64x2_t longitude_delta_half = vmulq_f64(longitude_delta, half);
56
+ float64x2_t sin_latitude_delta_half = nk_sin_f64x2_neon_(latitude_delta_half);
57
+ float64x2_t sin_longitude_delta_half = nk_sin_f64x2_neon_(longitude_delta_half);
58
+ float64x2_t sin_squared_latitude_delta_half = vmulq_f64(sin_latitude_delta_half, sin_latitude_delta_half);
59
+ float64x2_t sin_squared_longitude_delta_half = vmulq_f64(sin_longitude_delta_half, sin_longitude_delta_half);
60
+
61
+ // Latitude cosine product
62
+ float64x2_t cos_first_latitude = nk_cos_f64x2_neon_(first_latitudes);
63
+ float64x2_t cos_second_latitude = nk_cos_f64x2_neon_(second_latitudes);
64
+ float64x2_t cos_latitude_product = vmulq_f64(cos_first_latitude, cos_second_latitude);
65
+
66
+ // a = sin²(Δlat/2) + cos(lat1) × cos(lat2) × sin²(Δlon/2)
67
+ float64x2_t haversine_term = vaddq_f64(sin_squared_latitude_delta_half,
68
+ vmulq_f64(cos_latitude_product, sin_squared_longitude_delta_half));
69
+ // Clamp haversine_term to [0, 1] to prevent NaN from sqrt of negative values
70
+ float64x2_t zero = vdupq_n_f64(0.0);
71
+ haversine_term = vmaxq_f64(zero, vminq_f64(one, haversine_term));
72
+
73
+ // Central angle: c = 2 × atan2(√a, √(1-a))
74
+ float64x2_t sqrt_haversine = vsqrtq_f64(haversine_term);
75
+ float64x2_t sqrt_complement = vsqrtq_f64(vsubq_f64(one, haversine_term));
76
+ float64x2_t central_angle = vmulq_f64(two, nk_atan2_f64x2_neon_(sqrt_haversine, sqrt_complement));
77
+
78
+ return vmulq_f64(earth_radius, central_angle);
79
+ }
80
+
81
+ NK_PUBLIC void nk_haversine_f64_neon( //
82
+ nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
83
+ nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
84
+ nk_size_t n, nk_f64_t *results) {
85
+
86
+ while (n >= 2) {
87
+ float64x2_t first_latitudes = vld1q_f64(a_lats);
88
+ float64x2_t first_longitudes = vld1q_f64(a_lons);
89
+ float64x2_t second_latitudes = vld1q_f64(b_lats);
90
+ float64x2_t second_longitudes = vld1q_f64(b_lons);
91
+
92
+ float64x2_t distances = nk_haversine_f64x2_neon_(first_latitudes, first_longitudes, second_latitudes,
93
+ second_longitudes);
94
+ vst1q_f64(results, distances);
95
+
96
+ a_lats += 2, a_lons += 2, b_lats += 2, b_lons += 2, results += 2, n -= 2;
97
+ }
98
+
99
+ // Handle tail with partial loads (n can only be 0 or 1 here)
100
+ if (n > 0) {
101
+ nk_b128_vec_t a_lat_vec, a_lon_vec, b_lat_vec, b_lon_vec, result_vec;
102
+ nk_partial_load_b64x2_serial_(a_lats, &a_lat_vec, n);
103
+ nk_partial_load_b64x2_serial_(a_lons, &a_lon_vec, n);
104
+ nk_partial_load_b64x2_serial_(b_lats, &b_lat_vec, n);
105
+ nk_partial_load_b64x2_serial_(b_lons, &b_lon_vec, n);
106
+ float64x2_t distances = nk_haversine_f64x2_neon_(a_lat_vec.f64x2, a_lon_vec.f64x2, b_lat_vec.f64x2,
107
+ b_lon_vec.f64x2);
108
+ result_vec.f64x2 = distances;
109
+ nk_partial_store_b64x2_serial_(&result_vec, results, n);
110
+ }
111
+ }
112
+
113
+ NK_INTERNAL float32x4_t nk_haversine_f32x4_neon_( //
114
+ float32x4_t first_latitudes, float32x4_t first_longitudes, //
115
+ float32x4_t second_latitudes, float32x4_t second_longitudes) {
116
+
117
+ float32x4_t const earth_radius = vdupq_n_f32((float)NK_EARTH_MEDIATORIAL_RADIUS);
118
+ float32x4_t const half = vdupq_n_f32(0.5f);
119
+ float32x4_t const one = vdupq_n_f32(1.0f);
120
+ float32x4_t const two = vdupq_n_f32(2.0f);
121
+
122
+ float32x4_t latitude_delta = vsubq_f32(second_latitudes, first_latitudes);
123
+ float32x4_t longitude_delta = vsubq_f32(second_longitudes, first_longitudes);
124
+
125
+ // Haversine terms: sin²(Δ/2)
126
+ float32x4_t latitude_delta_half = vmulq_f32(latitude_delta, half);
127
+ float32x4_t longitude_delta_half = vmulq_f32(longitude_delta, half);
128
+ float32x4_t sin_latitude_delta_half = nk_sin_f32x4_neon_(latitude_delta_half);
129
+ float32x4_t sin_longitude_delta_half = nk_sin_f32x4_neon_(longitude_delta_half);
130
+ float32x4_t sin_squared_latitude_delta_half = vmulq_f32(sin_latitude_delta_half, sin_latitude_delta_half);
131
+ float32x4_t sin_squared_longitude_delta_half = vmulq_f32(sin_longitude_delta_half, sin_longitude_delta_half);
132
+
133
+ // Latitude cosine product
134
+ float32x4_t cos_first_latitude = nk_cos_f32x4_neon_(first_latitudes);
135
+ float32x4_t cos_second_latitude = nk_cos_f32x4_neon_(second_latitudes);
136
+ float32x4_t cos_latitude_product = vmulq_f32(cos_first_latitude, cos_second_latitude);
137
+
138
+ // a = sin²(Δlat/2) + cos(lat1) × cos(lat2) × sin²(Δlon/2)
139
+ float32x4_t haversine_term = vaddq_f32(sin_squared_latitude_delta_half,
140
+ vmulq_f32(cos_latitude_product, sin_squared_longitude_delta_half));
141
+
142
+ // Clamp to [0, 1] to avoid NaN from sqrt of negative numbers (due to floating point errors)
143
+ float32x4_t zero = vdupq_n_f32(0.0f);
144
+ haversine_term = vmaxq_f32(zero, vminq_f32(one, haversine_term));
145
+
146
+ // Central angle: c = 2 × atan2(√a, √(1-a))
147
+ float32x4_t sqrt_haversine = vsqrtq_f32(haversine_term);
148
+ float32x4_t sqrt_complement = vsqrtq_f32(vsubq_f32(one, haversine_term));
149
+ float32x4_t central_angle = vmulq_f32(two, nk_atan2_f32x4_neon_(sqrt_haversine, sqrt_complement));
150
+
151
+ return vmulq_f32(earth_radius, central_angle);
152
+ }
153
+
154
+ NK_PUBLIC void nk_haversine_f32_neon( //
155
+ nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
156
+ nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
157
+ nk_size_t n, nk_f32_t *results) {
158
+
159
+ while (n >= 4) {
160
+ float32x4_t first_latitudes = vld1q_f32(a_lats);
161
+ float32x4_t first_longitudes = vld1q_f32(a_lons);
162
+ float32x4_t second_latitudes = vld1q_f32(b_lats);
163
+ float32x4_t second_longitudes = vld1q_f32(b_lons);
164
+
165
+ float32x4_t distances = nk_haversine_f32x4_neon_(first_latitudes, first_longitudes, second_latitudes,
166
+ second_longitudes);
167
+ vst1q_f32(results, distances);
168
+
169
+ a_lats += 4, a_lons += 4, b_lats += 4, b_lons += 4, results += 4, n -= 4;
170
+ }
171
+
172
+ // Handle tail with partial loads (n can be 0-3 here)
173
+ if (n > 0) {
174
+ nk_b128_vec_t a_lat_vec, a_lon_vec, b_lat_vec, b_lon_vec, result_vec;
175
+ nk_partial_load_b32x4_serial_(a_lats, &a_lat_vec, n);
176
+ nk_partial_load_b32x4_serial_(a_lons, &a_lon_vec, n);
177
+ nk_partial_load_b32x4_serial_(b_lats, &b_lat_vec, n);
178
+ nk_partial_load_b32x4_serial_(b_lons, &b_lon_vec, n);
179
+ float32x4_t distances = nk_haversine_f32x4_neon_(a_lat_vec.f32x4, a_lon_vec.f32x4, b_lat_vec.f32x4,
180
+ b_lon_vec.f32x4);
181
+ result_vec.f32x4 = distances;
182
+ nk_partial_store_b32x4_serial_(&result_vec, results, n);
183
+ }
184
+ }
185
+
186
+ /**
187
+ * @brief NEON helper for Vincenty's geodesic distance on 2 f64 point pairs.
188
+ * @note This is a true SIMD implementation using masked convergence tracking via blending.
189
+ */
190
+ NK_INTERNAL float64x2_t nk_vincenty_f64x2_neon_( //
191
+ float64x2_t first_latitudes, float64x2_t first_longitudes, //
192
+ float64x2_t second_latitudes, float64x2_t second_longitudes) {
193
+
194
+ float64x2_t const equatorial_radius = vdupq_n_f64(NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
195
+ float64x2_t const polar_radius = vdupq_n_f64(NK_EARTH_ELLIPSOID_POLAR_RADIUS);
196
+ float64x2_t const flattening = vdupq_n_f64(1.0 / NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
197
+ float64x2_t const convergence_threshold = vdupq_n_f64(NK_VINCENTY_CONVERGENCE_THRESHOLD_F64);
198
+ float64x2_t const one = vdupq_n_f64(1.0);
199
+ float64x2_t const two = vdupq_n_f64(2.0);
200
+ float64x2_t const three = vdupq_n_f64(3.0);
201
+ float64x2_t const four = vdupq_n_f64(4.0);
202
+ float64x2_t const six = vdupq_n_f64(6.0);
203
+ float64x2_t const sixteen = vdupq_n_f64(16.0);
204
+ float64x2_t const epsilon = vdupq_n_f64(1e-15);
205
+
206
+ // Longitude difference
207
+ float64x2_t longitude_difference = vsubq_f64(second_longitudes, first_longitudes);
208
+
209
+ // Reduced latitudes: tan(U) = (1-f) * tan(lat)
210
+ float64x2_t one_minus_f = vsubq_f64(one, flattening);
211
+ float64x2_t tan_first = vdivq_f64(nk_sin_f64x2_neon_(first_latitudes), nk_cos_f64x2_neon_(first_latitudes));
212
+ float64x2_t tan_second = vdivq_f64(nk_sin_f64x2_neon_(second_latitudes), nk_cos_f64x2_neon_(second_latitudes));
213
+ float64x2_t tan_reduced_first = vmulq_f64(one_minus_f, tan_first);
214
+ float64x2_t tan_reduced_second = vmulq_f64(one_minus_f, tan_second);
215
+
216
+ // cos(U) = 1/√(1 + tan²(U)), sin(U) = tan(U) × cos(U)
217
+ float64x2_t cos_reduced_first = vdivq_f64(one, vsqrtq_f64(vfmaq_f64(one, tan_reduced_first, tan_reduced_first)));
218
+ float64x2_t sin_reduced_first = vmulq_f64(tan_reduced_first, cos_reduced_first);
219
+ float64x2_t cos_reduced_second = vdivq_f64(one, vsqrtq_f64(vfmaq_f64(one, tan_reduced_second, tan_reduced_second)));
220
+ float64x2_t sin_reduced_second = vmulq_f64(tan_reduced_second, cos_reduced_second);
221
+
222
+ // Initialize lambda and tracking variables
223
+ float64x2_t lambda = longitude_difference;
224
+ float64x2_t sin_angular_distance, cos_angular_distance, angular_distance;
225
+ float64x2_t sin_azimuth, cos_squared_azimuth, cos_double_angular_midpoint;
226
+
227
+ // Track convergence and coincident points using masks
228
+ uint64x2_t converged_mask = vdupq_n_u64(0);
229
+ uint64x2_t coincident_mask = vdupq_n_u64(0);
230
+
231
+ for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS; ++iteration) {
232
+ // Check if all lanes converged
233
+ uint64_t converged_bits = vgetq_lane_u64(converged_mask, 0) & vgetq_lane_u64(converged_mask, 1);
234
+ if (converged_bits) break;
235
+
236
+ float64x2_t sin_lambda = nk_sin_f64x2_neon_(lambda);
237
+ float64x2_t cos_lambda = nk_cos_f64x2_neon_(lambda);
238
+
239
+ // sin²(angular_distance) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
240
+ float64x2_t cross_term = vmulq_f64(cos_reduced_second, sin_lambda);
241
+ float64x2_t mixed_term = vsubq_f64(vmulq_f64(cos_reduced_first, sin_reduced_second),
242
+ vmulq_f64(vmulq_f64(sin_reduced_first, cos_reduced_second), cos_lambda));
243
+ float64x2_t sin_angular_dist_sq = vfmaq_f64(vmulq_f64(mixed_term, mixed_term), cross_term, cross_term);
244
+ sin_angular_distance = vsqrtq_f64(sin_angular_dist_sq);
245
+
246
+ // Check for coincident points (sin_angular_distance ≈ 0)
247
+ coincident_mask = vcltq_f64(sin_angular_distance, epsilon);
248
+
249
+ // cos(angular_distance) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
250
+ cos_angular_distance = vfmaq_f64(vmulq_f64(sin_reduced_first, sin_reduced_second),
251
+ vmulq_f64(cos_reduced_first, cos_reduced_second), cos_lambda);
252
+
253
+ // angular_distance = atan2(sin, cos)
254
+ angular_distance = nk_atan2_f64x2_neon_(sin_angular_distance, cos_angular_distance);
255
+
256
+ // sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance)
257
+ // Avoid division by zero by using blending
258
+ float64x2_t safe_sin_angular = vbslq_f64(coincident_mask, one, sin_angular_distance);
259
+ sin_azimuth = vdivq_f64(vmulq_f64(vmulq_f64(cos_reduced_first, cos_reduced_second), sin_lambda),
260
+ safe_sin_angular);
261
+ cos_squared_azimuth = vsubq_f64(one, vmulq_f64(sin_azimuth, sin_azimuth));
262
+
263
+ // Handle equatorial case: cos²α ≈ 0
264
+ uint64x2_t equatorial_mask = vcltq_f64(cos_squared_azimuth, epsilon);
265
+ float64x2_t safe_cos_sq_azimuth = vbslq_f64(equatorial_mask, one, cos_squared_azimuth);
266
+
267
+ // cos(2σₘ) = cos(σ) - 2 × sin(U₁) × sin(U₂) / cos²(α)
268
+ float64x2_t sin_product = vmulq_f64(sin_reduced_first, sin_reduced_second);
269
+ cos_double_angular_midpoint = vsubq_f64(cos_angular_distance,
270
+ vdivq_f64(vmulq_f64(two, sin_product), safe_cos_sq_azimuth));
271
+ cos_double_angular_midpoint = vbslq_f64(equatorial_mask, vdupq_n_f64(0.0), cos_double_angular_midpoint);
272
+
273
+ // C = f/16 * cos²α * (4 + f*(4 - 3*cos²α))
274
+ float64x2_t correction_factor = vmulq_f64(
275
+ vdivq_f64(flattening, sixteen),
276
+ vmulq_f64(cos_squared_azimuth, vfmaq_f64(four, flattening, vfmsq_f64(four, three, cos_squared_azimuth))));
277
+
278
+ // λ' = L + (1-C) × f × sin(α) × (σ + C × sin(σ) × (cos(2σₘ) + C × cos(σ) × (-1 + 2 × cos²(2σₘ))))
279
+ float64x2_t cos_2sm_sq = vmulq_f64(cos_double_angular_midpoint, cos_double_angular_midpoint);
280
+ // innermost = -1 + 2 × cos²(2σₘ)
281
+ float64x2_t innermost = vfmaq_f64(vdupq_n_f64(-1.0), two, cos_2sm_sq);
282
+ // middle = cos(2σₘ) + C × cos(σ) × innermost
283
+ float64x2_t middle = vfmaq_f64(cos_double_angular_midpoint, vmulq_f64(correction_factor, cos_angular_distance),
284
+ innermost);
285
+ // inner = C × sin(σ) × middle
286
+ float64x2_t inner = vmulq_f64(vmulq_f64(correction_factor, sin_angular_distance), middle);
287
+
288
+ // λ' = L + (1-C) * f * sin_α * (σ + inner)
289
+ float64x2_t lambda_new = vfmaq_f64(
290
+ longitude_difference, vmulq_f64(vmulq_f64(vsubq_f64(one, correction_factor), flattening), sin_azimuth),
291
+ vaddq_f64(angular_distance, inner));
292
+
293
+ // Check convergence: |λ - λ'| < threshold
294
+ float64x2_t lambda_diff = vsubq_f64(lambda_new, lambda);
295
+ float64x2_t lambda_diff_abs = vabsq_f64(lambda_diff);
296
+ uint64x2_t newly_converged = vcltq_f64(lambda_diff_abs, convergence_threshold);
297
+ converged_mask = vorrq_u64(converged_mask, newly_converged);
298
+
299
+ // Only update lambda for non-converged lanes
300
+ lambda = vbslq_f64(converged_mask, lambda, lambda_new);
301
+ }
302
+
303
+ // Final distance calculation
304
+ // u² = cos²α * (a² - b²) / b²
305
+ float64x2_t a_sq = vmulq_f64(equatorial_radius, equatorial_radius);
306
+ float64x2_t b_sq = vmulq_f64(polar_radius, polar_radius);
307
+ float64x2_t u_squared = vdivq_f64(vmulq_f64(cos_squared_azimuth, vsubq_f64(a_sq, b_sq)), b_sq);
308
+
309
+ // A = 1 + u²/16384 * (4096 + u²*(-768 + u²*(320 - 175*u²)))
310
+ float64x2_t series_a = vfmaq_f64(vdupq_n_f64(320.0), u_squared, vdupq_n_f64(-175.0));
311
+ series_a = vfmaq_f64(vdupq_n_f64(-768.0), u_squared, series_a);
312
+ series_a = vfmaq_f64(vdupq_n_f64(4096.0), u_squared, series_a);
313
+ series_a = vfmaq_f64(one, vdivq_f64(u_squared, vdupq_n_f64(16384.0)), series_a);
314
+
315
+ // B = u²/1024 * (256 + u²*(-128 + u²*(74 - 47*u²)))
316
+ float64x2_t series_b = vfmaq_f64(vdupq_n_f64(74.0), u_squared, vdupq_n_f64(-47.0));
317
+ series_b = vfmaq_f64(vdupq_n_f64(-128.0), u_squared, series_b);
318
+ series_b = vfmaq_f64(vdupq_n_f64(256.0), u_squared, series_b);
319
+ series_b = vmulq_f64(vdivq_f64(u_squared, vdupq_n_f64(1024.0)), series_b);
320
+
321
+ // Δσ = B × sin(σ) × (cos(2σₘ) + B/4 × (cos(σ) × (-1 + 2 × cos²(2σₘ)) - B/6 × cos(2σₘ) × (-3 + 4 × sin²(σ)) × (-3 +
322
+ // 4 × cos²(2σₘ))))
323
+ float64x2_t cos_2sm_sq = vmulq_f64(cos_double_angular_midpoint, cos_double_angular_midpoint);
324
+ float64x2_t sin_sq = vmulq_f64(sin_angular_distance, sin_angular_distance);
325
+ float64x2_t term1 = vfmaq_f64(vdupq_n_f64(-1.0), two, cos_2sm_sq);
326
+ term1 = vmulq_f64(cos_angular_distance, term1);
327
+ float64x2_t term2 = vfmaq_f64(vdupq_n_f64(-3.0), four, sin_sq);
328
+ float64x2_t term3 = vfmaq_f64(vdupq_n_f64(-3.0), four, cos_2sm_sq);
329
+ term2 = vmulq_f64(vmulq_f64(vdivq_f64(series_b, six), cos_double_angular_midpoint), vmulq_f64(term2, term3));
330
+ float64x2_t delta_sigma = vmulq_f64(
331
+ series_b,
332
+ vmulq_f64(sin_angular_distance, vaddq_f64(cos_double_angular_midpoint,
333
+ vmulq_f64(vdivq_f64(series_b, four), vsubq_f64(term1, term2)))));
334
+
335
+ // s = b * A * (σ - Δσ)
336
+ float64x2_t distances = vmulq_f64(vmulq_f64(polar_radius, series_a), vsubq_f64(angular_distance, delta_sigma));
337
+
338
+ // Set coincident points to zero
339
+ distances = vbslq_f64(coincident_mask, vdupq_n_f64(0.0), distances);
340
+
341
+ return distances;
342
+ }
343
+
344
+ NK_PUBLIC void nk_vincenty_f64_neon( //
345
+ nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
346
+ nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
347
+ nk_size_t n, nk_f64_t *results) {
348
+
349
+ while (n >= 2) {
350
+ float64x2_t first_latitudes = vld1q_f64(a_lats);
351
+ float64x2_t first_longitudes = vld1q_f64(a_lons);
352
+ float64x2_t second_latitudes = vld1q_f64(b_lats);
353
+ float64x2_t second_longitudes = vld1q_f64(b_lons);
354
+
355
+ float64x2_t distances = nk_vincenty_f64x2_neon_(first_latitudes, first_longitudes, second_latitudes,
356
+ second_longitudes);
357
+ vst1q_f64(results, distances);
358
+
359
+ a_lats += 2, a_lons += 2, b_lats += 2, b_lons += 2, results += 2, n -= 2;
360
+ }
361
+
362
+ // Handle remaining elements with partial loads (n can only be 0 or 1 here)
363
+ if (n > 0) {
364
+ nk_b128_vec_t a_lat_vec, a_lon_vec, b_lat_vec, b_lon_vec, result_vec;
365
+ nk_partial_load_b64x2_serial_(a_lats, &a_lat_vec, n);
366
+ nk_partial_load_b64x2_serial_(a_lons, &a_lon_vec, n);
367
+ nk_partial_load_b64x2_serial_(b_lats, &b_lat_vec, n);
368
+ nk_partial_load_b64x2_serial_(b_lons, &b_lon_vec, n);
369
+ float64x2_t distances = nk_vincenty_f64x2_neon_(a_lat_vec.f64x2, a_lon_vec.f64x2, b_lat_vec.f64x2,
370
+ b_lon_vec.f64x2);
371
+ result_vec.f64x2 = distances;
372
+ nk_partial_store_b64x2_serial_(&result_vec, results, n);
373
+ }
374
+ }
375
+
376
+ /**
377
+ * @brief NEON helper for Vincenty's geodesic distance on 4 f32 point pairs.
378
+ * @note This is a true SIMD implementation using masked convergence tracking via blending.
379
+ */
380
+ NK_INTERNAL float32x4_t nk_vincenty_f32x4_neon_( //
381
+ float32x4_t first_latitudes, float32x4_t first_longitudes, //
382
+ float32x4_t second_latitudes, float32x4_t second_longitudes) {
383
+
384
+ float32x4_t const equatorial_radius = vdupq_n_f32((float)NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
385
+ float32x4_t const polar_radius = vdupq_n_f32((float)NK_EARTH_ELLIPSOID_POLAR_RADIUS);
386
+ float32x4_t const flattening = vdupq_n_f32(1.0f / (float)NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
387
+ float32x4_t const convergence_threshold = vdupq_n_f32(NK_VINCENTY_CONVERGENCE_THRESHOLD_F32);
388
+ float32x4_t const one = vdupq_n_f32(1.0f);
389
+ float32x4_t const two = vdupq_n_f32(2.0f);
390
+ float32x4_t const three = vdupq_n_f32(3.0f);
391
+ float32x4_t const four = vdupq_n_f32(4.0f);
392
+ float32x4_t const six = vdupq_n_f32(6.0f);
393
+ float32x4_t const sixteen = vdupq_n_f32(16.0f);
394
+ float32x4_t const epsilon = vdupq_n_f32(1e-7f);
395
+
396
+ // Longitude difference
397
+ float32x4_t longitude_difference = vsubq_f32(second_longitudes, first_longitudes);
398
+
399
+ // Reduced latitudes: tan(U) = (1-f) * tan(lat)
400
+ float32x4_t one_minus_f = vsubq_f32(one, flattening);
401
+ float32x4_t tan_first = vdivq_f32(nk_sin_f32x4_neon_(first_latitudes), nk_cos_f32x4_neon_(first_latitudes));
402
+ float32x4_t tan_second = vdivq_f32(nk_sin_f32x4_neon_(second_latitudes), nk_cos_f32x4_neon_(second_latitudes));
403
+ float32x4_t tan_reduced_first = vmulq_f32(one_minus_f, tan_first);
404
+ float32x4_t tan_reduced_second = vmulq_f32(one_minus_f, tan_second);
405
+
406
+ // cos(U) = 1/√(1 + tan²(U)), sin(U) = tan(U) × cos(U)
407
+ float32x4_t cos_reduced_first = vdivq_f32(one, vsqrtq_f32(vfmaq_f32(one, tan_reduced_first, tan_reduced_first)));
408
+ float32x4_t sin_reduced_first = vmulq_f32(tan_reduced_first, cos_reduced_first);
409
+ float32x4_t cos_reduced_second = vdivq_f32(one, vsqrtq_f32(vfmaq_f32(one, tan_reduced_second, tan_reduced_second)));
410
+ float32x4_t sin_reduced_second = vmulq_f32(tan_reduced_second, cos_reduced_second);
411
+
412
+ // Initialize lambda and tracking variables
413
+ float32x4_t lambda = longitude_difference;
414
+ float32x4_t sin_angular_distance, cos_angular_distance, angular_distance;
415
+ float32x4_t sin_azimuth, cos_squared_azimuth, cos_double_angular_midpoint;
416
+
417
+ // Track convergence and coincident points using masks
418
+ uint32x4_t converged_mask = vdupq_n_u32(0);
419
+ uint32x4_t coincident_mask = vdupq_n_u32(0);
420
+
421
+ for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS; ++iteration) {
422
+ // Check if all lanes converged (all bits set = 0xFFFFFFFF per lane)
423
+ uint32_t converged_bits = vminvq_u32(converged_mask);
424
+ if (converged_bits == 0xFFFFFFFF) break;
425
+
426
+ float32x4_t sin_lambda = nk_sin_f32x4_neon_(lambda);
427
+ float32x4_t cos_lambda = nk_cos_f32x4_neon_(lambda);
428
+
429
+ // sin²(angular_distance) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
430
+ float32x4_t cross_term = vmulq_f32(cos_reduced_second, sin_lambda);
431
+ float32x4_t mixed_term = vsubq_f32(vmulq_f32(cos_reduced_first, sin_reduced_second),
432
+ vmulq_f32(vmulq_f32(sin_reduced_first, cos_reduced_second), cos_lambda));
433
+ float32x4_t sin_angular_dist_sq = vfmaq_f32(vmulq_f32(mixed_term, mixed_term), cross_term, cross_term);
434
+ sin_angular_distance = vsqrtq_f32(sin_angular_dist_sq);
435
+
436
+ // Check for coincident points (sin_angular_distance ≈ 0)
437
+ coincident_mask = vcltq_f32(sin_angular_distance, epsilon);
438
+
439
+ // cos(angular_distance) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
440
+ cos_angular_distance = vfmaq_f32(vmulq_f32(sin_reduced_first, sin_reduced_second),
441
+ vmulq_f32(cos_reduced_first, cos_reduced_second), cos_lambda);
442
+
443
+ // angular_distance = atan2(sin, cos)
444
+ angular_distance = nk_atan2_f32x4_neon_(sin_angular_distance, cos_angular_distance);
445
+
446
+ // sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance)
447
+ float32x4_t safe_sin_angular = vbslq_f32(coincident_mask, one, sin_angular_distance);
448
+ sin_azimuth = vdivq_f32(vmulq_f32(vmulq_f32(cos_reduced_first, cos_reduced_second), sin_lambda),
449
+ safe_sin_angular);
450
+ cos_squared_azimuth = vsubq_f32(one, vmulq_f32(sin_azimuth, sin_azimuth));
451
+
452
+ // Handle equatorial case: cos²α ≈ 0
453
+ uint32x4_t equatorial_mask = vcltq_f32(cos_squared_azimuth, epsilon);
454
+ float32x4_t safe_cos_sq_azimuth = vbslq_f32(equatorial_mask, one, cos_squared_azimuth);
455
+
456
+ // cos(2σₘ) = cos(σ) - 2 × sin(U₁) × sin(U₂) / cos²(α)
457
+ float32x4_t sin_product = vmulq_f32(sin_reduced_first, sin_reduced_second);
458
+ cos_double_angular_midpoint = vsubq_f32(cos_angular_distance,
459
+ vdivq_f32(vmulq_f32(two, sin_product), safe_cos_sq_azimuth));
460
+ cos_double_angular_midpoint = vbslq_f32(equatorial_mask, vdupq_n_f32(0.0f), cos_double_angular_midpoint);
461
+
462
+ // C = f/16 * cos²α * (4 + f*(4 - 3*cos²α))
463
+ float32x4_t correction_factor = vmulq_f32(
464
+ vdivq_f32(flattening, sixteen),
465
+ vmulq_f32(cos_squared_azimuth, vfmaq_f32(four, flattening, vfmsq_f32(four, three, cos_squared_azimuth))));
466
+
467
+ // λ' = L + (1-C) × f × sin(α) × (σ + C × sin(σ) × (cos(2σₘ) + C × cos(σ) × (-1 + 2 × cos²(2σₘ))))
468
+ float32x4_t cos_2sm_sq = vmulq_f32(cos_double_angular_midpoint, cos_double_angular_midpoint);
469
+ float32x4_t innermost = vfmaq_f32(vdupq_n_f32(-1.0f), two, cos_2sm_sq);
470
+ float32x4_t middle = vfmaq_f32(cos_double_angular_midpoint, vmulq_f32(correction_factor, cos_angular_distance),
471
+ innermost);
472
+ float32x4_t inner = vmulq_f32(vmulq_f32(correction_factor, sin_angular_distance), middle);
473
+
474
+ float32x4_t lambda_new = vfmaq_f32(
475
+ longitude_difference, vmulq_f32(vmulq_f32(vsubq_f32(one, correction_factor), flattening), sin_azimuth),
476
+ vaddq_f32(angular_distance, inner));
477
+
478
+ // Check convergence: |λ - λ'| < threshold
479
+ float32x4_t lambda_diff = vsubq_f32(lambda_new, lambda);
480
+ float32x4_t lambda_diff_abs = vabsq_f32(lambda_diff);
481
+ uint32x4_t newly_converged = vcltq_f32(lambda_diff_abs, convergence_threshold);
482
+ converged_mask = vorrq_u32(converged_mask, newly_converged);
483
+
484
+ // Only update lambda for non-converged lanes
485
+ lambda = vbslq_f32(converged_mask, lambda, lambda_new);
486
+ }
487
+
488
+ // Final distance calculation
489
+ float32x4_t a_sq = vmulq_f32(equatorial_radius, equatorial_radius);
490
+ float32x4_t b_sq = vmulq_f32(polar_radius, polar_radius);
491
+ float32x4_t u_squared = vdivq_f32(vmulq_f32(cos_squared_azimuth, vsubq_f32(a_sq, b_sq)), b_sq);
492
+
493
+ // A = 1 + u²/16384 * (4096 + u²*(-768 + u²*(320 - 175*u²)))
494
+ float32x4_t series_a = vfmaq_f32(vdupq_n_f32(320.0f), u_squared, vdupq_n_f32(-175.0f));
495
+ series_a = vfmaq_f32(vdupq_n_f32(-768.0f), u_squared, series_a);
496
+ series_a = vfmaq_f32(vdupq_n_f32(4096.0f), u_squared, series_a);
497
+ series_a = vfmaq_f32(one, vdivq_f32(u_squared, vdupq_n_f32(16384.0f)), series_a);
498
+
499
+ // B = u²/1024 * (256 + u²*(-128 + u²*(74 - 47*u²)))
500
+ float32x4_t series_b = vfmaq_f32(vdupq_n_f32(74.0f), u_squared, vdupq_n_f32(-47.0f));
501
+ series_b = vfmaq_f32(vdupq_n_f32(-128.0f), u_squared, series_b);
502
+ series_b = vfmaq_f32(vdupq_n_f32(256.0f), u_squared, series_b);
503
+ series_b = vmulq_f32(vdivq_f32(u_squared, vdupq_n_f32(1024.0f)), series_b);
504
+
505
+ // Δσ calculation
506
+ float32x4_t cos_2sm_sq = vmulq_f32(cos_double_angular_midpoint, cos_double_angular_midpoint);
507
+ float32x4_t sin_sq = vmulq_f32(sin_angular_distance, sin_angular_distance);
508
+ float32x4_t term1 = vfmaq_f32(vdupq_n_f32(-1.0f), two, cos_2sm_sq);
509
+ term1 = vmulq_f32(cos_angular_distance, term1);
510
+ float32x4_t term2 = vfmaq_f32(vdupq_n_f32(-3.0f), four, sin_sq);
511
+ float32x4_t term3 = vfmaq_f32(vdupq_n_f32(-3.0f), four, cos_2sm_sq);
512
+ term2 = vmulq_f32(vmulq_f32(vdivq_f32(series_b, six), cos_double_angular_midpoint), vmulq_f32(term2, term3));
513
+ float32x4_t delta_sigma = vmulq_f32(
514
+ series_b,
515
+ vmulq_f32(sin_angular_distance, vaddq_f32(cos_double_angular_midpoint,
516
+ vmulq_f32(vdivq_f32(series_b, four), vsubq_f32(term1, term2)))));
517
+
518
+ // s = b * A * (σ - Δσ)
519
+ float32x4_t distances = vmulq_f32(vmulq_f32(polar_radius, series_a), vsubq_f32(angular_distance, delta_sigma));
520
+
521
+ // Set coincident points to zero
522
+ distances = vbslq_f32(coincident_mask, vdupq_n_f32(0.0f), distances);
523
+
524
+ return distances;
525
+ }
526
+
527
+ NK_PUBLIC void nk_vincenty_f32_neon( //
528
+ nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
529
+ nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
530
+ nk_size_t n, nk_f32_t *results) {
531
+
532
+ while (n >= 4) {
533
+ float32x4_t first_latitudes = vld1q_f32(a_lats);
534
+ float32x4_t first_longitudes = vld1q_f32(a_lons);
535
+ float32x4_t second_latitudes = vld1q_f32(b_lats);
536
+ float32x4_t second_longitudes = vld1q_f32(b_lons);
537
+
538
+ float32x4_t distances = nk_vincenty_f32x4_neon_(first_latitudes, first_longitudes, second_latitudes,
539
+ second_longitudes);
540
+ vst1q_f32(results, distances);
541
+
542
+ a_lats += 4, a_lons += 4, b_lats += 4, b_lons += 4, results += 4, n -= 4;
543
+ }
544
+
545
+ // Handle remaining elements with partial loads (n can be 1-3 here)
546
+ if (n > 0) {
547
+ nk_b128_vec_t a_lat_vec, a_lon_vec, b_lat_vec, b_lon_vec, result_vec;
548
+ nk_partial_load_b32x4_serial_(a_lats, &a_lat_vec, n);
549
+ nk_partial_load_b32x4_serial_(a_lons, &a_lon_vec, n);
550
+ nk_partial_load_b32x4_serial_(b_lats, &b_lat_vec, n);
551
+ nk_partial_load_b32x4_serial_(b_lons, &b_lon_vec, n);
552
+ float32x4_t distances = nk_vincenty_f32x4_neon_(a_lat_vec.f32x4, a_lon_vec.f32x4, b_lat_vec.f32x4,
553
+ b_lon_vec.f32x4);
554
+ result_vec.f32x4 = distances;
555
+ nk_partial_store_b32x4_serial_(&result_vec, results, n);
556
+ }
557
+ }
558
+
559
+ #if defined(__clang__)
560
+ #pragma clang attribute pop
561
+ #elif defined(__GNUC__)
562
+ #pragma GCC pop_options
563
+ #endif
564
+
565
+ #if defined(__cplusplus)
566
+ } // extern "C"
567
+ #endif
568
+
569
+ #endif // NK_TARGET_NEON
570
+ #endif // NK_TARGET_ARM_
571
+ #endif // NK_GEOSPATIAL_NEON_H