numkong 7.0.0 → 7.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315)
  1. package/README.md +239 -122
  2. package/binding.gyp +25 -491
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -8,11 +8,11 @@
8
8
  *
9
9
  * @section geospatial_neon_instructions Key NEON Geospatial Instructions
10
10
  *
11
- * Intrinsic Instruction M1 Firestorm Graviton 3 Graviton 4
12
- * vfmaq_f32 FMLA.S (vec) 4c @ V0123 4c @ V0123 4c @ V0123
13
- * vfmaq_f64 FMLA.D (vec) 4c @ V0123 4c @ V0123 4c @ V0123
14
- * vsqrtq_f32 FSQRT.S (vec) 10c @ V02 10c @ V02 9c @ V02
15
- * vsqrtq_f64 FSQRT.D (vec) 13c @ V02 16c @ V02 16c @ V02
11
+ * Intrinsic Instruction M1 Firestorm Graviton 3 Graviton 4
12
+ * vfmaq_f32 FMLA.S (vec) 4cy @ V0123 4cy @ V0123 4cy @ V0123
13
+ * vfmaq_f64 FMLA.D (vec) 4cy @ V0123 4cy @ V0123 4cy @ V0123
14
+ * vsqrtq_f32 FSQRT.S (vec) 10cy @ V02 10cy @ V02 9cy @ V02
15
+ * vsqrtq_f64 FSQRT.D (vec) 13cy @ V02 16cy @ V02 16cy @ V02
16
16
  */
17
17
  #ifndef NK_GEOSPATIAL_NEON_H
18
18
  #define NK_GEOSPATIAL_NEON_H
@@ -38,44 +38,48 @@ extern "C" {
38
38
  * These require NEON trigonometric kernels from trigonometry/neon.h.
39
39
  */
40
40
 
41
- NK_INTERNAL float64x2_t nk_haversine_f64x2_neon_( //
42
- float64x2_t first_latitudes, float64x2_t first_longitudes, //
43
- float64x2_t second_latitudes, float64x2_t second_longitudes) {
41
+ NK_INTERNAL float64x2_t nk_haversine_f64x2_neon_( //
42
+ float64x2_t first_latitudes_f64x2, float64x2_t first_longitudes_f64x2, //
43
+ float64x2_t second_latitudes_f64x2, float64x2_t second_longitudes_f64x2) {
44
44
 
45
- float64x2_t const earth_radius = vdupq_n_f64(NK_EARTH_MEDIATORIAL_RADIUS);
46
- float64x2_t const half = vdupq_n_f64(0.5);
47
- float64x2_t const one = vdupq_n_f64(1.0);
48
- float64x2_t const two = vdupq_n_f64(2.0);
45
+ float64x2_t const earth_radius_f64x2 = vdupq_n_f64(NK_EARTH_MEDIATORIAL_RADIUS);
46
+ float64x2_t const half_f64x2 = vdupq_n_f64(0.5);
47
+ float64x2_t const one_f64x2 = vdupq_n_f64(1.0);
48
+ float64x2_t const two_f64x2 = vdupq_n_f64(2.0);
49
49
 
50
- float64x2_t latitude_delta = vsubq_f64(second_latitudes, first_latitudes);
51
- float64x2_t longitude_delta = vsubq_f64(second_longitudes, first_longitudes);
50
+ float64x2_t latitude_delta_f64x2 = vsubq_f64(second_latitudes_f64x2, first_latitudes_f64x2);
51
+ float64x2_t longitude_delta_f64x2 = vsubq_f64(second_longitudes_f64x2, first_longitudes_f64x2);
52
52
 
53
53
  // Haversine terms: sin²(Δ/2)
54
- float64x2_t latitude_delta_half = vmulq_f64(latitude_delta, half);
55
- float64x2_t longitude_delta_half = vmulq_f64(longitude_delta, half);
56
- float64x2_t sin_latitude_delta_half = nk_sin_f64x2_neon_(latitude_delta_half);
57
- float64x2_t sin_longitude_delta_half = nk_sin_f64x2_neon_(longitude_delta_half);
58
- float64x2_t sin_squared_latitude_delta_half = vmulq_f64(sin_latitude_delta_half, sin_latitude_delta_half);
59
- float64x2_t sin_squared_longitude_delta_half = vmulq_f64(sin_longitude_delta_half, sin_longitude_delta_half);
54
+ float64x2_t latitude_delta_half_f64x2 = vmulq_f64(latitude_delta_f64x2, half_f64x2);
55
+ float64x2_t longitude_delta_half_f64x2 = vmulq_f64(longitude_delta_f64x2, half_f64x2);
56
+ float64x2_t sin_latitude_delta_half_f64x2 = nk_sin_f64x2_neon_(latitude_delta_half_f64x2);
57
+ float64x2_t sin_longitude_delta_half_f64x2 = nk_sin_f64x2_neon_(longitude_delta_half_f64x2);
58
+ float64x2_t sin_squared_latitude_delta_half_f64x2 = vmulq_f64(sin_latitude_delta_half_f64x2,
59
+ sin_latitude_delta_half_f64x2);
60
+ float64x2_t sin_squared_longitude_delta_half_f64x2 = vmulq_f64(sin_longitude_delta_half_f64x2,
61
+ sin_longitude_delta_half_f64x2);
60
62
 
61
63
  // Latitude cosine product
62
- float64x2_t cos_first_latitude = nk_cos_f64x2_neon_(first_latitudes);
63
- float64x2_t cos_second_latitude = nk_cos_f64x2_neon_(second_latitudes);
64
- float64x2_t cos_latitude_product = vmulq_f64(cos_first_latitude, cos_second_latitude);
64
+ float64x2_t cos_first_latitude_f64x2 = nk_cos_f64x2_neon_(first_latitudes_f64x2);
65
+ float64x2_t cos_second_latitude_f64x2 = nk_cos_f64x2_neon_(second_latitudes_f64x2);
66
+ float64x2_t cos_latitude_product_f64x2 = vmulq_f64(cos_first_latitude_f64x2, cos_second_latitude_f64x2);
65
67
 
66
68
  // a = sin²(Δlat/2) + cos(lat1) × cos(lat2) × sin²(Δlon/2)
67
- float64x2_t haversine_term = vaddq_f64(sin_squared_latitude_delta_half,
68
- vmulq_f64(cos_latitude_product, sin_squared_longitude_delta_half));
69
- // Clamp haversine_term to [0, 1] to prevent NaN from sqrt of negative values
70
- float64x2_t zero = vdupq_n_f64(0.0);
71
- haversine_term = vmaxq_f64(zero, vminq_f64(one, haversine_term));
69
+ float64x2_t haversine_term_f64x2 = vaddq_f64(
70
+ sin_squared_latitude_delta_half_f64x2,
71
+ vmulq_f64(cos_latitude_product_f64x2, sin_squared_longitude_delta_half_f64x2));
72
+ // Clamp haversine_term_f64x2 to [0, 1] to prevent NaN from sqrt of negative values
73
+ float64x2_t zero_f64x2 = vdupq_n_f64(0.0);
74
+ haversine_term_f64x2 = vmaxq_f64(zero_f64x2, vminq_f64(one_f64x2, haversine_term_f64x2));
72
75
 
73
76
  // Central angle: c = 2 × atan2(√a, √(1-a))
74
- float64x2_t sqrt_haversine = vsqrtq_f64(haversine_term);
75
- float64x2_t sqrt_complement = vsqrtq_f64(vsubq_f64(one, haversine_term));
76
- float64x2_t central_angle = vmulq_f64(two, nk_atan2_f64x2_neon_(sqrt_haversine, sqrt_complement));
77
+ float64x2_t sqrt_haversine_f64x2 = vsqrtq_f64(haversine_term_f64x2);
78
+ float64x2_t sqrt_complement_f64x2 = vsqrtq_f64(vsubq_f64(one_f64x2, haversine_term_f64x2));
79
+ float64x2_t central_angle_f64x2 = vmulq_f64(two_f64x2,
80
+ nk_atan2_f64x2_neon_(sqrt_haversine_f64x2, sqrt_complement_f64x2));
77
81
 
78
- return vmulq_f64(earth_radius, central_angle);
82
+ return vmulq_f64(earth_radius_f64x2, central_angle_f64x2);
79
83
  }
80
84
 
81
85
  NK_PUBLIC void nk_haversine_f64_neon( //
@@ -84,14 +88,14 @@ NK_PUBLIC void nk_haversine_f64_neon( //
84
88
  nk_size_t n, nk_f64_t *results) {
85
89
 
86
90
  while (n >= 2) {
87
- float64x2_t first_latitudes = vld1q_f64(a_lats);
88
- float64x2_t first_longitudes = vld1q_f64(a_lons);
89
- float64x2_t second_latitudes = vld1q_f64(b_lats);
90
- float64x2_t second_longitudes = vld1q_f64(b_lons);
91
+ float64x2_t first_latitudes_f64x2 = vld1q_f64(a_lats);
92
+ float64x2_t first_longitudes_f64x2 = vld1q_f64(a_lons);
93
+ float64x2_t second_latitudes_f64x2 = vld1q_f64(b_lats);
94
+ float64x2_t second_longitudes_f64x2 = vld1q_f64(b_lons);
91
95
 
92
- float64x2_t distances = nk_haversine_f64x2_neon_(first_latitudes, first_longitudes, second_latitudes,
93
- second_longitudes);
94
- vst1q_f64(results, distances);
96
+ float64x2_t distances_f64x2 = nk_haversine_f64x2_neon_(first_latitudes_f64x2, first_longitudes_f64x2,
97
+ second_latitudes_f64x2, second_longitudes_f64x2);
98
+ vst1q_f64(results, distances_f64x2);
95
99
 
96
100
  a_lats += 2, a_lons += 2, b_lats += 2, b_lons += 2, results += 2, n -= 2;
97
101
  }
@@ -103,52 +107,56 @@ NK_PUBLIC void nk_haversine_f64_neon( //
103
107
  nk_partial_load_b64x2_serial_(a_lons, &a_lon_vec, n);
104
108
  nk_partial_load_b64x2_serial_(b_lats, &b_lat_vec, n);
105
109
  nk_partial_load_b64x2_serial_(b_lons, &b_lon_vec, n);
106
- float64x2_t distances = nk_haversine_f64x2_neon_(a_lat_vec.f64x2, a_lon_vec.f64x2, b_lat_vec.f64x2,
107
- b_lon_vec.f64x2);
108
- result_vec.f64x2 = distances;
110
+ float64x2_t distances_f64x2 = nk_haversine_f64x2_neon_(a_lat_vec.f64x2, a_lon_vec.f64x2, b_lat_vec.f64x2,
111
+ b_lon_vec.f64x2);
112
+ result_vec.f64x2 = distances_f64x2;
109
113
  nk_partial_store_b64x2_serial_(&result_vec, results, n);
110
114
  }
111
115
  }
112
116
 
113
- NK_INTERNAL float32x4_t nk_haversine_f32x4_neon_( //
114
- float32x4_t first_latitudes, float32x4_t first_longitudes, //
115
- float32x4_t second_latitudes, float32x4_t second_longitudes) {
117
+ NK_INTERNAL float32x4_t nk_haversine_f32x4_neon_( //
118
+ float32x4_t first_latitudes_f32x4, float32x4_t first_longitudes_f32x4, //
119
+ float32x4_t second_latitudes_f32x4, float32x4_t second_longitudes_f32x4) {
116
120
 
117
- float32x4_t const earth_radius = vdupq_n_f32((float)NK_EARTH_MEDIATORIAL_RADIUS);
118
- float32x4_t const half = vdupq_n_f32(0.5f);
119
- float32x4_t const one = vdupq_n_f32(1.0f);
120
- float32x4_t const two = vdupq_n_f32(2.0f);
121
+ float32x4_t const earth_radius_f32x4 = vdupq_n_f32((float)NK_EARTH_MEDIATORIAL_RADIUS);
122
+ float32x4_t const half_f32x4 = vdupq_n_f32(0.5f);
123
+ float32x4_t const one_f32x4 = vdupq_n_f32(1.0f);
124
+ float32x4_t const two_f32x4 = vdupq_n_f32(2.0f);
121
125
 
122
- float32x4_t latitude_delta = vsubq_f32(second_latitudes, first_latitudes);
123
- float32x4_t longitude_delta = vsubq_f32(second_longitudes, first_longitudes);
126
+ float32x4_t latitude_delta_f32x4 = vsubq_f32(second_latitudes_f32x4, first_latitudes_f32x4);
127
+ float32x4_t longitude_delta_f32x4 = vsubq_f32(second_longitudes_f32x4, first_longitudes_f32x4);
124
128
 
125
129
  // Haversine terms: sin²(Δ/2)
126
- float32x4_t latitude_delta_half = vmulq_f32(latitude_delta, half);
127
- float32x4_t longitude_delta_half = vmulq_f32(longitude_delta, half);
128
- float32x4_t sin_latitude_delta_half = nk_sin_f32x4_neon_(latitude_delta_half);
129
- float32x4_t sin_longitude_delta_half = nk_sin_f32x4_neon_(longitude_delta_half);
130
- float32x4_t sin_squared_latitude_delta_half = vmulq_f32(sin_latitude_delta_half, sin_latitude_delta_half);
131
- float32x4_t sin_squared_longitude_delta_half = vmulq_f32(sin_longitude_delta_half, sin_longitude_delta_half);
130
+ float32x4_t latitude_delta_half_f32x4 = vmulq_f32(latitude_delta_f32x4, half_f32x4);
131
+ float32x4_t longitude_delta_half_f32x4 = vmulq_f32(longitude_delta_f32x4, half_f32x4);
132
+ float32x4_t sin_latitude_delta_half_f32x4 = nk_sin_f32x4_neon_(latitude_delta_half_f32x4);
133
+ float32x4_t sin_longitude_delta_half_f32x4 = nk_sin_f32x4_neon_(longitude_delta_half_f32x4);
134
+ float32x4_t sin_squared_latitude_delta_half_f32x4 = vmulq_f32(sin_latitude_delta_half_f32x4,
135
+ sin_latitude_delta_half_f32x4);
136
+ float32x4_t sin_squared_longitude_delta_half_f32x4 = vmulq_f32(sin_longitude_delta_half_f32x4,
137
+ sin_longitude_delta_half_f32x4);
132
138
 
133
139
  // Latitude cosine product
134
- float32x4_t cos_first_latitude = nk_cos_f32x4_neon_(first_latitudes);
135
- float32x4_t cos_second_latitude = nk_cos_f32x4_neon_(second_latitudes);
136
- float32x4_t cos_latitude_product = vmulq_f32(cos_first_latitude, cos_second_latitude);
140
+ float32x4_t cos_first_latitude_f32x4 = nk_cos_f32x4_neon_(first_latitudes_f32x4);
141
+ float32x4_t cos_second_latitude_f32x4 = nk_cos_f32x4_neon_(second_latitudes_f32x4);
142
+ float32x4_t cos_latitude_product_f32x4 = vmulq_f32(cos_first_latitude_f32x4, cos_second_latitude_f32x4);
137
143
 
138
144
  // a = sin²(Δlat/2) + cos(lat1) × cos(lat2) × sin²(Δlon/2)
139
- float32x4_t haversine_term = vaddq_f32(sin_squared_latitude_delta_half,
140
- vmulq_f32(cos_latitude_product, sin_squared_longitude_delta_half));
145
+ float32x4_t haversine_term_f32x4 = vaddq_f32(
146
+ sin_squared_latitude_delta_half_f32x4,
147
+ vmulq_f32(cos_latitude_product_f32x4, sin_squared_longitude_delta_half_f32x4));
141
148
 
142
149
  // Clamp to [0, 1] to avoid NaN from sqrt of negative numbers (due to floating point errors)
143
- float32x4_t zero = vdupq_n_f32(0.0f);
144
- haversine_term = vmaxq_f32(zero, vminq_f32(one, haversine_term));
150
+ float32x4_t zero_f32x4 = vdupq_n_f32(0.0f);
151
+ haversine_term_f32x4 = vmaxq_f32(zero_f32x4, vminq_f32(one_f32x4, haversine_term_f32x4));
145
152
 
146
153
  // Central angle: c = 2 × atan2(√a, √(1-a))
147
- float32x4_t sqrt_haversine = vsqrtq_f32(haversine_term);
148
- float32x4_t sqrt_complement = vsqrtq_f32(vsubq_f32(one, haversine_term));
149
- float32x4_t central_angle = vmulq_f32(two, nk_atan2_f32x4_neon_(sqrt_haversine, sqrt_complement));
154
+ float32x4_t sqrt_haversine_f32x4 = vsqrtq_f32(haversine_term_f32x4);
155
+ float32x4_t sqrt_complement_f32x4 = vsqrtq_f32(vsubq_f32(one_f32x4, haversine_term_f32x4));
156
+ float32x4_t central_angle_f32x4 = vmulq_f32(two_f32x4,
157
+ nk_atan2_f32x4_neon_(sqrt_haversine_f32x4, sqrt_complement_f32x4));
150
158
 
151
- return vmulq_f32(earth_radius, central_angle);
159
+ return vmulq_f32(earth_radius_f32x4, central_angle_f32x4);
152
160
  }
153
161
 
154
162
  NK_PUBLIC void nk_haversine_f32_neon( //
@@ -157,14 +165,14 @@ NK_PUBLIC void nk_haversine_f32_neon( //
157
165
  nk_size_t n, nk_f32_t *results) {
158
166
 
159
167
  while (n >= 4) {
160
- float32x4_t first_latitudes = vld1q_f32(a_lats);
161
- float32x4_t first_longitudes = vld1q_f32(a_lons);
162
- float32x4_t second_latitudes = vld1q_f32(b_lats);
163
- float32x4_t second_longitudes = vld1q_f32(b_lons);
168
+ float32x4_t first_latitudes_f32x4 = vld1q_f32(a_lats);
169
+ float32x4_t first_longitudes_f32x4 = vld1q_f32(a_lons);
170
+ float32x4_t second_latitudes_f32x4 = vld1q_f32(b_lats);
171
+ float32x4_t second_longitudes_f32x4 = vld1q_f32(b_lons);
164
172
 
165
- float32x4_t distances = nk_haversine_f32x4_neon_(first_latitudes, first_longitudes, second_latitudes,
166
- second_longitudes);
167
- vst1q_f32(results, distances);
173
+ float32x4_t distances_f32x4 = nk_haversine_f32x4_neon_(first_latitudes_f32x4, first_longitudes_f32x4,
174
+ second_latitudes_f32x4, second_longitudes_f32x4);
175
+ vst1q_f32(results, distances_f32x4);
168
176
 
169
177
  a_lats += 4, a_lons += 4, b_lats += 4, b_lons += 4, results += 4, n -= 4;
170
178
  }
@@ -176,9 +184,9 @@ NK_PUBLIC void nk_haversine_f32_neon( //
176
184
  nk_partial_load_b32x4_serial_(a_lons, &a_lon_vec, n);
177
185
  nk_partial_load_b32x4_serial_(b_lats, &b_lat_vec, n);
178
186
  nk_partial_load_b32x4_serial_(b_lons, &b_lon_vec, n);
179
- float32x4_t distances = nk_haversine_f32x4_neon_(a_lat_vec.f32x4, a_lon_vec.f32x4, b_lat_vec.f32x4,
180
- b_lon_vec.f32x4);
181
- result_vec.f32x4 = distances;
187
+ float32x4_t distances_f32x4 = nk_haversine_f32x4_neon_(a_lat_vec.f32x4, a_lon_vec.f32x4, b_lat_vec.f32x4,
188
+ b_lon_vec.f32x4);
189
+ result_vec.f32x4 = distances_f32x4;
182
190
  nk_partial_store_b32x4_serial_(&result_vec, results, n);
183
191
  }
184
192
  }
@@ -187,158 +195,176 @@ NK_PUBLIC void nk_haversine_f32_neon( //
187
195
  * @brief NEON helper for Vincenty's geodesic distance on 2 f64 point pairs.
188
196
  * @note This is a true SIMD implementation using masked convergence tracking via blending.
189
197
  */
190
- NK_INTERNAL float64x2_t nk_vincenty_f64x2_neon_( //
191
- float64x2_t first_latitudes, float64x2_t first_longitudes, //
192
- float64x2_t second_latitudes, float64x2_t second_longitudes) {
193
-
194
- float64x2_t const equatorial_radius = vdupq_n_f64(NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
195
- float64x2_t const polar_radius = vdupq_n_f64(NK_EARTH_ELLIPSOID_POLAR_RADIUS);
196
- float64x2_t const flattening = vdupq_n_f64(1.0 / NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
197
- float64x2_t const convergence_threshold = vdupq_n_f64(NK_VINCENTY_CONVERGENCE_THRESHOLD_F64);
198
- float64x2_t const one = vdupq_n_f64(1.0);
199
- float64x2_t const two = vdupq_n_f64(2.0);
200
- float64x2_t const three = vdupq_n_f64(3.0);
201
- float64x2_t const four = vdupq_n_f64(4.0);
202
- float64x2_t const six = vdupq_n_f64(6.0);
203
- float64x2_t const sixteen = vdupq_n_f64(16.0);
204
- float64x2_t const epsilon = vdupq_n_f64(1e-15);
198
+ NK_INTERNAL float64x2_t nk_vincenty_f64x2_neon_( //
199
+ float64x2_t first_latitudes_f64x2, float64x2_t first_longitudes_f64x2, //
200
+ float64x2_t second_latitudes_f64x2, float64x2_t second_longitudes_f64x2) {
201
+
202
+ float64x2_t const equatorial_radius_f64x2 = vdupq_n_f64(NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
203
+ float64x2_t const polar_radius_f64x2 = vdupq_n_f64(NK_EARTH_ELLIPSOID_POLAR_RADIUS);
204
+ float64x2_t const flattening_f64x2 = vdupq_n_f64(1.0 / NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
205
+ float64x2_t const convergence_threshold_f64x2 = vdupq_n_f64(NK_VINCENTY_CONVERGENCE_THRESHOLD_F64);
206
+ float64x2_t const one_f64x2 = vdupq_n_f64(1.0);
207
+ float64x2_t const two_f64x2 = vdupq_n_f64(2.0);
208
+ float64x2_t const three_f64x2 = vdupq_n_f64(3.0);
209
+ float64x2_t const four_f64x2 = vdupq_n_f64(4.0);
210
+ float64x2_t const six_f64x2 = vdupq_n_f64(6.0);
211
+ float64x2_t const sixteen_f64x2 = vdupq_n_f64(16.0);
212
+ float64x2_t const epsilon_f64x2 = vdupq_n_f64(1e-15);
205
213
 
206
214
  // Longitude difference
207
- float64x2_t longitude_difference = vsubq_f64(second_longitudes, first_longitudes);
215
+ float64x2_t longitude_difference_f64x2 = vsubq_f64(second_longitudes_f64x2, first_longitudes_f64x2);
208
216
 
209
217
  // Reduced latitudes: tan(U) = (1-f) * tan(lat)
210
- float64x2_t one_minus_f = vsubq_f64(one, flattening);
211
- float64x2_t tan_first = vdivq_f64(nk_sin_f64x2_neon_(first_latitudes), nk_cos_f64x2_neon_(first_latitudes));
212
- float64x2_t tan_second = vdivq_f64(nk_sin_f64x2_neon_(second_latitudes), nk_cos_f64x2_neon_(second_latitudes));
213
- float64x2_t tan_reduced_first = vmulq_f64(one_minus_f, tan_first);
214
- float64x2_t tan_reduced_second = vmulq_f64(one_minus_f, tan_second);
218
+ float64x2_t one_minus_f_f64x2 = vsubq_f64(one_f64x2, flattening_f64x2);
219
+ float64x2_t tan_first_f64x2 = vdivq_f64(nk_sin_f64x2_neon_(first_latitudes_f64x2),
220
+ nk_cos_f64x2_neon_(first_latitudes_f64x2));
221
+ float64x2_t tan_second_f64x2 = vdivq_f64(nk_sin_f64x2_neon_(second_latitudes_f64x2),
222
+ nk_cos_f64x2_neon_(second_latitudes_f64x2));
223
+ float64x2_t tan_reduced_first_f64x2 = vmulq_f64(one_minus_f_f64x2, tan_first_f64x2);
224
+ float64x2_t tan_reduced_second_f64x2 = vmulq_f64(one_minus_f_f64x2, tan_second_f64x2);
215
225
 
216
226
  // cos(U) = 1/√(1 + tan²(U)), sin(U) = tan(U) × cos(U)
217
- float64x2_t cos_reduced_first = vdivq_f64(one, vsqrtq_f64(vfmaq_f64(one, tan_reduced_first, tan_reduced_first)));
218
- float64x2_t sin_reduced_first = vmulq_f64(tan_reduced_first, cos_reduced_first);
219
- float64x2_t cos_reduced_second = vdivq_f64(one, vsqrtq_f64(vfmaq_f64(one, tan_reduced_second, tan_reduced_second)));
220
- float64x2_t sin_reduced_second = vmulq_f64(tan_reduced_second, cos_reduced_second);
221
-
222
- // Initialize lambda and tracking variables
223
- float64x2_t lambda = longitude_difference;
224
- float64x2_t sin_angular_distance, cos_angular_distance, angular_distance;
225
- float64x2_t sin_azimuth, cos_squared_azimuth, cos_double_angular_midpoint;
227
+ float64x2_t cos_reduced_first_f64x2 = vdivq_f64(
228
+ one_f64x2, vsqrtq_f64(vfmaq_f64(one_f64x2, tan_reduced_first_f64x2, tan_reduced_first_f64x2)));
229
+ float64x2_t sin_reduced_first_f64x2 = vmulq_f64(tan_reduced_first_f64x2, cos_reduced_first_f64x2);
230
+ float64x2_t cos_reduced_second_f64x2 = vdivq_f64(
231
+ one_f64x2, vsqrtq_f64(vfmaq_f64(one_f64x2, tan_reduced_second_f64x2, tan_reduced_second_f64x2)));
232
+ float64x2_t sin_reduced_second_f64x2 = vmulq_f64(tan_reduced_second_f64x2, cos_reduced_second_f64x2);
233
+
234
+ // Initialize lambda_f64x2 and tracking variables
235
+ float64x2_t lambda_f64x2 = longitude_difference_f64x2;
236
+ float64x2_t sin_angular_distance_f64x2, cos_angular_distance_f64x2, angular_distance_f64x2;
237
+ float64x2_t sin_azimuth_f64x2, cos_squared_azimuth_f64x2, cos_double_angular_midpoint_f64x2;
226
238
 
227
239
  // Track convergence and coincident points using masks
228
- uint64x2_t converged_mask = vdupq_n_u64(0);
229
- uint64x2_t coincident_mask = vdupq_n_u64(0);
240
+ uint64x2_t converged_mask_u64x2 = vdupq_n_u64(0);
241
+ uint64x2_t coincident_mask_u64x2 = vdupq_n_u64(0);
230
242
 
231
243
  for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS; ++iteration) {
232
244
  // Check if all lanes converged
233
- uint64_t converged_bits = vgetq_lane_u64(converged_mask, 0) & vgetq_lane_u64(converged_mask, 1);
245
+ nk_u64_t converged_bits = vgetq_lane_u64(converged_mask_u64x2, 0) & vgetq_lane_u64(converged_mask_u64x2, 1);
234
246
  if (converged_bits) break;
235
247
 
236
- float64x2_t sin_lambda = nk_sin_f64x2_neon_(lambda);
237
- float64x2_t cos_lambda = nk_cos_f64x2_neon_(lambda);
248
+ float64x2_t sin_lambda_f64x2 = nk_sin_f64x2_neon_(lambda_f64x2);
249
+ float64x2_t cos_lambda_f64x2 = nk_cos_f64x2_neon_(lambda_f64x2);
238
250
 
239
- // sin²(angular_distance) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
240
- float64x2_t cross_term = vmulq_f64(cos_reduced_second, sin_lambda);
241
- float64x2_t mixed_term = vsubq_f64(vmulq_f64(cos_reduced_first, sin_reduced_second),
242
- vmulq_f64(vmulq_f64(sin_reduced_first, cos_reduced_second), cos_lambda));
243
- float64x2_t sin_angular_dist_sq = vfmaq_f64(vmulq_f64(mixed_term, mixed_term), cross_term, cross_term);
244
- sin_angular_distance = vsqrtq_f64(sin_angular_dist_sq);
251
+ // sin²(angular_distance_f64x2) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
252
+ float64x2_t cross_term_f64x2 = vmulq_f64(cos_reduced_second_f64x2, sin_lambda_f64x2);
253
+ float64x2_t mixed_term_f64x2 = vsubq_f64(
254
+ vmulq_f64(cos_reduced_first_f64x2, sin_reduced_second_f64x2),
255
+ vmulq_f64(vmulq_f64(sin_reduced_first_f64x2, cos_reduced_second_f64x2), cos_lambda_f64x2));
256
+ float64x2_t sin_angular_dist_sq_f64x2 = vfmaq_f64(vmulq_f64(mixed_term_f64x2, mixed_term_f64x2),
257
+ cross_term_f64x2, cross_term_f64x2);
258
+ sin_angular_distance_f64x2 = vsqrtq_f64(sin_angular_dist_sq_f64x2);
245
259
 
246
- // Check for coincident points (sin_angular_distance ≈ 0)
247
- coincident_mask = vcltq_f64(sin_angular_distance, epsilon);
260
+ // Check for coincident points (sin_angular_distance_f64x2 ≈ 0)
261
+ coincident_mask_u64x2 = vcltq_f64(sin_angular_distance_f64x2, epsilon_f64x2);
248
262
 
249
- // cos(angular_distance) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
250
- cos_angular_distance = vfmaq_f64(vmulq_f64(sin_reduced_first, sin_reduced_second),
251
- vmulq_f64(cos_reduced_first, cos_reduced_second), cos_lambda);
263
+ // cos(angular_distance_f64x2) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
264
+ cos_angular_distance_f64x2 = vfmaq_f64(vmulq_f64(sin_reduced_first_f64x2, sin_reduced_second_f64x2),
265
+ vmulq_f64(cos_reduced_first_f64x2, cos_reduced_second_f64x2),
266
+ cos_lambda_f64x2);
252
267
 
253
- // angular_distance = atan2(sin, cos)
254
- angular_distance = nk_atan2_f64x2_neon_(sin_angular_distance, cos_angular_distance);
268
+ // angular_distance_f64x2 = atan2(sin, cos)
269
+ angular_distance_f64x2 = nk_atan2_f64x2_neon_(sin_angular_distance_f64x2, cos_angular_distance_f64x2);
255
270
 
256
- // sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance)
271
+ // sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance_f64x2)
257
272
  // Avoid division by zero by using blending
258
- float64x2_t safe_sin_angular = vbslq_f64(coincident_mask, one, sin_angular_distance);
259
- sin_azimuth = vdivq_f64(vmulq_f64(vmulq_f64(cos_reduced_first, cos_reduced_second), sin_lambda),
260
- safe_sin_angular);
261
- cos_squared_azimuth = vsubq_f64(one, vmulq_f64(sin_azimuth, sin_azimuth));
273
+ float64x2_t safe_sin_angular_f64x2 = vbslq_f64(coincident_mask_u64x2, one_f64x2, sin_angular_distance_f64x2);
274
+ sin_azimuth_f64x2 = vdivq_f64(
275
+ vmulq_f64(vmulq_f64(cos_reduced_first_f64x2, cos_reduced_second_f64x2), sin_lambda_f64x2),
276
+ safe_sin_angular_f64x2);
277
+ cos_squared_azimuth_f64x2 = vsubq_f64(one_f64x2, vmulq_f64(sin_azimuth_f64x2, sin_azimuth_f64x2));
262
278
 
263
279
  // Handle equatorial case: cos²α ≈ 0
264
- uint64x2_t equatorial_mask = vcltq_f64(cos_squared_azimuth, epsilon);
265
- float64x2_t safe_cos_sq_azimuth = vbslq_f64(equatorial_mask, one, cos_squared_azimuth);
280
+ uint64x2_t equatorial_mask_u64x2 = vcltq_f64(cos_squared_azimuth_f64x2, epsilon_f64x2);
281
+ float64x2_t safe_cos_sq_azimuth_f64x2 = vbslq_f64(equatorial_mask_u64x2, one_f64x2, cos_squared_azimuth_f64x2);
266
282
 
267
283
  // cos(2σₘ) = cos(σ) - 2 × sin(U₁) × sin(U₂) / cos²(α)
268
- float64x2_t sin_product = vmulq_f64(sin_reduced_first, sin_reduced_second);
269
- cos_double_angular_midpoint = vsubq_f64(cos_angular_distance,
270
- vdivq_f64(vmulq_f64(two, sin_product), safe_cos_sq_azimuth));
271
- cos_double_angular_midpoint = vbslq_f64(equatorial_mask, vdupq_n_f64(0.0), cos_double_angular_midpoint);
284
+ float64x2_t sin_product_f64x2 = vmulq_f64(sin_reduced_first_f64x2, sin_reduced_second_f64x2);
285
+ cos_double_angular_midpoint_f64x2 = vsubq_f64(
286
+ cos_angular_distance_f64x2, vdivq_f64(vmulq_f64(two_f64x2, sin_product_f64x2), safe_cos_sq_azimuth_f64x2));
287
+ cos_double_angular_midpoint_f64x2 = vbslq_f64(equatorial_mask_u64x2, vdupq_n_f64(0.0),
288
+ cos_double_angular_midpoint_f64x2);
272
289
 
273
290
  // C = f/16 * cos²α * (4 + f*(4 - 3*cos²α))
274
- float64x2_t correction_factor = vmulq_f64(
275
- vdivq_f64(flattening, sixteen),
276
- vmulq_f64(cos_squared_azimuth, vfmaq_f64(four, flattening, vfmsq_f64(four, three, cos_squared_azimuth))));
291
+ float64x2_t correction_factor_f64x2 = vmulq_f64(
292
+ vdivq_f64(flattening_f64x2, sixteen_f64x2),
293
+ vmulq_f64(cos_squared_azimuth_f64x2,
294
+ vfmaq_f64(four_f64x2, flattening_f64x2,
295
+ vfmsq_f64(four_f64x2, three_f64x2, cos_squared_azimuth_f64x2))));
277
296
 
278
297
  // λ' = L + (1-C) × f × sin(α) × (σ + C × sin(σ) × (cos(2σₘ) + C × cos(σ) × (-1 + 2 × cos²(2σₘ))))
279
- float64x2_t cos_2sm_sq = vmulq_f64(cos_double_angular_midpoint, cos_double_angular_midpoint);
280
- // innermost = -1 + 2 × cos²(2σₘ)
281
- float64x2_t innermost = vfmaq_f64(vdupq_n_f64(-1.0), two, cos_2sm_sq);
282
- // middle = cos(2σₘ) + C × cos(σ) × innermost
283
- float64x2_t middle = vfmaq_f64(cos_double_angular_midpoint, vmulq_f64(correction_factor, cos_angular_distance),
284
- innermost);
285
- // inner = C × sin(σ) × middle
286
- float64x2_t inner = vmulq_f64(vmulq_f64(correction_factor, sin_angular_distance), middle);
287
-
288
- // λ' = L + (1-C) * f * sin_α * (σ + inner)
289
- float64x2_t lambda_new = vfmaq_f64(
290
- longitude_difference, vmulq_f64(vmulq_f64(vsubq_f64(one, correction_factor), flattening), sin_azimuth),
291
- vaddq_f64(angular_distance, inner));
298
+ float64x2_t cos_2sm_sq_f64x2 = vmulq_f64(cos_double_angular_midpoint_f64x2, cos_double_angular_midpoint_f64x2);
299
+ // innermost_f64x2 = -1 + 2 × cos²(2σₘ)
300
+ float64x2_t innermost_f64x2 = vfmaq_f64(vdupq_n_f64(-1.0), two_f64x2, cos_2sm_sq_f64x2);
301
+ // middle_f64x2 = cos(2σₘ) + C × cos(σ) × innermost_f64x2
302
+ float64x2_t middle_f64x2 = vfmaq_f64(cos_double_angular_midpoint_f64x2,
303
+ vmulq_f64(correction_factor_f64x2, cos_angular_distance_f64x2),
304
+ innermost_f64x2);
305
+ // inner_f64x2 = C × sin(σ) × middle_f64x2
306
+ float64x2_t inner_f64x2 = vmulq_f64(vmulq_f64(correction_factor_f64x2, sin_angular_distance_f64x2),
307
+ middle_f64x2);
308
+
309
+ // λ' = L + (1-C) * f * sin_α * (σ + inner_f64x2)
310
+ float64x2_t lambda_new_f64x2 = vfmaq_f64(
311
+ longitude_difference_f64x2,
312
+ vmulq_f64(vmulq_f64(vsubq_f64(one_f64x2, correction_factor_f64x2), flattening_f64x2), sin_azimuth_f64x2),
313
+ vaddq_f64(angular_distance_f64x2, inner_f64x2));
292
314
 
293
315
  // Check convergence: |λ - λ'| < threshold
294
- float64x2_t lambda_diff = vsubq_f64(lambda_new, lambda);
295
- float64x2_t lambda_diff_abs = vabsq_f64(lambda_diff);
296
- uint64x2_t newly_converged = vcltq_f64(lambda_diff_abs, convergence_threshold);
297
- converged_mask = vorrq_u64(converged_mask, newly_converged);
316
+ float64x2_t lambda_diff_f64x2 = vsubq_f64(lambda_new_f64x2, lambda_f64x2);
317
+ float64x2_t lambda_diff_abs_f64x2 = vabsq_f64(lambda_diff_f64x2);
318
+ uint64x2_t newly_converged_u64x2 = vcltq_f64(lambda_diff_abs_f64x2, convergence_threshold_f64x2);
319
+ converged_mask_u64x2 = vorrq_u64(converged_mask_u64x2, newly_converged_u64x2);
298
320
 
299
- // Only update lambda for non-converged lanes
300
- lambda = vbslq_f64(converged_mask, lambda, lambda_new);
321
+ // Only update lambda_f64x2 for non-converged lanes
322
+ lambda_f64x2 = vbslq_f64(converged_mask_u64x2, lambda_f64x2, lambda_new_f64x2);
301
323
  }
302
324
 
303
325
  // Final distance calculation
304
326
  // u² = cos²α * (a² - b²) / b²
305
- float64x2_t a_sq = vmulq_f64(equatorial_radius, equatorial_radius);
306
- float64x2_t b_sq = vmulq_f64(polar_radius, polar_radius);
307
- float64x2_t u_squared = vdivq_f64(vmulq_f64(cos_squared_azimuth, vsubq_f64(a_sq, b_sq)), b_sq);
327
+ float64x2_t a_sq_f64x2 = vmulq_f64(equatorial_radius_f64x2, equatorial_radius_f64x2);
328
+ float64x2_t b_sq_f64x2 = vmulq_f64(polar_radius_f64x2, polar_radius_f64x2);
329
+ float64x2_t u_squared_f64x2 = vdivq_f64(vmulq_f64(cos_squared_azimuth_f64x2, vsubq_f64(a_sq_f64x2, b_sq_f64x2)),
330
+ b_sq_f64x2);
308
331
 
309
332
  // A = 1 + u²/16384 * (4096 + u²*(-768 + u²*(320 - 175*u²)))
310
- float64x2_t series_a = vfmaq_f64(vdupq_n_f64(320.0), u_squared, vdupq_n_f64(-175.0));
311
- series_a = vfmaq_f64(vdupq_n_f64(-768.0), u_squared, series_a);
312
- series_a = vfmaq_f64(vdupq_n_f64(4096.0), u_squared, series_a);
313
- series_a = vfmaq_f64(one, vdivq_f64(u_squared, vdupq_n_f64(16384.0)), series_a);
333
+ float64x2_t series_a_f64x2 = vfmaq_f64(vdupq_n_f64(320.0), u_squared_f64x2, vdupq_n_f64(-175.0));
334
+ series_a_f64x2 = vfmaq_f64(vdupq_n_f64(-768.0), u_squared_f64x2, series_a_f64x2);
335
+ series_a_f64x2 = vfmaq_f64(vdupq_n_f64(4096.0), u_squared_f64x2, series_a_f64x2);
336
+ series_a_f64x2 = vfmaq_f64(one_f64x2, vdivq_f64(u_squared_f64x2, vdupq_n_f64(16384.0)), series_a_f64x2);
314
337
 
315
338
  // B = u²/1024 * (256 + u²*(-128 + u²*(74 - 47*u²)))
316
- float64x2_t series_b = vfmaq_f64(vdupq_n_f64(74.0), u_squared, vdupq_n_f64(-47.0));
317
- series_b = vfmaq_f64(vdupq_n_f64(-128.0), u_squared, series_b);
318
- series_b = vfmaq_f64(vdupq_n_f64(256.0), u_squared, series_b);
319
- series_b = vmulq_f64(vdivq_f64(u_squared, vdupq_n_f64(1024.0)), series_b);
339
+ float64x2_t series_b_f64x2 = vfmaq_f64(vdupq_n_f64(74.0), u_squared_f64x2, vdupq_n_f64(-47.0));
340
+ series_b_f64x2 = vfmaq_f64(vdupq_n_f64(-128.0), u_squared_f64x2, series_b_f64x2);
341
+ series_b_f64x2 = vfmaq_f64(vdupq_n_f64(256.0), u_squared_f64x2, series_b_f64x2);
342
+ series_b_f64x2 = vmulq_f64(vdivq_f64(u_squared_f64x2, vdupq_n_f64(1024.0)), series_b_f64x2);
320
343
 
321
344
  // Δσ = B × sin(σ) × (cos(2σₘ) + B/4 × (cos(σ) × (-1 + 2 × cos²(2σₘ)) - B/6 × cos(2σₘ) × (-3 + 4 × sin²(σ)) × (-3 +
322
345
  // 4 × cos²(2σₘ))))
323
- float64x2_t cos_2sm_sq = vmulq_f64(cos_double_angular_midpoint, cos_double_angular_midpoint);
324
- float64x2_t sin_sq = vmulq_f64(sin_angular_distance, sin_angular_distance);
325
- float64x2_t term1 = vfmaq_f64(vdupq_n_f64(-1.0), two, cos_2sm_sq);
326
- term1 = vmulq_f64(cos_angular_distance, term1);
327
- float64x2_t term2 = vfmaq_f64(vdupq_n_f64(-3.0), four, sin_sq);
328
- float64x2_t term3 = vfmaq_f64(vdupq_n_f64(-3.0), four, cos_2sm_sq);
329
- term2 = vmulq_f64(vmulq_f64(vdivq_f64(series_b, six), cos_double_angular_midpoint), vmulq_f64(term2, term3));
330
- float64x2_t delta_sigma = vmulq_f64(
331
- series_b,
332
- vmulq_f64(sin_angular_distance, vaddq_f64(cos_double_angular_midpoint,
333
- vmulq_f64(vdivq_f64(series_b, four), vsubq_f64(term1, term2)))));
346
+ float64x2_t cos_2sm_sq_f64x2 = vmulq_f64(cos_double_angular_midpoint_f64x2, cos_double_angular_midpoint_f64x2);
347
+ float64x2_t sin_sq_f64x2 = vmulq_f64(sin_angular_distance_f64x2, sin_angular_distance_f64x2);
348
+ float64x2_t term1_f64x2 = vfmaq_f64(vdupq_n_f64(-1.0), two_f64x2, cos_2sm_sq_f64x2);
349
+ term1_f64x2 = vmulq_f64(cos_angular_distance_f64x2, term1_f64x2);
350
+ float64x2_t term2_f64x2 = vfmaq_f64(vdupq_n_f64(-3.0), four_f64x2, sin_sq_f64x2);
351
+ float64x2_t term3_f64x2 = vfmaq_f64(vdupq_n_f64(-3.0), four_f64x2, cos_2sm_sq_f64x2);
352
+ term2_f64x2 = vmulq_f64(vmulq_f64(vdivq_f64(series_b_f64x2, six_f64x2), cos_double_angular_midpoint_f64x2),
353
+ vmulq_f64(term2_f64x2, term3_f64x2));
354
+ float64x2_t delta_sigma_f64x2 = vmulq_f64(
355
+ series_b_f64x2,
356
+ vmulq_f64(sin_angular_distance_f64x2,
357
+ vaddq_f64(cos_double_angular_midpoint_f64x2,
358
+ vmulq_f64(vdivq_f64(series_b_f64x2, four_f64x2), vsubq_f64(term1_f64x2, term2_f64x2)))));
334
359
 
335
360
  // s = b * A * (σ - Δσ)
336
- float64x2_t distances = vmulq_f64(vmulq_f64(polar_radius, series_a), vsubq_f64(angular_distance, delta_sigma));
361
+ float64x2_t distances_f64x2 = vmulq_f64(vmulq_f64(polar_radius_f64x2, series_a_f64x2),
362
+ vsubq_f64(angular_distance_f64x2, delta_sigma_f64x2));
337
363
 
338
364
  // Set coincident points to zero
339
- distances = vbslq_f64(coincident_mask, vdupq_n_f64(0.0), distances);
365
+ distances_f64x2 = vbslq_f64(coincident_mask_u64x2, vdupq_n_f64(0.0), distances_f64x2);
340
366
 
341
- return distances;
367
+ return distances_f64x2;
342
368
  }
343
369
 
344
370
  NK_PUBLIC void nk_vincenty_f64_neon( //
@@ -347,14 +373,14 @@ NK_PUBLIC void nk_vincenty_f64_neon( //
347
373
  nk_size_t n, nk_f64_t *results) {
348
374
 
349
375
  while (n >= 2) {
350
- float64x2_t first_latitudes = vld1q_f64(a_lats);
351
- float64x2_t first_longitudes = vld1q_f64(a_lons);
352
- float64x2_t second_latitudes = vld1q_f64(b_lats);
353
- float64x2_t second_longitudes = vld1q_f64(b_lons);
376
+ float64x2_t first_latitudes_f64x2 = vld1q_f64(a_lats);
377
+ float64x2_t first_longitudes_f64x2 = vld1q_f64(a_lons);
378
+ float64x2_t second_latitudes_f64x2 = vld1q_f64(b_lats);
379
+ float64x2_t second_longitudes_f64x2 = vld1q_f64(b_lons);
354
380
 
355
- float64x2_t distances = nk_vincenty_f64x2_neon_(first_latitudes, first_longitudes, second_latitudes,
356
- second_longitudes);
357
- vst1q_f64(results, distances);
381
+ float64x2_t distances_f64x2 = nk_vincenty_f64x2_neon_(first_latitudes_f64x2, first_longitudes_f64x2,
382
+ second_latitudes_f64x2, second_longitudes_f64x2);
383
+ vst1q_f64(results, distances_f64x2);
358
384
 
359
385
  a_lats += 2, a_lons += 2, b_lats += 2, b_lons += 2, results += 2, n -= 2;
360
386
  }
@@ -366,9 +392,9 @@ NK_PUBLIC void nk_vincenty_f64_neon( //
366
392
  nk_partial_load_b64x2_serial_(a_lons, &a_lon_vec, n);
367
393
  nk_partial_load_b64x2_serial_(b_lats, &b_lat_vec, n);
368
394
  nk_partial_load_b64x2_serial_(b_lons, &b_lon_vec, n);
369
- float64x2_t distances = nk_vincenty_f64x2_neon_(a_lat_vec.f64x2, a_lon_vec.f64x2, b_lat_vec.f64x2,
370
- b_lon_vec.f64x2);
371
- result_vec.f64x2 = distances;
395
+ float64x2_t distances_f64x2 = nk_vincenty_f64x2_neon_(a_lat_vec.f64x2, a_lon_vec.f64x2, b_lat_vec.f64x2,
396
+ b_lon_vec.f64x2);
397
+ result_vec.f64x2 = distances_f64x2;
372
398
  nk_partial_store_b64x2_serial_(&result_vec, results, n);
373
399
  }
374
400
  }
@@ -377,151 +403,169 @@ NK_PUBLIC void nk_vincenty_f64_neon( //
377
403
  * @brief NEON helper for Vincenty's geodesic distance on 4 f32 point pairs.
378
404
  * @note This is a true SIMD implementation using masked convergence tracking via blending.
379
405
  */
380
- NK_INTERNAL float32x4_t nk_vincenty_f32x4_neon_( //
381
- float32x4_t first_latitudes, float32x4_t first_longitudes, //
382
- float32x4_t second_latitudes, float32x4_t second_longitudes) {
383
-
384
- float32x4_t const equatorial_radius = vdupq_n_f32((float)NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
385
- float32x4_t const polar_radius = vdupq_n_f32((float)NK_EARTH_ELLIPSOID_POLAR_RADIUS);
386
- float32x4_t const flattening = vdupq_n_f32(1.0f / (float)NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
387
- float32x4_t const convergence_threshold = vdupq_n_f32(NK_VINCENTY_CONVERGENCE_THRESHOLD_F32);
388
- float32x4_t const one = vdupq_n_f32(1.0f);
389
- float32x4_t const two = vdupq_n_f32(2.0f);
390
- float32x4_t const three = vdupq_n_f32(3.0f);
391
- float32x4_t const four = vdupq_n_f32(4.0f);
392
- float32x4_t const six = vdupq_n_f32(6.0f);
393
- float32x4_t const sixteen = vdupq_n_f32(16.0f);
394
- float32x4_t const epsilon = vdupq_n_f32(1e-7f);
406
+ NK_INTERNAL float32x4_t nk_vincenty_f32x4_neon_( //
407
+ float32x4_t first_latitudes_f32x4, float32x4_t first_longitudes_f32x4, //
408
+ float32x4_t second_latitudes_f32x4, float32x4_t second_longitudes_f32x4) {
409
+
410
+ float32x4_t const equatorial_radius_f32x4 = vdupq_n_f32((float)NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
411
+ float32x4_t const polar_radius_f32x4 = vdupq_n_f32((float)NK_EARTH_ELLIPSOID_POLAR_RADIUS);
412
+ float32x4_t const flattening_f32x4 = vdupq_n_f32(1.0f / (float)NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
413
+ float32x4_t const convergence_threshold_f32x4 = vdupq_n_f32(NK_VINCENTY_CONVERGENCE_THRESHOLD_F32);
414
+ float32x4_t const one_f32x4 = vdupq_n_f32(1.0f);
415
+ float32x4_t const two_f32x4 = vdupq_n_f32(2.0f);
416
+ float32x4_t const three_f32x4 = vdupq_n_f32(3.0f);
417
+ float32x4_t const four_f32x4 = vdupq_n_f32(4.0f);
418
+ float32x4_t const six_f32x4 = vdupq_n_f32(6.0f);
419
+ float32x4_t const sixteen_f32x4 = vdupq_n_f32(16.0f);
420
+ float32x4_t const epsilon_f32x4 = vdupq_n_f32(1e-7f);
395
421
 
396
422
  // Longitude difference
397
- float32x4_t longitude_difference = vsubq_f32(second_longitudes, first_longitudes);
423
+ float32x4_t longitude_difference_f32x4 = vsubq_f32(second_longitudes_f32x4, first_longitudes_f32x4);
398
424
 
399
425
  // Reduced latitudes: tan(U) = (1-f) * tan(lat)
400
- float32x4_t one_minus_f = vsubq_f32(one, flattening);
401
- float32x4_t tan_first = vdivq_f32(nk_sin_f32x4_neon_(first_latitudes), nk_cos_f32x4_neon_(first_latitudes));
402
- float32x4_t tan_second = vdivq_f32(nk_sin_f32x4_neon_(second_latitudes), nk_cos_f32x4_neon_(second_latitudes));
403
- float32x4_t tan_reduced_first = vmulq_f32(one_minus_f, tan_first);
404
- float32x4_t tan_reduced_second = vmulq_f32(one_minus_f, tan_second);
426
+ float32x4_t one_minus_f_f32x4 = vsubq_f32(one_f32x4, flattening_f32x4);
427
+ float32x4_t tan_first_f32x4 = vdivq_f32(nk_sin_f32x4_neon_(first_latitudes_f32x4),
428
+ nk_cos_f32x4_neon_(first_latitudes_f32x4));
429
+ float32x4_t tan_second_f32x4 = vdivq_f32(nk_sin_f32x4_neon_(second_latitudes_f32x4),
430
+ nk_cos_f32x4_neon_(second_latitudes_f32x4));
431
+ float32x4_t tan_reduced_first_f32x4 = vmulq_f32(one_minus_f_f32x4, tan_first_f32x4);
432
+ float32x4_t tan_reduced_second_f32x4 = vmulq_f32(one_minus_f_f32x4, tan_second_f32x4);
405
433
 
406
434
  // cos(U) = 1/√(1 + tan²(U)), sin(U) = tan(U) × cos(U)
407
- float32x4_t cos_reduced_first = vdivq_f32(one, vsqrtq_f32(vfmaq_f32(one, tan_reduced_first, tan_reduced_first)));
408
- float32x4_t sin_reduced_first = vmulq_f32(tan_reduced_first, cos_reduced_first);
409
- float32x4_t cos_reduced_second = vdivq_f32(one, vsqrtq_f32(vfmaq_f32(one, tan_reduced_second, tan_reduced_second)));
410
- float32x4_t sin_reduced_second = vmulq_f32(tan_reduced_second, cos_reduced_second);
411
-
412
- // Initialize lambda and tracking variables
413
- float32x4_t lambda = longitude_difference;
414
- float32x4_t sin_angular_distance, cos_angular_distance, angular_distance;
415
- float32x4_t sin_azimuth, cos_squared_azimuth, cos_double_angular_midpoint;
435
+ float32x4_t cos_reduced_first_f32x4 = vdivq_f32(
436
+ one_f32x4, vsqrtq_f32(vfmaq_f32(one_f32x4, tan_reduced_first_f32x4, tan_reduced_first_f32x4)));
437
+ float32x4_t sin_reduced_first_f32x4 = vmulq_f32(tan_reduced_first_f32x4, cos_reduced_first_f32x4);
438
+ float32x4_t cos_reduced_second_f32x4 = vdivq_f32(
439
+ one_f32x4, vsqrtq_f32(vfmaq_f32(one_f32x4, tan_reduced_second_f32x4, tan_reduced_second_f32x4)));
440
+ float32x4_t sin_reduced_second_f32x4 = vmulq_f32(tan_reduced_second_f32x4, cos_reduced_second_f32x4);
441
+
442
+ // Initialize lambda_f32x4 and tracking variables
443
+ float32x4_t lambda_f32x4 = longitude_difference_f32x4;
444
+ float32x4_t sin_angular_distance_f32x4, cos_angular_distance_f32x4, angular_distance_f32x4;
445
+ float32x4_t sin_azimuth_f32x4, cos_squared_azimuth_f32x4, cos_double_angular_midpoint_f32x4;
416
446
 
417
447
  // Track convergence and coincident points using masks
418
- uint32x4_t converged_mask = vdupq_n_u32(0);
419
- uint32x4_t coincident_mask = vdupq_n_u32(0);
448
+ uint32x4_t converged_mask_u32x4 = vdupq_n_u32(0);
449
+ uint32x4_t coincident_mask_u32x4 = vdupq_n_u32(0);
420
450
 
421
451
  for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS; ++iteration) {
422
452
  // Check if all lanes converged (all bits set = 0xFFFFFFFF per lane)
423
- uint32_t converged_bits = vminvq_u32(converged_mask);
453
+ nk_u32_t converged_bits = vminvq_u32(converged_mask_u32x4);
424
454
  if (converged_bits == 0xFFFFFFFF) break;
425
455
 
426
- float32x4_t sin_lambda = nk_sin_f32x4_neon_(lambda);
427
- float32x4_t cos_lambda = nk_cos_f32x4_neon_(lambda);
456
+ float32x4_t sin_lambda_f32x4 = nk_sin_f32x4_neon_(lambda_f32x4);
457
+ float32x4_t cos_lambda_f32x4 = nk_cos_f32x4_neon_(lambda_f32x4);
428
458
 
429
- // sin²(angular_distance) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
430
- float32x4_t cross_term = vmulq_f32(cos_reduced_second, sin_lambda);
431
- float32x4_t mixed_term = vsubq_f32(vmulq_f32(cos_reduced_first, sin_reduced_second),
432
- vmulq_f32(vmulq_f32(sin_reduced_first, cos_reduced_second), cos_lambda));
433
- float32x4_t sin_angular_dist_sq = vfmaq_f32(vmulq_f32(mixed_term, mixed_term), cross_term, cross_term);
434
- sin_angular_distance = vsqrtq_f32(sin_angular_dist_sq);
459
+ // sin²(angular_distance_f32x4) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
460
+ float32x4_t cross_term_f32x4 = vmulq_f32(cos_reduced_second_f32x4, sin_lambda_f32x4);
461
+ float32x4_t mixed_term_f32x4 = vsubq_f32(
462
+ vmulq_f32(cos_reduced_first_f32x4, sin_reduced_second_f32x4),
463
+ vmulq_f32(vmulq_f32(sin_reduced_first_f32x4, cos_reduced_second_f32x4), cos_lambda_f32x4));
464
+ float32x4_t sin_angular_dist_sq_f32x4 = vfmaq_f32(vmulq_f32(mixed_term_f32x4, mixed_term_f32x4),
465
+ cross_term_f32x4, cross_term_f32x4);
466
+ sin_angular_distance_f32x4 = vsqrtq_f32(sin_angular_dist_sq_f32x4);
435
467
 
436
- // Check for coincident points (sin_angular_distance ≈ 0)
437
- coincident_mask = vcltq_f32(sin_angular_distance, epsilon);
468
+ // Check for coincident points (sin_angular_distance_f32x4 ≈ 0)
469
+ coincident_mask_u32x4 = vcltq_f32(sin_angular_distance_f32x4, epsilon_f32x4);
438
470
 
439
- // cos(angular_distance) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
440
- cos_angular_distance = vfmaq_f32(vmulq_f32(sin_reduced_first, sin_reduced_second),
441
- vmulq_f32(cos_reduced_first, cos_reduced_second), cos_lambda);
471
+ // cos(angular_distance_f32x4) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
472
+ cos_angular_distance_f32x4 = vfmaq_f32(vmulq_f32(sin_reduced_first_f32x4, sin_reduced_second_f32x4),
473
+ vmulq_f32(cos_reduced_first_f32x4, cos_reduced_second_f32x4),
474
+ cos_lambda_f32x4);
442
475
 
443
- // angular_distance = atan2(sin, cos)
444
- angular_distance = nk_atan2_f32x4_neon_(sin_angular_distance, cos_angular_distance);
476
+ // angular_distance_f32x4 = atan2(sin, cos)
477
+ angular_distance_f32x4 = nk_atan2_f32x4_neon_(sin_angular_distance_f32x4, cos_angular_distance_f32x4);
445
478
 
446
- // sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance)
447
- float32x4_t safe_sin_angular = vbslq_f32(coincident_mask, one, sin_angular_distance);
448
- sin_azimuth = vdivq_f32(vmulq_f32(vmulq_f32(cos_reduced_first, cos_reduced_second), sin_lambda),
449
- safe_sin_angular);
450
- cos_squared_azimuth = vsubq_f32(one, vmulq_f32(sin_azimuth, sin_azimuth));
479
+ // sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance_f32x4)
480
+ float32x4_t safe_sin_angular_f32x4 = vbslq_f32(coincident_mask_u32x4, one_f32x4, sin_angular_distance_f32x4);
481
+ sin_azimuth_f32x4 = vdivq_f32(
482
+ vmulq_f32(vmulq_f32(cos_reduced_first_f32x4, cos_reduced_second_f32x4), sin_lambda_f32x4),
483
+ safe_sin_angular_f32x4);
484
+ cos_squared_azimuth_f32x4 = vsubq_f32(one_f32x4, vmulq_f32(sin_azimuth_f32x4, sin_azimuth_f32x4));
451
485
 
452
486
  // Handle equatorial case: cos²α ≈ 0
453
- uint32x4_t equatorial_mask = vcltq_f32(cos_squared_azimuth, epsilon);
454
- float32x4_t safe_cos_sq_azimuth = vbslq_f32(equatorial_mask, one, cos_squared_azimuth);
487
+ uint32x4_t equatorial_mask_u32x4 = vcltq_f32(cos_squared_azimuth_f32x4, epsilon_f32x4);
488
+ float32x4_t safe_cos_sq_azimuth_f32x4 = vbslq_f32(equatorial_mask_u32x4, one_f32x4, cos_squared_azimuth_f32x4);
455
489
 
456
490
  // cos(2σₘ) = cos(σ) - 2 × sin(U₁) × sin(U₂) / cos²(α)
457
- float32x4_t sin_product = vmulq_f32(sin_reduced_first, sin_reduced_second);
458
- cos_double_angular_midpoint = vsubq_f32(cos_angular_distance,
459
- vdivq_f32(vmulq_f32(two, sin_product), safe_cos_sq_azimuth));
460
- cos_double_angular_midpoint = vbslq_f32(equatorial_mask, vdupq_n_f32(0.0f), cos_double_angular_midpoint);
491
+ float32x4_t sin_product_f32x4 = vmulq_f32(sin_reduced_first_f32x4, sin_reduced_second_f32x4);
492
+ cos_double_angular_midpoint_f32x4 = vsubq_f32(
493
+ cos_angular_distance_f32x4, vdivq_f32(vmulq_f32(two_f32x4, sin_product_f32x4), safe_cos_sq_azimuth_f32x4));
494
+ cos_double_angular_midpoint_f32x4 = vbslq_f32(equatorial_mask_u32x4, vdupq_n_f32(0.0f),
495
+ cos_double_angular_midpoint_f32x4);
461
496
 
462
497
  // C = f/16 * cos²α * (4 + f*(4 - 3*cos²α))
463
- float32x4_t correction_factor = vmulq_f32(
464
- vdivq_f32(flattening, sixteen),
465
- vmulq_f32(cos_squared_azimuth, vfmaq_f32(four, flattening, vfmsq_f32(four, three, cos_squared_azimuth))));
498
+ float32x4_t correction_factor_f32x4 = vmulq_f32(
499
+ vdivq_f32(flattening_f32x4, sixteen_f32x4),
500
+ vmulq_f32(cos_squared_azimuth_f32x4,
501
+ vfmaq_f32(four_f32x4, flattening_f32x4,
502
+ vfmsq_f32(four_f32x4, three_f32x4, cos_squared_azimuth_f32x4))));
466
503
 
467
504
  // λ' = L + (1-C) × f × sin(α) × (σ + C × sin(σ) × (cos(2σₘ) + C × cos(σ) × (-1 + 2 × cos²(2σₘ))))
468
- float32x4_t cos_2sm_sq = vmulq_f32(cos_double_angular_midpoint, cos_double_angular_midpoint);
469
- float32x4_t innermost = vfmaq_f32(vdupq_n_f32(-1.0f), two, cos_2sm_sq);
470
- float32x4_t middle = vfmaq_f32(cos_double_angular_midpoint, vmulq_f32(correction_factor, cos_angular_distance),
471
- innermost);
472
- float32x4_t inner = vmulq_f32(vmulq_f32(correction_factor, sin_angular_distance), middle);
473
-
474
- float32x4_t lambda_new = vfmaq_f32(
475
- longitude_difference, vmulq_f32(vmulq_f32(vsubq_f32(one, correction_factor), flattening), sin_azimuth),
476
- vaddq_f32(angular_distance, inner));
505
+ float32x4_t cos_2sm_sq_f32x4 = vmulq_f32(cos_double_angular_midpoint_f32x4, cos_double_angular_midpoint_f32x4);
506
+ float32x4_t innermost_f32x4 = vfmaq_f32(vdupq_n_f32(-1.0f), two_f32x4, cos_2sm_sq_f32x4);
507
+ float32x4_t middle_f32x4 = vfmaq_f32(cos_double_angular_midpoint_f32x4,
508
+ vmulq_f32(correction_factor_f32x4, cos_angular_distance_f32x4),
509
+ innermost_f32x4);
510
+ float32x4_t inner_f32x4 = vmulq_f32(vmulq_f32(correction_factor_f32x4, sin_angular_distance_f32x4),
511
+ middle_f32x4);
512
+
513
+ float32x4_t lambda_new_f32x4 = vfmaq_f32(
514
+ longitude_difference_f32x4,
515
+ vmulq_f32(vmulq_f32(vsubq_f32(one_f32x4, correction_factor_f32x4), flattening_f32x4), sin_azimuth_f32x4),
516
+ vaddq_f32(angular_distance_f32x4, inner_f32x4));
477
517
 
478
518
  // Check convergence: |λ - λ'| < threshold
479
- float32x4_t lambda_diff = vsubq_f32(lambda_new, lambda);
480
- float32x4_t lambda_diff_abs = vabsq_f32(lambda_diff);
481
- uint32x4_t newly_converged = vcltq_f32(lambda_diff_abs, convergence_threshold);
482
- converged_mask = vorrq_u32(converged_mask, newly_converged);
519
+ float32x4_t lambda_diff_f32x4 = vsubq_f32(lambda_new_f32x4, lambda_f32x4);
520
+ float32x4_t lambda_diff_abs_f32x4 = vabsq_f32(lambda_diff_f32x4);
521
+ uint32x4_t newly_converged_u32x4 = vcltq_f32(lambda_diff_abs_f32x4, convergence_threshold_f32x4);
522
+ converged_mask_u32x4 = vorrq_u32(converged_mask_u32x4, newly_converged_u32x4);
483
523
 
484
- // Only update lambda for non-converged lanes
485
- lambda = vbslq_f32(converged_mask, lambda, lambda_new);
524
+ // Only update lambda_f32x4 for non-converged lanes
525
+ lambda_f32x4 = vbslq_f32(converged_mask_u32x4, lambda_f32x4, lambda_new_f32x4);
486
526
  }
487
527
 
488
528
  // Final distance calculation
489
- float32x4_t a_sq = vmulq_f32(equatorial_radius, equatorial_radius);
490
- float32x4_t b_sq = vmulq_f32(polar_radius, polar_radius);
491
- float32x4_t u_squared = vdivq_f32(vmulq_f32(cos_squared_azimuth, vsubq_f32(a_sq, b_sq)), b_sq);
529
+ float32x4_t a_sq_f32x4 = vmulq_f32(equatorial_radius_f32x4, equatorial_radius_f32x4);
530
+ float32x4_t b_sq_f32x4 = vmulq_f32(polar_radius_f32x4, polar_radius_f32x4);
531
+ float32x4_t u_squared_f32x4 = vdivq_f32(vmulq_f32(cos_squared_azimuth_f32x4, vsubq_f32(a_sq_f32x4, b_sq_f32x4)),
532
+ b_sq_f32x4);
492
533
 
493
534
  // A = 1 + u²/16384 * (4096 + u²*(-768 + u²*(320 - 175*u²)))
494
- float32x4_t series_a = vfmaq_f32(vdupq_n_f32(320.0f), u_squared, vdupq_n_f32(-175.0f));
495
- series_a = vfmaq_f32(vdupq_n_f32(-768.0f), u_squared, series_a);
496
- series_a = vfmaq_f32(vdupq_n_f32(4096.0f), u_squared, series_a);
497
- series_a = vfmaq_f32(one, vdivq_f32(u_squared, vdupq_n_f32(16384.0f)), series_a);
535
+ float32x4_t series_a_f32x4 = vfmaq_f32(vdupq_n_f32(320.0f), u_squared_f32x4, vdupq_n_f32(-175.0f));
536
+ series_a_f32x4 = vfmaq_f32(vdupq_n_f32(-768.0f), u_squared_f32x4, series_a_f32x4);
537
+ series_a_f32x4 = vfmaq_f32(vdupq_n_f32(4096.0f), u_squared_f32x4, series_a_f32x4);
538
+ series_a_f32x4 = vfmaq_f32(one_f32x4, vdivq_f32(u_squared_f32x4, vdupq_n_f32(16384.0f)), series_a_f32x4);
498
539
 
499
540
  // B = u²/1024 * (256 + u²*(-128 + u²*(74 - 47*u²)))
500
- float32x4_t series_b = vfmaq_f32(vdupq_n_f32(74.0f), u_squared, vdupq_n_f32(-47.0f));
501
- series_b = vfmaq_f32(vdupq_n_f32(-128.0f), u_squared, series_b);
502
- series_b = vfmaq_f32(vdupq_n_f32(256.0f), u_squared, series_b);
503
- series_b = vmulq_f32(vdivq_f32(u_squared, vdupq_n_f32(1024.0f)), series_b);
541
+ float32x4_t series_b_f32x4 = vfmaq_f32(vdupq_n_f32(74.0f), u_squared_f32x4, vdupq_n_f32(-47.0f));
542
+ series_b_f32x4 = vfmaq_f32(vdupq_n_f32(-128.0f), u_squared_f32x4, series_b_f32x4);
543
+ series_b_f32x4 = vfmaq_f32(vdupq_n_f32(256.0f), u_squared_f32x4, series_b_f32x4);
544
+ series_b_f32x4 = vmulq_f32(vdivq_f32(u_squared_f32x4, vdupq_n_f32(1024.0f)), series_b_f32x4);
504
545
 
505
546
  // Δσ calculation
506
- float32x4_t cos_2sm_sq = vmulq_f32(cos_double_angular_midpoint, cos_double_angular_midpoint);
507
- float32x4_t sin_sq = vmulq_f32(sin_angular_distance, sin_angular_distance);
508
- float32x4_t term1 = vfmaq_f32(vdupq_n_f32(-1.0f), two, cos_2sm_sq);
509
- term1 = vmulq_f32(cos_angular_distance, term1);
510
- float32x4_t term2 = vfmaq_f32(vdupq_n_f32(-3.0f), four, sin_sq);
511
- float32x4_t term3 = vfmaq_f32(vdupq_n_f32(-3.0f), four, cos_2sm_sq);
512
- term2 = vmulq_f32(vmulq_f32(vdivq_f32(series_b, six), cos_double_angular_midpoint), vmulq_f32(term2, term3));
513
- float32x4_t delta_sigma = vmulq_f32(
514
- series_b,
515
- vmulq_f32(sin_angular_distance, vaddq_f32(cos_double_angular_midpoint,
516
- vmulq_f32(vdivq_f32(series_b, four), vsubq_f32(term1, term2)))));
547
+ float32x4_t cos_2sm_sq_f32x4 = vmulq_f32(cos_double_angular_midpoint_f32x4, cos_double_angular_midpoint_f32x4);
548
+ float32x4_t sin_sq_f32x4 = vmulq_f32(sin_angular_distance_f32x4, sin_angular_distance_f32x4);
549
+ float32x4_t term1_f32x4 = vfmaq_f32(vdupq_n_f32(-1.0f), two_f32x4, cos_2sm_sq_f32x4);
550
+ term1_f32x4 = vmulq_f32(cos_angular_distance_f32x4, term1_f32x4);
551
+ float32x4_t term2_f32x4 = vfmaq_f32(vdupq_n_f32(-3.0f), four_f32x4, sin_sq_f32x4);
552
+ float32x4_t term3_f32x4 = vfmaq_f32(vdupq_n_f32(-3.0f), four_f32x4, cos_2sm_sq_f32x4);
553
+ term2_f32x4 = vmulq_f32(vmulq_f32(vdivq_f32(series_b_f32x4, six_f32x4), cos_double_angular_midpoint_f32x4),
554
+ vmulq_f32(term2_f32x4, term3_f32x4));
555
+ float32x4_t delta_sigma_f32x4 = vmulq_f32(
556
+ series_b_f32x4,
557
+ vmulq_f32(sin_angular_distance_f32x4,
558
+ vaddq_f32(cos_double_angular_midpoint_f32x4,
559
+ vmulq_f32(vdivq_f32(series_b_f32x4, four_f32x4), vsubq_f32(term1_f32x4, term2_f32x4)))));
517
560
 
518
561
  // s = b * A * (σ - Δσ)
519
- float32x4_t distances = vmulq_f32(vmulq_f32(polar_radius, series_a), vsubq_f32(angular_distance, delta_sigma));
562
+ float32x4_t distances_f32x4 = vmulq_f32(vmulq_f32(polar_radius_f32x4, series_a_f32x4),
563
+ vsubq_f32(angular_distance_f32x4, delta_sigma_f32x4));
520
564
 
521
565
  // Set coincident points to zero
522
- distances = vbslq_f32(coincident_mask, vdupq_n_f32(0.0f), distances);
566
+ distances_f32x4 = vbslq_f32(coincident_mask_u32x4, vdupq_n_f32(0.0f), distances_f32x4);
523
567
 
524
- return distances;
568
+ return distances_f32x4;
525
569
  }
526
570
 
527
571
  NK_PUBLIC void nk_vincenty_f32_neon( //
@@ -530,14 +574,14 @@ NK_PUBLIC void nk_vincenty_f32_neon( //
530
574
  nk_size_t n, nk_f32_t *results) {
531
575
 
532
576
  while (n >= 4) {
533
- float32x4_t first_latitudes = vld1q_f32(a_lats);
534
- float32x4_t first_longitudes = vld1q_f32(a_lons);
535
- float32x4_t second_latitudes = vld1q_f32(b_lats);
536
- float32x4_t second_longitudes = vld1q_f32(b_lons);
577
+ float32x4_t first_latitudes_f32x4 = vld1q_f32(a_lats);
578
+ float32x4_t first_longitudes_f32x4 = vld1q_f32(a_lons);
579
+ float32x4_t second_latitudes_f32x4 = vld1q_f32(b_lats);
580
+ float32x4_t second_longitudes_f32x4 = vld1q_f32(b_lons);
537
581
 
538
- float32x4_t distances = nk_vincenty_f32x4_neon_(first_latitudes, first_longitudes, second_latitudes,
539
- second_longitudes);
540
- vst1q_f32(results, distances);
582
+ float32x4_t distances_f32x4 = nk_vincenty_f32x4_neon_(first_latitudes_f32x4, first_longitudes_f32x4,
583
+ second_latitudes_f32x4, second_longitudes_f32x4);
584
+ vst1q_f32(results, distances_f32x4);
541
585
 
542
586
  a_lats += 4, a_lons += 4, b_lats += 4, b_lons += 4, results += 4, n -= 4;
543
587
  }
@@ -549,9 +593,9 @@ NK_PUBLIC void nk_vincenty_f32_neon( //
549
593
  nk_partial_load_b32x4_serial_(a_lons, &a_lon_vec, n);
550
594
  nk_partial_load_b32x4_serial_(b_lats, &b_lat_vec, n);
551
595
  nk_partial_load_b32x4_serial_(b_lons, &b_lon_vec, n);
552
- float32x4_t distances = nk_vincenty_f32x4_neon_(a_lat_vec.f32x4, a_lon_vec.f32x4, b_lat_vec.f32x4,
553
- b_lon_vec.f32x4);
554
- result_vec.f32x4 = distances;
596
+ float32x4_t distances_f32x4 = nk_vincenty_f32x4_neon_(a_lat_vec.f32x4, a_lon_vec.f32x4, b_lat_vec.f32x4,
597
+ b_lon_vec.f32x4);
598
+ result_vec.f32x4 = distances_f32x4;
555
599
  nk_partial_store_b32x4_serial_(&result_vec, results, n);
556
600
  }
557
601
  }