numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,701 @@
1
+ /**
2
+ * @brief SIMD-accelerated Geospatial Distances for RISC-V.
3
+ * @file include/numkong/geospatial/rvv.h
4
+ * @author Ash Vardanian
5
+ * @date February 6, 2026
6
+ *
7
+ * @sa include/numkong/geospatial.h
8
+ *
9
+ * Implements Haversine and Vincenty geodesic distance computations using RVV 1.0 intrinsics
10
+ * with LMUL=4 (m4) grouping for maximum throughput. The variable-length vector loop uses
11
+ * `__riscv_vsetvl_e64m4` / `__riscv_vsetvl_e32m4` so each iteration processes as many
12
+ * point-pairs as the hardware vector length allows, with no scalar tail handling needed.
13
+ *
14
+ * Trigonometric helpers (sin, cos, atan2) come from trigonometry/rvv.h which provides
15
+ * polynomial approximations operating on `vfloat64m4_t` / `vfloat32m4_t` vectors.
16
+ *
17
+ * Vincenty convergence tracking uses RVV mask registers (`vbool16_t` / `vbool8_t`) with
18
+ * `__riscv_vcpop_m` to check if all lanes have converged, and `__riscv_vmerge` for
19
+ * per-lane conditional updates.
20
+ *
21
+ * @section rvv_geospatial_instructions Key RVV Geospatial Instructions
22
+ *
23
+ * Intrinsic Purpose
24
+ * __riscv_vfsqrt_v_f64m4(x, vl) Square root (f64, LMUL=4)
25
+ * __riscv_vfsqrt_v_f32m4(x, vl) Square root (f32, LMUL=4)
26
+ * __riscv_vfdiv_vv_f64m4(a, b, vl) Division (f64, LMUL=4)
27
+ * __riscv_vfdiv_vv_f32m4(a, b, vl) Division (f32, LMUL=4)
28
+ * __riscv_vfmadd_vv_f64m4(a, b, c, vl) Fused multiply-add: a*b+c (f64)
29
+ * __riscv_vfmadd_vv_f32m4(a, b, c, vl) Fused multiply-add: a*b+c (f32)
30
+ * __riscv_vcpop_m_b16(mask, vl) Count set bits in mask (convergence check)
31
+ * __riscv_vmerge_vvm_f64m4(a, b, m, vl) Conditional merge (per-lane select)
32
+ */
33
+ #ifndef NK_GEOSPATIAL_RVV_H
34
+ #define NK_GEOSPATIAL_RVV_H
35
+
36
+ #if NK_TARGET_RISCV_
37
+ #if NK_TARGET_RVV
38
+
39
+ #include "numkong/types.h"
40
+ #include "numkong/trigonometry/rvv.h" // nk_f64m4_sin_rvv_, nk_f64m4_cos_rvv_, nk_f64m4_atan2_rvv_, etc.
41
+
42
+ #if defined(__clang__)
43
+ #pragma clang attribute push(__attribute__((target("arch=+v"))), apply_to = function)
44
+ #elif defined(__GNUC__)
45
+ #pragma GCC push_options
46
+ #pragma GCC target("arch=+v")
47
+ #endif
48
+
49
+ #if defined(__cplusplus)
50
+ extern "C" {
51
+ #endif
52
+
53
+ /* RVV implementations using LMUL=4 vectors for f64 and f32 geospatial distances.
54
+ * These require RVV trigonometric kernels from trigonometry/rvv.h.
55
+ */
56
+
57
+ #pragma region - Haversine Distance
58
+
59
+ /**
60
+ * @brief RVV internal kernel for Haversine distance on vector_length f64 point pairs.
61
+ *
62
+ * Haversine formula:
63
+ * dlat = lat2 - lat1
64
+ * dlon = lon2 - lon1
65
+ * a = sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2)
66
+ * c = 2 * atan2(sqrt(a), sqrt(1 - a))
67
+ * distance = R * c
68
+ *
69
+ * where R = NK_EARTH_MEDIATORIAL_RADIUS.
70
+ */
71
+ NK_INTERNAL void nk_haversine_f64_rvv_kernel_( //
72
+ nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
73
+ nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
74
+ nk_size_t vector_length, nk_f64_t *results) {
75
+
76
+ vfloat64m4_t lat1 = __riscv_vle64_v_f64m4(a_lats, vector_length);
77
+ vfloat64m4_t lon1 = __riscv_vle64_v_f64m4(a_lons, vector_length);
78
+ vfloat64m4_t lat2 = __riscv_vle64_v_f64m4(b_lats, vector_length);
79
+ vfloat64m4_t lon2 = __riscv_vle64_v_f64m4(b_lons, vector_length);
80
+
81
+ vfloat64m4_t dlat = __riscv_vfsub_vv_f64m4(lat2, lat1, vector_length);
82
+ vfloat64m4_t dlon = __riscv_vfsub_vv_f64m4(lon2, lon1, vector_length);
83
+
84
+ // sin(dlat/2) and sin(dlon/2)
85
+ vfloat64m4_t half_dlat = __riscv_vfmul_vf_f64m4(dlat, 0.5, vector_length);
86
+ vfloat64m4_t half_dlon = __riscv_vfmul_vf_f64m4(dlon, 0.5, vector_length);
87
+ vfloat64m4_t sin_half_dlat = nk_f64m4_sin_rvv_(half_dlat, vector_length);
88
+ vfloat64m4_t sin_half_dlon = nk_f64m4_sin_rvv_(half_dlon, vector_length);
89
+
90
+ // sin^2(dlat/2) and sin^2(dlon/2)
91
+ vfloat64m4_t sin_sq_half_dlat = __riscv_vfmul_vv_f64m4(sin_half_dlat, sin_half_dlat, vector_length);
92
+ vfloat64m4_t sin_sq_half_dlon = __riscv_vfmul_vv_f64m4(sin_half_dlon, sin_half_dlon, vector_length);
93
+
94
+ // cos(lat1) * cos(lat2)
95
+ vfloat64m4_t cos_lat1 = nk_f64m4_cos_rvv_(lat1, vector_length);
96
+ vfloat64m4_t cos_lat2 = nk_f64m4_cos_rvv_(lat2, vector_length);
97
+ vfloat64m4_t cos_product = __riscv_vfmul_vv_f64m4(cos_lat1, cos_lat2, vector_length);
98
+
99
+ // a = sin^2(dlat/2) + cos(lat1)*cos(lat2)*sin^2(dlon/2)
100
+ vfloat64m4_t haversine_term = __riscv_vfmadd_vv_f64m4(cos_product, sin_sq_half_dlon, sin_sq_half_dlat,
101
+ vector_length);
102
+
103
+ // Clamp haversine_term to [0, 1] to prevent NaN from sqrt of negative values
104
+ vfloat64m4_t zero = __riscv_vfmv_v_f_f64m4(0.0, vector_length);
105
+ vfloat64m4_t one = __riscv_vfmv_v_f_f64m4(1.0, vector_length);
106
+ haversine_term = __riscv_vfmax_vv_f64m4(zero, haversine_term, vector_length);
107
+ haversine_term = __riscv_vfmin_vv_f64m4(one, haversine_term, vector_length);
108
+
109
+ // Central angle: c = 2 * atan2(sqrt(a), sqrt(1-a))
110
+ vfloat64m4_t sqrt_haversine = __riscv_vfsqrt_v_f64m4(haversine_term, vector_length);
111
+ vfloat64m4_t complement = __riscv_vfsub_vv_f64m4(one, haversine_term, vector_length);
112
+ vfloat64m4_t sqrt_complement = __riscv_vfsqrt_v_f64m4(complement, vector_length);
113
+ vfloat64m4_t central_angle = nk_f64m4_atan2_rvv_(sqrt_haversine, sqrt_complement, vector_length);
114
+ central_angle = __riscv_vfmul_vf_f64m4(central_angle, 2.0, vector_length);
115
+
116
+ // distance = R * c
117
+ vfloat64m4_t distances = __riscv_vfmul_vf_f64m4(central_angle, NK_EARTH_MEDIATORIAL_RADIUS, vector_length);
118
+ __riscv_vse64_v_f64m4(results, distances, vector_length);
119
+ }
120
+
121
+ NK_PUBLIC void nk_haversine_f64_rvv( //
122
+ nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
123
+ nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
124
+ nk_size_t n, nk_f64_t *results) {
125
+
126
+ for (nk_size_t vector_length; n > 0; n -= vector_length, a_lats += vector_length, a_lons += vector_length,
127
+ b_lats += vector_length, b_lons += vector_length, results += vector_length) {
128
+ vector_length = __riscv_vsetvl_e64m4(n);
129
+ nk_haversine_f64_rvv_kernel_(a_lats, a_lons, b_lats, b_lons, vector_length, results);
130
+ }
131
+ }
132
+
133
+ /**
134
+ * @brief RVV internal kernel for Haversine distance on vector_length f32 point pairs.
135
+ */
136
+ NK_INTERNAL void nk_haversine_f32_rvv_kernel_( //
137
+ nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
138
+ nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
139
+ nk_size_t vector_length, nk_f32_t *results) {
140
+
141
+ vfloat32m4_t lat1 = __riscv_vle32_v_f32m4(a_lats, vector_length);
142
+ vfloat32m4_t lon1 = __riscv_vle32_v_f32m4(a_lons, vector_length);
143
+ vfloat32m4_t lat2 = __riscv_vle32_v_f32m4(b_lats, vector_length);
144
+ vfloat32m4_t lon2 = __riscv_vle32_v_f32m4(b_lons, vector_length);
145
+
146
+ vfloat32m4_t dlat = __riscv_vfsub_vv_f32m4(lat2, lat1, vector_length);
147
+ vfloat32m4_t dlon = __riscv_vfsub_vv_f32m4(lon2, lon1, vector_length);
148
+
149
+ // sin(dlat/2) and sin(dlon/2)
150
+ vfloat32m4_t half_dlat = __riscv_vfmul_vf_f32m4(dlat, 0.5f, vector_length);
151
+ vfloat32m4_t half_dlon = __riscv_vfmul_vf_f32m4(dlon, 0.5f, vector_length);
152
+ vfloat32m4_t sin_half_dlat = nk_f32m4_sin_rvv_(half_dlat, vector_length);
153
+ vfloat32m4_t sin_half_dlon = nk_f32m4_sin_rvv_(half_dlon, vector_length);
154
+
155
+ // sin^2(dlat/2) and sin^2(dlon/2)
156
+ vfloat32m4_t sin_sq_half_dlat = __riscv_vfmul_vv_f32m4(sin_half_dlat, sin_half_dlat, vector_length);
157
+ vfloat32m4_t sin_sq_half_dlon = __riscv_vfmul_vv_f32m4(sin_half_dlon, sin_half_dlon, vector_length);
158
+
159
+ // cos(lat1) * cos(lat2)
160
+ vfloat32m4_t cos_lat1 = nk_f32m4_cos_rvv_(lat1, vector_length);
161
+ vfloat32m4_t cos_lat2 = nk_f32m4_cos_rvv_(lat2, vector_length);
162
+ vfloat32m4_t cos_product = __riscv_vfmul_vv_f32m4(cos_lat1, cos_lat2, vector_length);
163
+
164
+ // a = sin^2(dlat/2) + cos(lat1)*cos(lat2)*sin^2(dlon/2)
165
+ vfloat32m4_t haversine_term = __riscv_vfmadd_vv_f32m4(cos_product, sin_sq_half_dlon, sin_sq_half_dlat,
166
+ vector_length);
167
+
168
+ // Clamp haversine_term to [0, 1] to prevent NaN from sqrt of negative values
169
+ vfloat32m4_t zero = __riscv_vfmv_v_f_f32m4(0.0f, vector_length);
170
+ vfloat32m4_t one = __riscv_vfmv_v_f_f32m4(1.0f, vector_length);
171
+ haversine_term = __riscv_vfmax_vv_f32m4(zero, haversine_term, vector_length);
172
+ haversine_term = __riscv_vfmin_vv_f32m4(one, haversine_term, vector_length);
173
+
174
+ // Central angle: c = 2 * atan2(sqrt(a), sqrt(1-a))
175
+ vfloat32m4_t sqrt_haversine = __riscv_vfsqrt_v_f32m4(haversine_term, vector_length);
176
+ vfloat32m4_t complement = __riscv_vfsub_vv_f32m4(one, haversine_term, vector_length);
177
+ vfloat32m4_t sqrt_complement = __riscv_vfsqrt_v_f32m4(complement, vector_length);
178
+ vfloat32m4_t central_angle = nk_f32m4_atan2_rvv_(sqrt_haversine, sqrt_complement, vector_length);
179
+ central_angle = __riscv_vfmul_vf_f32m4(central_angle, 2.0f, vector_length);
180
+
181
+ // distance = R * c
182
+ vfloat32m4_t distances = __riscv_vfmul_vf_f32m4(central_angle, (nk_f32_t)NK_EARTH_MEDIATORIAL_RADIUS,
183
+ vector_length);
184
+ __riscv_vse32_v_f32m4(results, distances, vector_length);
185
+ }
186
+
187
+ NK_PUBLIC void nk_haversine_f32_rvv( //
188
+ nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
189
+ nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
190
+ nk_size_t n, nk_f32_t *results) {
191
+
192
+ for (nk_size_t vector_length; n > 0; n -= vector_length, a_lats += vector_length, a_lons += vector_length,
193
+ b_lats += vector_length, b_lons += vector_length, results += vector_length) {
194
+ vector_length = __riscv_vsetvl_e32m4(n);
195
+ nk_haversine_f32_rvv_kernel_(a_lats, a_lons, b_lats, b_lons, vector_length, results);
196
+ }
197
+ }
198
+
199
+ #pragma endregion - Haversine Distance
200
+
201
+ #pragma region - Vincenty Distance
202
+
203
+ /**
204
+ * @brief RVV internal kernel for Vincenty's geodesic distance on vector_length f64 point pairs.
205
+ * @note This is a true SIMD implementation using masked convergence tracking via vmerge.
206
+ *
207
+ * Vincenty's formulae iterate to solve the geodesic on an oblate spheroid (WGS-84 ellipsoid).
208
+ * Each SIMD lane tracks its own convergence state via mask registers. The loop terminates
209
+ * when all lanes have converged (vcpop == vector_length) or after NK_VINCENTY_MAX_ITERATIONS.
210
+ */
211
+ NK_INTERNAL void nk_vincenty_f64_rvv_kernel_( //
212
+ nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
213
+ nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
214
+ nk_size_t vector_length, nk_f64_t *results) {
215
+
216
+ vfloat64m4_t lat1 = __riscv_vle64_v_f64m4(a_lats, vector_length);
217
+ vfloat64m4_t lon1 = __riscv_vle64_v_f64m4(a_lons, vector_length);
218
+ vfloat64m4_t lat2 = __riscv_vle64_v_f64m4(b_lats, vector_length);
219
+ vfloat64m4_t lon2 = __riscv_vle64_v_f64m4(b_lons, vector_length);
220
+
221
+ vfloat64m4_t const v_equatorial_radius = __riscv_vfmv_v_f_f64m4(NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS,
222
+ vector_length);
223
+ vfloat64m4_t const v_polar_radius = __riscv_vfmv_v_f_f64m4(NK_EARTH_ELLIPSOID_POLAR_RADIUS, vector_length);
224
+ nk_f64_t const flattening_scalar = 1.0 / NK_EARTH_ELLIPSOID_INVERSE_FLATTENING;
225
+ vfloat64m4_t const v_flattening = __riscv_vfmv_v_f_f64m4(flattening_scalar, vector_length);
226
+ vfloat64m4_t const v_convergence = __riscv_vfmv_v_f_f64m4(NK_VINCENTY_CONVERGENCE_THRESHOLD_F64, vector_length);
227
+ vfloat64m4_t const v_one = __riscv_vfmv_v_f_f64m4(1.0, vector_length);
228
+ vfloat64m4_t const v_two = __riscv_vfmv_v_f_f64m4(2.0, vector_length);
229
+ vfloat64m4_t const v_three = __riscv_vfmv_v_f_f64m4(3.0, vector_length);
230
+ vfloat64m4_t const v_four = __riscv_vfmv_v_f_f64m4(4.0, vector_length);
231
+ vfloat64m4_t const v_six = __riscv_vfmv_v_f_f64m4(6.0, vector_length);
232
+ vfloat64m4_t const v_sixteen = __riscv_vfmv_v_f_f64m4(16.0, vector_length);
233
+ vfloat64m4_t const v_epsilon = __riscv_vfmv_v_f_f64m4(1e-15, vector_length);
234
+ vfloat64m4_t const v_zero = __riscv_vfmv_v_f_f64m4(0.0, vector_length);
235
+ vfloat64m4_t const v_neg_one = __riscv_vfmv_v_f_f64m4(-1.0, vector_length);
236
+
237
+ // Longitude difference
238
+ vfloat64m4_t longitude_difference = __riscv_vfsub_vv_f64m4(lon2, lon1, vector_length);
239
+
240
+ // Reduced latitudes: tan(U) = (1-f) * tan(lat)
241
+ vfloat64m4_t one_minus_f = __riscv_vfsub_vv_f64m4(v_one, v_flattening, vector_length);
242
+ vfloat64m4_t sin_lat1 = nk_f64m4_sin_rvv_(lat1, vector_length);
243
+ vfloat64m4_t cos_lat1 = nk_f64m4_cos_rvv_(lat1, vector_length);
244
+ vfloat64m4_t sin_lat2 = nk_f64m4_sin_rvv_(lat2, vector_length);
245
+ vfloat64m4_t cos_lat2 = nk_f64m4_cos_rvv_(lat2, vector_length);
246
+ vfloat64m4_t tan_first = __riscv_vfdiv_vv_f64m4(sin_lat1, cos_lat1, vector_length);
247
+ vfloat64m4_t tan_second = __riscv_vfdiv_vv_f64m4(sin_lat2, cos_lat2, vector_length);
248
+ vfloat64m4_t tan_reduced_first = __riscv_vfmul_vv_f64m4(one_minus_f, tan_first, vector_length);
249
+ vfloat64m4_t tan_reduced_second = __riscv_vfmul_vv_f64m4(one_minus_f, tan_second, vector_length);
250
+
251
+ // cos(U) = 1/sqrt(1 + tan^2(U)), sin(U) = tan(U) * cos(U)
252
+ vfloat64m4_t tan_sq_first = __riscv_vfmadd_vv_f64m4(tan_reduced_first, tan_reduced_first, v_one, vector_length);
253
+ vfloat64m4_t cos_reduced_first = __riscv_vfdiv_vv_f64m4(v_one, __riscv_vfsqrt_v_f64m4(tan_sq_first, vector_length),
254
+ vector_length);
255
+ vfloat64m4_t sin_reduced_first = __riscv_vfmul_vv_f64m4(tan_reduced_first, cos_reduced_first, vector_length);
256
+
257
+ vfloat64m4_t tan_sq_second = __riscv_vfmadd_vv_f64m4(tan_reduced_second, tan_reduced_second, v_one, vector_length);
258
+ vfloat64m4_t cos_reduced_second = __riscv_vfdiv_vv_f64m4(
259
+ v_one, __riscv_vfsqrt_v_f64m4(tan_sq_second, vector_length), vector_length);
260
+ vfloat64m4_t sin_reduced_second = __riscv_vfmul_vv_f64m4(tan_reduced_second, cos_reduced_second, vector_length);
261
+
262
+ // Initialize lambda and tracking variables
263
+ vfloat64m4_t lambda = longitude_difference;
264
+ vfloat64m4_t sin_angular_distance = v_zero;
265
+ vfloat64m4_t cos_angular_distance = v_zero;
266
+ vfloat64m4_t angular_distance = v_zero;
267
+ vfloat64m4_t sin_azimuth = v_zero;
268
+ vfloat64m4_t cos_squared_azimuth = v_zero;
269
+ vfloat64m4_t cos_double_angular_midpoint = v_zero;
270
+
271
+ // Track convergence and coincident points using masks
272
+ // vbool16_t is the mask type for LMUL=4 with 64-bit elements (64/4 = 16)
273
+ vbool16_t converged_mask_b16 = __riscv_vmfeq_vv_f64m4_b16(v_zero, v_one, vector_length); // all false
274
+ vbool16_t coincident_mask_b16 = converged_mask_b16;
275
+
276
+ for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS; ++iteration) {
277
+ // Check if all lanes converged
278
+ if (__riscv_vcpop_m_b16(converged_mask_b16, vector_length) == vector_length) break;
279
+
280
+ vfloat64m4_t sin_lambda = nk_f64m4_sin_rvv_(lambda, vector_length);
281
+ vfloat64m4_t cos_lambda = nk_f64m4_cos_rvv_(lambda, vector_length);
282
+
283
+ // sin^2(angular_distance) = (cos(U2)*sin(l))^2 + (cos(U1)*sin(U2) - sin(U1)*cos(U2)*cos(l))^2
284
+ vfloat64m4_t cross_term = __riscv_vfmul_vv_f64m4(cos_reduced_second, sin_lambda, vector_length);
285
+ vfloat64m4_t sin1_cos2_cosl = __riscv_vfmul_vv_f64m4(sin_reduced_first, cos_reduced_second, vector_length);
286
+ sin1_cos2_cosl = __riscv_vfmul_vv_f64m4(sin1_cos2_cosl, cos_lambda, vector_length);
287
+ vfloat64m4_t mixed_term = __riscv_vfmul_vv_f64m4(cos_reduced_first, sin_reduced_second, vector_length);
288
+ mixed_term = __riscv_vfsub_vv_f64m4(mixed_term, sin1_cos2_cosl, vector_length);
289
+
290
+ vfloat64m4_t sin_angular_dist_sq = __riscv_vfmul_vv_f64m4(cross_term, cross_term, vector_length);
291
+ sin_angular_dist_sq = __riscv_vfmadd_vv_f64m4(mixed_term, mixed_term, sin_angular_dist_sq, vector_length);
292
+ sin_angular_distance = __riscv_vfsqrt_v_f64m4(sin_angular_dist_sq, vector_length);
293
+
294
+ // Check for coincident points (sin_angular_distance < epsilon)
295
+ coincident_mask_b16 = __riscv_vmflt_vv_f64m4_b16(sin_angular_distance, v_epsilon, vector_length);
296
+
297
+ // cos(angular_distance) = sin(U1)*sin(U2) + cos(U1)*cos(U2)*cos(l)
298
+ vfloat64m4_t cos1_cos2 = __riscv_vfmul_vv_f64m4(cos_reduced_first, cos_reduced_second, vector_length);
299
+ cos_angular_distance = __riscv_vfmul_vv_f64m4(sin_reduced_first, sin_reduced_second, vector_length);
300
+ cos_angular_distance = __riscv_vfmadd_vv_f64m4(cos1_cos2, cos_lambda, cos_angular_distance, vector_length);
301
+
302
+ // angular_distance = atan2(sin, cos)
303
+ angular_distance = nk_f64m4_atan2_rvv_(sin_angular_distance, cos_angular_distance, vector_length);
304
+
305
+ // sin(azimuth) = cos(U1)*cos(U2)*sin(l) / sin(angular_distance)
306
+ // Avoid division by zero by substituting 1.0 for coincident lanes
307
+ vfloat64m4_t safe_sin_angular = __riscv_vfmerge_vfm_f64m4(sin_angular_distance, 1.0, coincident_mask_b16,
308
+ vector_length);
309
+ vfloat64m4_t numerator = __riscv_vfmul_vv_f64m4(cos1_cos2, sin_lambda, vector_length);
310
+ sin_azimuth = __riscv_vfdiv_vv_f64m4(numerator, safe_sin_angular, vector_length);
311
+ cos_squared_azimuth = __riscv_vfnmsub_vv_f64m4(sin_azimuth, sin_azimuth, v_one, vector_length);
312
+
313
+ // Handle equatorial case: cos^2(a) < epsilon
314
+ vbool16_t equatorial_mask_b16 = __riscv_vmflt_vv_f64m4_b16(cos_squared_azimuth, v_epsilon, vector_length);
315
+ vfloat64m4_t safe_cos_sq_azimuth = __riscv_vfmerge_vfm_f64m4(cos_squared_azimuth, 1.0, equatorial_mask_b16,
316
+ vector_length);
317
+
318
+ // cos(2sm) = cos(s) - 2*sin(U1)*sin(U2) / cos^2(a)
319
+ vfloat64m4_t sin_product = __riscv_vfmul_vv_f64m4(sin_reduced_first, sin_reduced_second, vector_length);
320
+ vfloat64m4_t two_sin_product = __riscv_vfmul_vv_f64m4(v_two, sin_product, vector_length);
321
+ cos_double_angular_midpoint = __riscv_vfdiv_vv_f64m4(two_sin_product, safe_cos_sq_azimuth, vector_length);
322
+ cos_double_angular_midpoint = __riscv_vfsub_vv_f64m4(cos_angular_distance, cos_double_angular_midpoint,
323
+ vector_length);
324
+ // Set to zero for equatorial case
325
+ cos_double_angular_midpoint = __riscv_vfmerge_vfm_f64m4(cos_double_angular_midpoint, 0.0, equatorial_mask_b16,
326
+ vector_length);
327
+
328
+ // C = f/16 * cos^2(a) * (4 + f*(4 - 3*cos^2(a)))
329
+ // inner = 4 - 3*cos^2(a)
330
+ vfloat64m4_t inner_c = __riscv_vfnmsub_vv_f64m4(v_three, cos_squared_azimuth, v_four, vector_length);
331
+ // 4 + f * inner_c
332
+ vfloat64m4_t outer_c = __riscv_vfmadd_vv_f64m4(v_flattening, inner_c, v_four, vector_length);
333
+ // f/16 * cos^2(a) * outer_c
334
+ vfloat64m4_t correction_factor = __riscv_vfdiv_vv_f64m4(v_flattening, v_sixteen, vector_length);
335
+ correction_factor = __riscv_vfmul_vv_f64m4(correction_factor, cos_squared_azimuth, vector_length);
336
+ correction_factor = __riscv_vfmul_vv_f64m4(correction_factor, outer_c, vector_length);
337
+
338
+ // lambda' = L + (1-C)*f*sin(a)*(s + C*sin(s)*(cos(2sm) + C*cos(s)*(-1 + 2*cos^2(2sm))))
339
+ vfloat64m4_t cos_2sm_sq = __riscv_vfmul_vv_f64m4(cos_double_angular_midpoint, cos_double_angular_midpoint,
340
+ vector_length);
341
+ // innermost = -1 + 2*cos^2(2sm)
342
+ vfloat64m4_t innermost = __riscv_vfmadd_vv_f64m4(v_two, cos_2sm_sq, v_neg_one, vector_length);
343
+ // middle = cos(2sm) + C*cos(s)*innermost
344
+ vfloat64m4_t c_cos_s = __riscv_vfmul_vv_f64m4(correction_factor, cos_angular_distance, vector_length);
345
+ vfloat64m4_t middle = __riscv_vfmadd_vv_f64m4(c_cos_s, innermost, cos_double_angular_midpoint, vector_length);
346
+ // inner = C*sin(s)*middle
347
+ vfloat64m4_t c_sin_s = __riscv_vfmul_vv_f64m4(correction_factor, sin_angular_distance, vector_length);
348
+ vfloat64m4_t inner_val = __riscv_vfmul_vv_f64m4(c_sin_s, middle, vector_length);
349
+
350
+ // (1-C)*f*sin_a*(s + inner)
351
+ vfloat64m4_t one_minus_c = __riscv_vfsub_vv_f64m4(v_one, correction_factor, vector_length);
352
+ vfloat64m4_t f_sin_a = __riscv_vfmul_vv_f64m4(v_flattening, sin_azimuth, vector_length);
353
+ vfloat64m4_t s_plus_inner = __riscv_vfadd_vv_f64m4(angular_distance, inner_val, vector_length);
354
+ vfloat64m4_t adjustment = __riscv_vfmul_vv_f64m4(one_minus_c, f_sin_a, vector_length);
355
+ adjustment = __riscv_vfmul_vv_f64m4(adjustment, s_plus_inner, vector_length);
356
+ vfloat64m4_t lambda_new = __riscv_vfadd_vv_f64m4(longitude_difference, adjustment, vector_length);
357
+
358
+ // Check convergence: |lambda - lambda'| < threshold
359
+ vfloat64m4_t lambda_diff = __riscv_vfsub_vv_f64m4(lambda_new, lambda, vector_length);
360
+ // Absolute value via sign-bit clearing
361
+ vfloat64m4_t lambda_diff_abs = __riscv_vfsgnjx_vv_f64m4(lambda_diff, lambda_diff, vector_length);
362
+ vbool16_t newly_converged_b16 = __riscv_vmflt_vv_f64m4_b16(lambda_diff_abs, v_convergence, vector_length);
363
+ converged_mask_b16 = __riscv_vmor_mm_b16(converged_mask_b16, newly_converged_b16, vector_length);
364
+
365
+ // Only update lambda for non-converged lanes
366
+ lambda = __riscv_vmerge_vvm_f64m4(lambda_new, lambda, converged_mask_b16, vector_length);
367
+ }
368
+
369
+ // Final distance calculation
370
+ // u^2 = cos^2(a) * (a^2 - b^2) / b^2
371
+ vfloat64m4_t a_sq = __riscv_vfmul_vv_f64m4(v_equatorial_radius, v_equatorial_radius, vector_length);
372
+ vfloat64m4_t b_sq = __riscv_vfmul_vv_f64m4(v_polar_radius, v_polar_radius, vector_length);
373
+ vfloat64m4_t a_sq_minus_b_sq = __riscv_vfsub_vv_f64m4(a_sq, b_sq, vector_length);
374
+ vfloat64m4_t u_squared = __riscv_vfmul_vv_f64m4(cos_squared_azimuth, a_sq_minus_b_sq, vector_length);
375
+ u_squared = __riscv_vfdiv_vv_f64m4(u_squared, b_sq, vector_length);
376
+
377
+ // A = 1 + u^2/16384 * (4096 + u^2*(-768 + u^2*(320 - 175*u^2)))
378
+ vfloat64m4_t series_a = __riscv_vfmul_vf_f64m4(u_squared, -175.0, vector_length);
379
+ series_a = __riscv_vfadd_vf_f64m4(series_a, 320.0, vector_length);
380
+ series_a = __riscv_vfmadd_vv_f64m4(u_squared, series_a, __riscv_vfmv_v_f_f64m4(-768.0, vector_length),
381
+ vector_length);
382
+ series_a = __riscv_vfmadd_vv_f64m4(u_squared, series_a, __riscv_vfmv_v_f_f64m4(4096.0, vector_length),
383
+ vector_length);
384
+ vfloat64m4_t u_sq_over_16384 = __riscv_vfmul_vf_f64m4(u_squared, 1.0 / 16384.0, vector_length);
385
+ series_a = __riscv_vfmadd_vv_f64m4(u_sq_over_16384, series_a, v_one, vector_length);
386
+
387
+ // B = u^2/1024 * (256 + u^2*(-128 + u^2*(74 - 47*u^2)))
388
+ vfloat64m4_t series_b = __riscv_vfmul_vf_f64m4(u_squared, -47.0, vector_length);
389
+ series_b = __riscv_vfadd_vf_f64m4(series_b, 74.0, vector_length);
390
+ series_b = __riscv_vfmadd_vv_f64m4(u_squared, series_b, __riscv_vfmv_v_f_f64m4(-128.0, vector_length),
391
+ vector_length);
392
+ series_b = __riscv_vfmadd_vv_f64m4(u_squared, series_b, __riscv_vfmv_v_f_f64m4(256.0, vector_length),
393
+ vector_length);
394
+ vfloat64m4_t u_sq_over_1024 = __riscv_vfmul_vf_f64m4(u_squared, 1.0 / 1024.0, vector_length);
395
+ series_b = __riscv_vfmul_vv_f64m4(u_sq_over_1024, series_b, vector_length);
396
+
397
+ // Delta-sigma = B*sin(s)*(cos(2sm) + B/4*(cos(s)*(-1+2*cos^2(2sm)) -
398
+ // B/6*cos(2sm)*(-3+4*sin^2(s))*(-3+4*cos^2(2sm))))
399
+ vfloat64m4_t cos_2sm_sq = __riscv_vfmul_vv_f64m4(cos_double_angular_midpoint, cos_double_angular_midpoint,
400
+ vector_length);
401
+ vfloat64m4_t sin_sq = __riscv_vfmul_vv_f64m4(sin_angular_distance, sin_angular_distance, vector_length);
402
+
403
+ // term1 = cos(s) * (-1 + 2*cos^2(2sm))
404
+ vfloat64m4_t term1 = __riscv_vfmadd_vv_f64m4(v_two, cos_2sm_sq, v_neg_one, vector_length);
405
+ term1 = __riscv_vfmul_vv_f64m4(cos_angular_distance, term1, vector_length);
406
+
407
+ // term2 = B/6 * cos(2sm) * (-3 + 4*sin^2(s)) * (-3 + 4*cos^2(2sm))
408
+ vfloat64m4_t neg_three = __riscv_vfmv_v_f_f64m4(-3.0, vector_length);
409
+ vfloat64m4_t factor_sin = __riscv_vfmadd_vv_f64m4(v_four, sin_sq, neg_three, vector_length);
410
+ vfloat64m4_t factor_cos = __riscv_vfmadd_vv_f64m4(v_four, cos_2sm_sq, neg_three, vector_length);
411
+ vfloat64m4_t b_over_6 = __riscv_vfdiv_vv_f64m4(series_b, v_six, vector_length);
412
+ vfloat64m4_t term2 = __riscv_vfmul_vv_f64m4(b_over_6, cos_double_angular_midpoint, vector_length);
413
+ term2 = __riscv_vfmul_vv_f64m4(term2, factor_sin, vector_length);
414
+ term2 = __riscv_vfmul_vv_f64m4(term2, factor_cos, vector_length);
415
+
416
+ // B/4 * (term1 - term2)
417
+ vfloat64m4_t b_over_4 = __riscv_vfdiv_vv_f64m4(series_b, v_four, vector_length);
418
+ vfloat64m4_t term1_minus_term2 = __riscv_vfsub_vv_f64m4(term1, term2, vector_length);
419
+ vfloat64m4_t b4_bracket = __riscv_vfmul_vv_f64m4(b_over_4, term1_minus_term2, vector_length);
420
+
421
+ // cos(2sm) + B/4*(...)
422
+ vfloat64m4_t bracket = __riscv_vfadd_vv_f64m4(cos_double_angular_midpoint, b4_bracket, vector_length);
423
+
424
+ // delta_sigma = B * sin(s) * bracket
425
+ vfloat64m4_t delta_sigma = __riscv_vfmul_vv_f64m4(series_b, sin_angular_distance, vector_length);
426
+ delta_sigma = __riscv_vfmul_vv_f64m4(delta_sigma, bracket, vector_length);
427
+
428
+ // s = b * A * (sigma - delta_sigma)
429
+ vfloat64m4_t sigma_minus_ds = __riscv_vfsub_vv_f64m4(angular_distance, delta_sigma, vector_length);
430
+ vfloat64m4_t distances = __riscv_vfmul_vv_f64m4(v_polar_radius, series_a, vector_length);
431
+ distances = __riscv_vfmul_vv_f64m4(distances, sigma_minus_ds, vector_length);
432
+
433
+ // Set coincident points to zero
434
+ distances = __riscv_vfmerge_vfm_f64m4(distances, 0.0, coincident_mask_b16, vector_length);
435
+
436
+ __riscv_vse64_v_f64m4(results, distances, vector_length);
437
+ }
438
+
439
+ NK_PUBLIC void nk_vincenty_f64_rvv( //
440
+ nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
441
+ nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
442
+ nk_size_t n, nk_f64_t *results) {
443
+
444
+ for (nk_size_t vector_length; n > 0; n -= vector_length, a_lats += vector_length, a_lons += vector_length,
445
+ b_lats += vector_length, b_lons += vector_length, results += vector_length) {
446
+ vector_length = __riscv_vsetvl_e64m4(n);
447
+ nk_vincenty_f64_rvv_kernel_(a_lats, a_lons, b_lats, b_lons, vector_length, results);
448
+ }
449
+ }
450
+
451
+ /**
452
+ * @brief RVV internal kernel for Vincenty's geodesic distance on vector_length f32 point pairs.
453
+ * @note This is a true SIMD implementation using masked convergence tracking via vmerge.
454
+ */
455
+ NK_INTERNAL void nk_vincenty_f32_rvv_kernel_( //
456
+ nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
457
+ nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
458
+ nk_size_t vector_length, nk_f32_t *results) {
459
+
460
+ vfloat32m4_t lat1 = __riscv_vle32_v_f32m4(a_lats, vector_length);
461
+ vfloat32m4_t lon1 = __riscv_vle32_v_f32m4(a_lons, vector_length);
462
+ vfloat32m4_t lat2 = __riscv_vle32_v_f32m4(b_lats, vector_length);
463
+ vfloat32m4_t lon2 = __riscv_vle32_v_f32m4(b_lons, vector_length);
464
+
465
+ vfloat32m4_t const v_equatorial_radius = __riscv_vfmv_v_f_f32m4((nk_f32_t)NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS,
466
+ vector_length);
467
+ vfloat32m4_t const v_polar_radius = __riscv_vfmv_v_f_f32m4((nk_f32_t)NK_EARTH_ELLIPSOID_POLAR_RADIUS,
468
+ vector_length);
469
+ nk_f32_t const flattening_scalar = 1.0f / (nk_f32_t)NK_EARTH_ELLIPSOID_INVERSE_FLATTENING;
470
+ vfloat32m4_t const v_flattening = __riscv_vfmv_v_f_f32m4(flattening_scalar, vector_length);
471
+ vfloat32m4_t const v_convergence = __riscv_vfmv_v_f_f32m4(NK_VINCENTY_CONVERGENCE_THRESHOLD_F32, vector_length);
472
+ vfloat32m4_t const v_one = __riscv_vfmv_v_f_f32m4(1.0f, vector_length);
473
+ vfloat32m4_t const v_two = __riscv_vfmv_v_f_f32m4(2.0f, vector_length);
474
+ vfloat32m4_t const v_three = __riscv_vfmv_v_f_f32m4(3.0f, vector_length);
475
+ vfloat32m4_t const v_four = __riscv_vfmv_v_f_f32m4(4.0f, vector_length);
476
+ vfloat32m4_t const v_six = __riscv_vfmv_v_f_f32m4(6.0f, vector_length);
477
+ vfloat32m4_t const v_sixteen = __riscv_vfmv_v_f_f32m4(16.0f, vector_length);
478
+ vfloat32m4_t const v_epsilon = __riscv_vfmv_v_f_f32m4(1e-7f, vector_length);
479
+ vfloat32m4_t const v_zero = __riscv_vfmv_v_f_f32m4(0.0f, vector_length);
480
+ vfloat32m4_t const v_neg_one = __riscv_vfmv_v_f_f32m4(-1.0f, vector_length);
481
+
482
+ // Longitude difference
483
+ vfloat32m4_t longitude_difference = __riscv_vfsub_vv_f32m4(lon2, lon1, vector_length);
484
+
485
+ // Reduced latitudes: tan(U) = (1-f) * tan(lat)
486
+ vfloat32m4_t one_minus_f = __riscv_vfsub_vv_f32m4(v_one, v_flattening, vector_length);
487
+ vfloat32m4_t sin_lat1 = nk_f32m4_sin_rvv_(lat1, vector_length);
488
+ vfloat32m4_t cos_lat1 = nk_f32m4_cos_rvv_(lat1, vector_length);
489
+ vfloat32m4_t sin_lat2 = nk_f32m4_sin_rvv_(lat2, vector_length);
490
+ vfloat32m4_t cos_lat2 = nk_f32m4_cos_rvv_(lat2, vector_length);
491
+ vfloat32m4_t tan_first = __riscv_vfdiv_vv_f32m4(sin_lat1, cos_lat1, vector_length);
492
+ vfloat32m4_t tan_second = __riscv_vfdiv_vv_f32m4(sin_lat2, cos_lat2, vector_length);
493
+ vfloat32m4_t tan_reduced_first = __riscv_vfmul_vv_f32m4(one_minus_f, tan_first, vector_length);
494
+ vfloat32m4_t tan_reduced_second = __riscv_vfmul_vv_f32m4(one_minus_f, tan_second, vector_length);
495
+
496
+ // cos(U) = 1/sqrt(1 + tan^2(U)), sin(U) = tan(U) * cos(U)
497
+ vfloat32m4_t tan_sq_first = __riscv_vfmadd_vv_f32m4(tan_reduced_first, tan_reduced_first, v_one, vector_length);
498
+ vfloat32m4_t cos_reduced_first = __riscv_vfdiv_vv_f32m4(v_one, __riscv_vfsqrt_v_f32m4(tan_sq_first, vector_length),
499
+ vector_length);
500
+ vfloat32m4_t sin_reduced_first = __riscv_vfmul_vv_f32m4(tan_reduced_first, cos_reduced_first, vector_length);
501
+
502
+ vfloat32m4_t tan_sq_second = __riscv_vfmadd_vv_f32m4(tan_reduced_second, tan_reduced_second, v_one, vector_length);
503
+ vfloat32m4_t cos_reduced_second = __riscv_vfdiv_vv_f32m4(
504
+ v_one, __riscv_vfsqrt_v_f32m4(tan_sq_second, vector_length), vector_length);
505
+ vfloat32m4_t sin_reduced_second = __riscv_vfmul_vv_f32m4(tan_reduced_second, cos_reduced_second, vector_length);
506
+
507
+ // Initialize lambda and tracking variables
508
+ vfloat32m4_t lambda = longitude_difference;
509
+ vfloat32m4_t sin_angular_distance = v_zero;
510
+ vfloat32m4_t cos_angular_distance = v_zero;
511
+ vfloat32m4_t angular_distance = v_zero;
512
+ vfloat32m4_t sin_azimuth = v_zero;
513
+ vfloat32m4_t cos_squared_azimuth = v_zero;
514
+ vfloat32m4_t cos_double_angular_midpoint = v_zero;
515
+
516
+ // Track convergence and coincident points using masks
517
+ // vbool8_t is the mask type for LMUL=4 with 32-bit elements (32/4 = 8)
518
+ vbool8_t converged_mask_b8 = __riscv_vmfeq_vv_f32m4_b8(v_zero, v_one, vector_length); // all false
519
+ vbool8_t coincident_mask_b8 = converged_mask_b8;
520
+
521
+ for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS; ++iteration) {
522
+ // Check if all lanes converged
523
+ if (__riscv_vcpop_m_b8(converged_mask_b8, vector_length) == vector_length) break;
524
+
525
+ vfloat32m4_t sin_lambda = nk_f32m4_sin_rvv_(lambda, vector_length);
526
+ vfloat32m4_t cos_lambda = nk_f32m4_cos_rvv_(lambda, vector_length);
527
+
528
+ // sin^2(angular_distance) = (cos(U2)*sin(l))^2 + (cos(U1)*sin(U2) - sin(U1)*cos(U2)*cos(l))^2
529
+ vfloat32m4_t cross_term = __riscv_vfmul_vv_f32m4(cos_reduced_second, sin_lambda, vector_length);
530
+ vfloat32m4_t sin1_cos2_cosl = __riscv_vfmul_vv_f32m4(sin_reduced_first, cos_reduced_second, vector_length);
531
+ sin1_cos2_cosl = __riscv_vfmul_vv_f32m4(sin1_cos2_cosl, cos_lambda, vector_length);
532
+ vfloat32m4_t mixed_term = __riscv_vfmul_vv_f32m4(cos_reduced_first, sin_reduced_second, vector_length);
533
+ mixed_term = __riscv_vfsub_vv_f32m4(mixed_term, sin1_cos2_cosl, vector_length);
534
+
535
+ vfloat32m4_t sin_angular_dist_sq = __riscv_vfmul_vv_f32m4(cross_term, cross_term, vector_length);
536
+ sin_angular_dist_sq = __riscv_vfmadd_vv_f32m4(mixed_term, mixed_term, sin_angular_dist_sq, vector_length);
537
+ sin_angular_distance = __riscv_vfsqrt_v_f32m4(sin_angular_dist_sq, vector_length);
538
+
539
+ // Check for coincident points (sin_angular_distance < epsilon)
540
+ coincident_mask_b8 = __riscv_vmflt_vv_f32m4_b8(sin_angular_distance, v_epsilon, vector_length);
541
+
542
+ // cos(angular_distance) = sin(U1)*sin(U2) + cos(U1)*cos(U2)*cos(l)
543
+ vfloat32m4_t cos1_cos2 = __riscv_vfmul_vv_f32m4(cos_reduced_first, cos_reduced_second, vector_length);
544
+ cos_angular_distance = __riscv_vfmul_vv_f32m4(sin_reduced_first, sin_reduced_second, vector_length);
545
+ cos_angular_distance = __riscv_vfmadd_vv_f32m4(cos1_cos2, cos_lambda, cos_angular_distance, vector_length);
546
+
547
+ // angular_distance = atan2(sin, cos)
548
+ angular_distance = nk_f32m4_atan2_rvv_(sin_angular_distance, cos_angular_distance, vector_length);
549
+
550
+ // sin(azimuth) = cos(U1)*cos(U2)*sin(l) / sin(angular_distance)
551
+ // Avoid division by zero by substituting 1.0 for coincident lanes
552
+ vfloat32m4_t safe_sin_angular = __riscv_vfmerge_vfm_f32m4(sin_angular_distance, 1.0f, coincident_mask_b8,
553
+ vector_length);
554
+ vfloat32m4_t numerator = __riscv_vfmul_vv_f32m4(cos1_cos2, sin_lambda, vector_length);
555
+ sin_azimuth = __riscv_vfdiv_vv_f32m4(numerator, safe_sin_angular, vector_length);
556
+ cos_squared_azimuth = __riscv_vfnmsub_vv_f32m4(sin_azimuth, sin_azimuth, v_one, vector_length);
557
+
558
+ // Handle equatorial case: cos^2(a) < epsilon
559
+ vbool8_t equatorial_mask_b8 = __riscv_vmflt_vv_f32m4_b8(cos_squared_azimuth, v_epsilon, vector_length);
560
+ vfloat32m4_t safe_cos_sq_azimuth = __riscv_vfmerge_vfm_f32m4(cos_squared_azimuth, 1.0f, equatorial_mask_b8,
561
+ vector_length);
562
+
563
+ // cos(2sm) = cos(s) - 2*sin(U1)*sin(U2) / cos^2(a)
564
+ vfloat32m4_t sin_product = __riscv_vfmul_vv_f32m4(sin_reduced_first, sin_reduced_second, vector_length);
565
+ vfloat32m4_t two_sin_product = __riscv_vfmul_vv_f32m4(v_two, sin_product, vector_length);
566
+ cos_double_angular_midpoint = __riscv_vfdiv_vv_f32m4(two_sin_product, safe_cos_sq_azimuth, vector_length);
567
+ cos_double_angular_midpoint = __riscv_vfsub_vv_f32m4(cos_angular_distance, cos_double_angular_midpoint,
568
+ vector_length);
569
+ // Set to zero for equatorial case
570
+ cos_double_angular_midpoint = __riscv_vfmerge_vfm_f32m4(cos_double_angular_midpoint, 0.0f, equatorial_mask_b8,
571
+ vector_length);
572
+
573
+ // C = f/16 * cos^2(a) * (4 + f*(4 - 3*cos^2(a)))
574
+ vfloat32m4_t inner_c = __riscv_vfnmsub_vv_f32m4(v_three, cos_squared_azimuth, v_four, vector_length);
575
+ vfloat32m4_t outer_c = __riscv_vfmadd_vv_f32m4(v_flattening, inner_c, v_four, vector_length);
576
+ vfloat32m4_t correction_factor = __riscv_vfdiv_vv_f32m4(v_flattening, v_sixteen, vector_length);
577
+ correction_factor = __riscv_vfmul_vv_f32m4(correction_factor, cos_squared_azimuth, vector_length);
578
+ correction_factor = __riscv_vfmul_vv_f32m4(correction_factor, outer_c, vector_length);
579
+
580
+ // lambda' = L + (1-C)*f*sin(a)*(s + C*sin(s)*(cos(2sm) + C*cos(s)*(-1 + 2*cos^2(2sm))))
581
+ vfloat32m4_t cos_2sm_sq = __riscv_vfmul_vv_f32m4(cos_double_angular_midpoint, cos_double_angular_midpoint,
582
+ vector_length);
583
+ vfloat32m4_t innermost = __riscv_vfmadd_vv_f32m4(v_two, cos_2sm_sq, v_neg_one, vector_length);
584
+ vfloat32m4_t c_cos_s = __riscv_vfmul_vv_f32m4(correction_factor, cos_angular_distance, vector_length);
585
+ vfloat32m4_t middle = __riscv_vfmadd_vv_f32m4(c_cos_s, innermost, cos_double_angular_midpoint, vector_length);
586
+ vfloat32m4_t c_sin_s = __riscv_vfmul_vv_f32m4(correction_factor, sin_angular_distance, vector_length);
587
+ vfloat32m4_t inner_val = __riscv_vfmul_vv_f32m4(c_sin_s, middle, vector_length);
588
+
589
+ vfloat32m4_t one_minus_c = __riscv_vfsub_vv_f32m4(v_one, correction_factor, vector_length);
590
+ vfloat32m4_t f_sin_a = __riscv_vfmul_vv_f32m4(v_flattening, sin_azimuth, vector_length);
591
+ vfloat32m4_t s_plus_inner = __riscv_vfadd_vv_f32m4(angular_distance, inner_val, vector_length);
592
+ vfloat32m4_t adjustment = __riscv_vfmul_vv_f32m4(one_minus_c, f_sin_a, vector_length);
593
+ adjustment = __riscv_vfmul_vv_f32m4(adjustment, s_plus_inner, vector_length);
594
+ vfloat32m4_t lambda_new = __riscv_vfadd_vv_f32m4(longitude_difference, adjustment, vector_length);
595
+
596
+ // Check convergence: |lambda - lambda'| < threshold
597
+ vfloat32m4_t lambda_diff = __riscv_vfsub_vv_f32m4(lambda_new, lambda, vector_length);
598
+ vfloat32m4_t lambda_diff_abs = __riscv_vfsgnjx_vv_f32m4(lambda_diff, lambda_diff, vector_length);
599
+ vbool8_t newly_converged_b8 = __riscv_vmflt_vv_f32m4_b8(lambda_diff_abs, v_convergence, vector_length);
600
+ converged_mask_b8 = __riscv_vmor_mm_b8(converged_mask_b8, newly_converged_b8, vector_length);
601
+
602
+ // Only update lambda for non-converged lanes
603
+ lambda = __riscv_vmerge_vvm_f32m4(lambda_new, lambda, converged_mask_b8, vector_length);
604
+ }
605
+
606
+ // Final distance calculation
607
+ // u^2 = cos^2(a) * (a^2 - b^2) / b^2
608
+ vfloat32m4_t a_sq = __riscv_vfmul_vv_f32m4(v_equatorial_radius, v_equatorial_radius, vector_length);
609
+ vfloat32m4_t b_sq = __riscv_vfmul_vv_f32m4(v_polar_radius, v_polar_radius, vector_length);
610
+ vfloat32m4_t a_sq_minus_b_sq = __riscv_vfsub_vv_f32m4(a_sq, b_sq, vector_length);
611
+ vfloat32m4_t u_squared = __riscv_vfmul_vv_f32m4(cos_squared_azimuth, a_sq_minus_b_sq, vector_length);
612
+ u_squared = __riscv_vfdiv_vv_f32m4(u_squared, b_sq, vector_length);
613
+
614
+ // A = 1 + u^2/16384 * (4096 + u^2*(-768 + u^2*(320 - 175*u^2)))
615
+ vfloat32m4_t series_a = __riscv_vfmul_vf_f32m4(u_squared, -175.0f, vector_length);
616
+ series_a = __riscv_vfadd_vf_f32m4(series_a, 320.0f, vector_length);
617
+ series_a = __riscv_vfmadd_vv_f32m4(u_squared, series_a, __riscv_vfmv_v_f_f32m4(-768.0f, vector_length),
618
+ vector_length);
619
+ series_a = __riscv_vfmadd_vv_f32m4(u_squared, series_a, __riscv_vfmv_v_f_f32m4(4096.0f, vector_length),
620
+ vector_length);
621
+ vfloat32m4_t u_sq_over_16384 = __riscv_vfmul_vf_f32m4(u_squared, 1.0f / 16384.0f, vector_length);
622
+ series_a = __riscv_vfmadd_vv_f32m4(u_sq_over_16384, series_a, v_one, vector_length);
623
+
624
+ // B = u^2/1024 * (256 + u^2*(-128 + u^2*(74 - 47*u^2)))
625
+ vfloat32m4_t series_b = __riscv_vfmul_vf_f32m4(u_squared, -47.0f, vector_length);
626
+ series_b = __riscv_vfadd_vf_f32m4(series_b, 74.0f, vector_length);
627
+ series_b = __riscv_vfmadd_vv_f32m4(u_squared, series_b, __riscv_vfmv_v_f_f32m4(-128.0f, vector_length),
628
+ vector_length);
629
+ series_b = __riscv_vfmadd_vv_f32m4(u_squared, series_b, __riscv_vfmv_v_f_f32m4(256.0f, vector_length),
630
+ vector_length);
631
+ vfloat32m4_t u_sq_over_1024 = __riscv_vfmul_vf_f32m4(u_squared, 1.0f / 1024.0f, vector_length);
632
+ series_b = __riscv_vfmul_vv_f32m4(u_sq_over_1024, series_b, vector_length);
633
+
634
+ // Delta-sigma calculation
635
+ vfloat32m4_t cos_2sm_sq = __riscv_vfmul_vv_f32m4(cos_double_angular_midpoint, cos_double_angular_midpoint,
636
+ vector_length);
637
+ vfloat32m4_t sin_sq = __riscv_vfmul_vv_f32m4(sin_angular_distance, sin_angular_distance, vector_length);
638
+
639
+ // term1 = cos(s) * (-1 + 2*cos^2(2sm))
640
+ vfloat32m4_t term1 = __riscv_vfmadd_vv_f32m4(v_two, cos_2sm_sq, v_neg_one, vector_length);
641
+ term1 = __riscv_vfmul_vv_f32m4(cos_angular_distance, term1, vector_length);
642
+
643
+ // term2 = B/6 * cos(2sm) * (-3 + 4*sin^2(s)) * (-3 + 4*cos^2(2sm))
644
+ vfloat32m4_t neg_three = __riscv_vfmv_v_f_f32m4(-3.0f, vector_length);
645
+ vfloat32m4_t factor_sin = __riscv_vfmadd_vv_f32m4(v_four, sin_sq, neg_three, vector_length);
646
+ vfloat32m4_t factor_cos = __riscv_vfmadd_vv_f32m4(v_four, cos_2sm_sq, neg_three, vector_length);
647
+ vfloat32m4_t b_over_6 = __riscv_vfdiv_vv_f32m4(series_b, v_six, vector_length);
648
+ vfloat32m4_t term2 = __riscv_vfmul_vv_f32m4(b_over_6, cos_double_angular_midpoint, vector_length);
649
+ term2 = __riscv_vfmul_vv_f32m4(term2, factor_sin, vector_length);
650
+ term2 = __riscv_vfmul_vv_f32m4(term2, factor_cos, vector_length);
651
+
652
+ // B/4 * (term1 - term2)
653
+ vfloat32m4_t b_over_4 = __riscv_vfdiv_vv_f32m4(series_b, v_four, vector_length);
654
+ vfloat32m4_t term1_minus_term2 = __riscv_vfsub_vv_f32m4(term1, term2, vector_length);
655
+ vfloat32m4_t b4_bracket = __riscv_vfmul_vv_f32m4(b_over_4, term1_minus_term2, vector_length);
656
+
657
+ // cos(2sm) + B/4*(...)
658
+ vfloat32m4_t bracket = __riscv_vfadd_vv_f32m4(cos_double_angular_midpoint, b4_bracket, vector_length);
659
+
660
+ // delta_sigma = B * sin(s) * bracket
661
+ vfloat32m4_t delta_sigma = __riscv_vfmul_vv_f32m4(series_b, sin_angular_distance, vector_length);
662
+ delta_sigma = __riscv_vfmul_vv_f32m4(delta_sigma, bracket, vector_length);
663
+
664
+ // s = b * A * (sigma - delta_sigma)
665
+ vfloat32m4_t sigma_minus_ds = __riscv_vfsub_vv_f32m4(angular_distance, delta_sigma, vector_length);
666
+ vfloat32m4_t distances = __riscv_vfmul_vv_f32m4(v_polar_radius, series_a, vector_length);
667
+ distances = __riscv_vfmul_vv_f32m4(distances, sigma_minus_ds, vector_length);
668
+
669
+ // Set coincident points to zero
670
+ distances = __riscv_vfmerge_vfm_f32m4(distances, 0.0f, coincident_mask_b8, vector_length);
671
+
672
+ __riscv_vse32_v_f32m4(results, distances, vector_length);
673
+ }
674
+
675
+ NK_PUBLIC void nk_vincenty_f32_rvv( //
676
+ nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
677
+ nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
678
+ nk_size_t n, nk_f32_t *results) {
679
+
680
+ for (nk_size_t vector_length; n > 0; n -= vector_length, a_lats += vector_length, a_lons += vector_length,
681
+ b_lats += vector_length, b_lons += vector_length, results += vector_length) {
682
+ vector_length = __riscv_vsetvl_e32m4(n);
683
+ nk_vincenty_f32_rvv_kernel_(a_lats, a_lons, b_lats, b_lons, vector_length, results);
684
+ }
685
+ }
686
+
687
+ #pragma endregion - Vincenty Distance
688
+
689
+ #if defined(__cplusplus)
690
+ } // extern "C"
691
+ #endif
692
+
693
+ #if defined(__clang__)
694
+ #pragma clang attribute pop
695
+ #elif defined(__GNUC__)
696
+ #pragma GCC pop_options
697
+ #endif
698
+
699
+ #endif // NK_TARGET_RVV
700
+ #endif // NK_TARGET_RISCV_
701
+ #endif // NK_GEOSPATIAL_RVV_H