numkong 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +495 -0
- package/binding.gyp +540 -0
- package/c/dispatch.h +512 -0
- package/c/dispatch_bf16.c +389 -0
- package/c/dispatch_bf16c.c +52 -0
- package/c/dispatch_e2m3.c +263 -0
- package/c/dispatch_e3m2.c +243 -0
- package/c/dispatch_e4m3.c +276 -0
- package/c/dispatch_e5m2.c +272 -0
- package/c/dispatch_f16.c +376 -0
- package/c/dispatch_f16c.c +58 -0
- package/c/dispatch_f32.c +378 -0
- package/c/dispatch_f32c.c +99 -0
- package/c/dispatch_f64.c +296 -0
- package/c/dispatch_f64c.c +98 -0
- package/c/dispatch_i16.c +96 -0
- package/c/dispatch_i32.c +89 -0
- package/c/dispatch_i4.c +150 -0
- package/c/dispatch_i64.c +86 -0
- package/c/dispatch_i8.c +289 -0
- package/c/dispatch_other.c +330 -0
- package/c/dispatch_u1.c +148 -0
- package/c/dispatch_u16.c +124 -0
- package/c/dispatch_u32.c +118 -0
- package/c/dispatch_u4.c +150 -0
- package/c/dispatch_u64.c +102 -0
- package/c/dispatch_u8.c +303 -0
- package/c/numkong.c +950 -0
- package/include/README.md +573 -0
- package/include/module.modulemap +129 -0
- package/include/numkong/attention/sapphireamx.h +1361 -0
- package/include/numkong/attention/sme.h +2066 -0
- package/include/numkong/attention.h +49 -0
- package/include/numkong/capabilities.h +748 -0
- package/include/numkong/cast/README.md +262 -0
- package/include/numkong/cast/haswell.h +975 -0
- package/include/numkong/cast/icelake.h +470 -0
- package/include/numkong/cast/neon.h +1192 -0
- package/include/numkong/cast/rvv.h +1021 -0
- package/include/numkong/cast/sapphire.h +262 -0
- package/include/numkong/cast/serial.h +2262 -0
- package/include/numkong/cast/skylake.h +856 -0
- package/include/numkong/cast/v128relaxed.h +180 -0
- package/include/numkong/cast.h +230 -0
- package/include/numkong/curved/README.md +223 -0
- package/include/numkong/curved/genoa.h +182 -0
- package/include/numkong/curved/haswell.h +276 -0
- package/include/numkong/curved/neon.h +205 -0
- package/include/numkong/curved/neonbfdot.h +212 -0
- package/include/numkong/curved/neonhalf.h +212 -0
- package/include/numkong/curved/rvv.h +305 -0
- package/include/numkong/curved/serial.h +207 -0
- package/include/numkong/curved/skylake.h +457 -0
- package/include/numkong/curved/smef64.h +506 -0
- package/include/numkong/curved.h +517 -0
- package/include/numkong/curved.hpp +144 -0
- package/include/numkong/dot/README.md +425 -0
- package/include/numkong/dot/alder.h +563 -0
- package/include/numkong/dot/genoa.h +315 -0
- package/include/numkong/dot/haswell.h +1688 -0
- package/include/numkong/dot/icelake.h +883 -0
- package/include/numkong/dot/neon.h +818 -0
- package/include/numkong/dot/neonbfdot.h +244 -0
- package/include/numkong/dot/neonfhm.h +360 -0
- package/include/numkong/dot/neonhalf.h +198 -0
- package/include/numkong/dot/neonsdot.h +508 -0
- package/include/numkong/dot/rvv.h +714 -0
- package/include/numkong/dot/rvvbb.h +72 -0
- package/include/numkong/dot/rvvbf16.h +123 -0
- package/include/numkong/dot/rvvhalf.h +129 -0
- package/include/numkong/dot/sapphire.h +141 -0
- package/include/numkong/dot/serial.h +838 -0
- package/include/numkong/dot/sierra.h +405 -0
- package/include/numkong/dot/skylake.h +1084 -0
- package/include/numkong/dot/sve.h +379 -0
- package/include/numkong/dot/svebfdot.h +74 -0
- package/include/numkong/dot/svehalf.h +123 -0
- package/include/numkong/dot/v128relaxed.h +1258 -0
- package/include/numkong/dot.h +1070 -0
- package/include/numkong/dot.hpp +94 -0
- package/include/numkong/dots/README.md +496 -0
- package/include/numkong/dots/alder.h +114 -0
- package/include/numkong/dots/genoa.h +94 -0
- package/include/numkong/dots/haswell.h +295 -0
- package/include/numkong/dots/icelake.h +171 -0
- package/include/numkong/dots/neon.h +120 -0
- package/include/numkong/dots/neonbfdot.h +58 -0
- package/include/numkong/dots/neonfhm.h +94 -0
- package/include/numkong/dots/neonhalf.h +57 -0
- package/include/numkong/dots/neonsdot.h +108 -0
- package/include/numkong/dots/rvv.h +2486 -0
- package/include/numkong/dots/sapphireamx.h +3973 -0
- package/include/numkong/dots/serial.h +2844 -0
- package/include/numkong/dots/sierra.h +97 -0
- package/include/numkong/dots/skylake.h +196 -0
- package/include/numkong/dots/sme.h +5372 -0
- package/include/numkong/dots/smebi32.h +461 -0
- package/include/numkong/dots/smef64.h +1318 -0
- package/include/numkong/dots/smehalf.h +47 -0
- package/include/numkong/dots/v128relaxed.h +294 -0
- package/include/numkong/dots.h +2804 -0
- package/include/numkong/dots.hpp +639 -0
- package/include/numkong/each/README.md +469 -0
- package/include/numkong/each/haswell.h +1658 -0
- package/include/numkong/each/icelake.h +272 -0
- package/include/numkong/each/neon.h +1104 -0
- package/include/numkong/each/neonbfdot.h +212 -0
- package/include/numkong/each/neonhalf.h +410 -0
- package/include/numkong/each/rvv.h +1121 -0
- package/include/numkong/each/sapphire.h +477 -0
- package/include/numkong/each/serial.h +260 -0
- package/include/numkong/each/skylake.h +1562 -0
- package/include/numkong/each.h +2146 -0
- package/include/numkong/each.hpp +434 -0
- package/include/numkong/geospatial/README.md +147 -0
- package/include/numkong/geospatial/haswell.h +593 -0
- package/include/numkong/geospatial/neon.h +571 -0
- package/include/numkong/geospatial/rvv.h +701 -0
- package/include/numkong/geospatial/serial.h +309 -0
- package/include/numkong/geospatial/skylake.h +577 -0
- package/include/numkong/geospatial/v128relaxed.h +613 -0
- package/include/numkong/geospatial.h +453 -0
- package/include/numkong/geospatial.hpp +235 -0
- package/include/numkong/matrix.hpp +336 -0
- package/include/numkong/maxsim/README.md +187 -0
- package/include/numkong/maxsim/alder.h +511 -0
- package/include/numkong/maxsim/genoa.h +115 -0
- package/include/numkong/maxsim/haswell.h +553 -0
- package/include/numkong/maxsim/icelake.h +480 -0
- package/include/numkong/maxsim/neonsdot.h +394 -0
- package/include/numkong/maxsim/sapphireamx.h +877 -0
- package/include/numkong/maxsim/serial.h +490 -0
- package/include/numkong/maxsim/sme.h +929 -0
- package/include/numkong/maxsim/v128relaxed.h +280 -0
- package/include/numkong/maxsim.h +571 -0
- package/include/numkong/maxsim.hpp +133 -0
- package/include/numkong/mesh/README.md +227 -0
- package/include/numkong/mesh/haswell.h +2235 -0
- package/include/numkong/mesh/neon.h +1329 -0
- package/include/numkong/mesh/neonbfdot.h +842 -0
- package/include/numkong/mesh/neonhalf.h +616 -0
- package/include/numkong/mesh/rvv.h +916 -0
- package/include/numkong/mesh/serial.h +742 -0
- package/include/numkong/mesh/skylake.h +1135 -0
- package/include/numkong/mesh/v128relaxed.h +1052 -0
- package/include/numkong/mesh.h +652 -0
- package/include/numkong/mesh.hpp +762 -0
- package/include/numkong/numkong.h +78 -0
- package/include/numkong/numkong.hpp +57 -0
- package/include/numkong/probability/README.md +173 -0
- package/include/numkong/probability/haswell.h +267 -0
- package/include/numkong/probability/neon.h +225 -0
- package/include/numkong/probability/rvv.h +409 -0
- package/include/numkong/probability/serial.h +169 -0
- package/include/numkong/probability/skylake.h +324 -0
- package/include/numkong/probability.h +383 -0
- package/include/numkong/probability.hpp +120 -0
- package/include/numkong/random.h +50 -0
- package/include/numkong/random.hpp +285 -0
- package/include/numkong/reduce/README.md +547 -0
- package/include/numkong/reduce/alder.h +632 -0
- package/include/numkong/reduce/genoa.h +201 -0
- package/include/numkong/reduce/haswell.h +3783 -0
- package/include/numkong/reduce/icelake.h +549 -0
- package/include/numkong/reduce/neon.h +3841 -0
- package/include/numkong/reduce/neonbfdot.h +353 -0
- package/include/numkong/reduce/neonfhm.h +665 -0
- package/include/numkong/reduce/neonhalf.h +157 -0
- package/include/numkong/reduce/neonsdot.h +357 -0
- package/include/numkong/reduce/rvv.h +3407 -0
- package/include/numkong/reduce/serial.h +757 -0
- package/include/numkong/reduce/sierra.h +338 -0
- package/include/numkong/reduce/skylake.h +3792 -0
- package/include/numkong/reduce/v128relaxed.h +2302 -0
- package/include/numkong/reduce.h +1597 -0
- package/include/numkong/reduce.hpp +633 -0
- package/include/numkong/scalar/README.md +89 -0
- package/include/numkong/scalar/haswell.h +113 -0
- package/include/numkong/scalar/neon.h +122 -0
- package/include/numkong/scalar/neonhalf.h +70 -0
- package/include/numkong/scalar/rvv.h +211 -0
- package/include/numkong/scalar/sapphire.h +63 -0
- package/include/numkong/scalar/serial.h +332 -0
- package/include/numkong/scalar/v128relaxed.h +56 -0
- package/include/numkong/scalar.h +683 -0
- package/include/numkong/set/README.md +179 -0
- package/include/numkong/set/haswell.h +334 -0
- package/include/numkong/set/icelake.h +485 -0
- package/include/numkong/set/neon.h +364 -0
- package/include/numkong/set/rvv.h +226 -0
- package/include/numkong/set/rvvbb.h +117 -0
- package/include/numkong/set/serial.h +174 -0
- package/include/numkong/set/sve.h +185 -0
- package/include/numkong/set/v128relaxed.h +240 -0
- package/include/numkong/set.h +457 -0
- package/include/numkong/set.hpp +114 -0
- package/include/numkong/sets/README.md +149 -0
- package/include/numkong/sets/haswell.h +63 -0
- package/include/numkong/sets/icelake.h +66 -0
- package/include/numkong/sets/neon.h +61 -0
- package/include/numkong/sets/serial.h +43 -0
- package/include/numkong/sets/smebi32.h +1099 -0
- package/include/numkong/sets/v128relaxed.h +58 -0
- package/include/numkong/sets.h +339 -0
- package/include/numkong/sparse/README.md +156 -0
- package/include/numkong/sparse/icelake.h +463 -0
- package/include/numkong/sparse/neon.h +288 -0
- package/include/numkong/sparse/serial.h +117 -0
- package/include/numkong/sparse/sve2.h +507 -0
- package/include/numkong/sparse/turin.h +322 -0
- package/include/numkong/sparse.h +363 -0
- package/include/numkong/sparse.hpp +113 -0
- package/include/numkong/spatial/README.md +435 -0
- package/include/numkong/spatial/alder.h +607 -0
- package/include/numkong/spatial/genoa.h +290 -0
- package/include/numkong/spatial/haswell.h +960 -0
- package/include/numkong/spatial/icelake.h +586 -0
- package/include/numkong/spatial/neon.h +773 -0
- package/include/numkong/spatial/neonbfdot.h +165 -0
- package/include/numkong/spatial/neonhalf.h +118 -0
- package/include/numkong/spatial/neonsdot.h +261 -0
- package/include/numkong/spatial/rvv.h +984 -0
- package/include/numkong/spatial/rvvbf16.h +123 -0
- package/include/numkong/spatial/rvvhalf.h +117 -0
- package/include/numkong/spatial/sapphire.h +343 -0
- package/include/numkong/spatial/serial.h +346 -0
- package/include/numkong/spatial/sierra.h +323 -0
- package/include/numkong/spatial/skylake.h +606 -0
- package/include/numkong/spatial/sve.h +224 -0
- package/include/numkong/spatial/svebfdot.h +122 -0
- package/include/numkong/spatial/svehalf.h +109 -0
- package/include/numkong/spatial/v128relaxed.h +717 -0
- package/include/numkong/spatial.h +1425 -0
- package/include/numkong/spatial.hpp +183 -0
- package/include/numkong/spatials/README.md +580 -0
- package/include/numkong/spatials/alder.h +94 -0
- package/include/numkong/spatials/genoa.h +94 -0
- package/include/numkong/spatials/haswell.h +219 -0
- package/include/numkong/spatials/icelake.h +113 -0
- package/include/numkong/spatials/neon.h +109 -0
- package/include/numkong/spatials/neonbfdot.h +60 -0
- package/include/numkong/spatials/neonfhm.h +92 -0
- package/include/numkong/spatials/neonhalf.h +58 -0
- package/include/numkong/spatials/neonsdot.h +109 -0
- package/include/numkong/spatials/rvv.h +1960 -0
- package/include/numkong/spatials/sapphireamx.h +1149 -0
- package/include/numkong/spatials/serial.h +226 -0
- package/include/numkong/spatials/sierra.h +96 -0
- package/include/numkong/spatials/skylake.h +184 -0
- package/include/numkong/spatials/sme.h +1901 -0
- package/include/numkong/spatials/smef64.h +465 -0
- package/include/numkong/spatials/v128relaxed.h +240 -0
- package/include/numkong/spatials.h +3021 -0
- package/include/numkong/spatials.hpp +508 -0
- package/include/numkong/tensor.hpp +1592 -0
- package/include/numkong/trigonometry/README.md +184 -0
- package/include/numkong/trigonometry/haswell.h +652 -0
- package/include/numkong/trigonometry/neon.h +639 -0
- package/include/numkong/trigonometry/rvv.h +699 -0
- package/include/numkong/trigonometry/serial.h +703 -0
- package/include/numkong/trigonometry/skylake.h +721 -0
- package/include/numkong/trigonometry/v128relaxed.h +666 -0
- package/include/numkong/trigonometry.h +467 -0
- package/include/numkong/trigonometry.hpp +166 -0
- package/include/numkong/types.h +1384 -0
- package/include/numkong/types.hpp +5603 -0
- package/include/numkong/vector.hpp +698 -0
- package/javascript/README.md +246 -0
- package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
- package/javascript/dist/cjs/numkong-wasm.js +617 -0
- package/javascript/dist/cjs/numkong.d.ts +343 -0
- package/javascript/dist/cjs/numkong.js +523 -0
- package/javascript/dist/cjs/package.json +3 -0
- package/javascript/dist/cjs/types.d.ts +284 -0
- package/javascript/dist/cjs/types.js +653 -0
- package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
- package/javascript/dist/esm/numkong-wasm.js +595 -0
- package/javascript/dist/esm/numkong.d.ts +343 -0
- package/javascript/dist/esm/numkong.js +452 -0
- package/javascript/dist/esm/package.json +3 -0
- package/javascript/dist/esm/types.d.ts +284 -0
- package/javascript/dist/esm/types.js +630 -0
- package/javascript/dist-package-cjs.json +3 -0
- package/javascript/dist-package-esm.json +3 -0
- package/javascript/node-gyp-build.d.ts +1 -0
- package/javascript/numkong-wasm.ts +756 -0
- package/javascript/numkong.c +689 -0
- package/javascript/numkong.ts +575 -0
- package/javascript/tsconfig-base.json +39 -0
- package/javascript/tsconfig-cjs.json +8 -0
- package/javascript/tsconfig-esm.json +8 -0
- package/javascript/types.ts +674 -0
- package/package.json +87 -0
|
@@ -0,0 +1,453 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief SIMD-accelerated Geospatial Distances.
|
|
3
|
+
* @file include/numkong/geospatial.h
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date July 1, 2023
|
|
6
|
+
*
|
|
7
|
+
* Contains the following distance functions:
|
|
8
|
+
*
|
|
9
|
+
* - Haversine (Great Circle) distance for 2 points
|
|
10
|
+
* - Haversine (Great Circle) distance for 2 arrays of points
|
|
11
|
+
* - Vincenty's distance function for Oblate Spheroid Geodesics
|
|
12
|
+
*
|
|
13
|
+
* All outputs are in meters, and the input coordinates are in radians.
|
|
14
|
+
*
|
|
15
|
+
* For dtypes:
|
|
16
|
+
*
|
|
17
|
+
* - 64-bit IEEE-754 floating point → 64-bit
|
|
18
|
+
* - 32-bit IEEE-754 floating point → 32-bit
|
|
19
|
+
*
|
|
20
|
+
* Precision policy:
|
|
21
|
+
*
|
|
22
|
+
* - `f32` remains the throughput-oriented lane and intentionally stays narrow end-to-end.
|
|
23
|
+
* - `f64` is the higher-accuracy lane for the same formulas.
|
|
24
|
+
* - We do not widen `f32` outputs here because the dominant error comes from the geodesic model
|
|
25
|
+
* and transcendental approximations, not from long horizontal reductions.
|
|
26
|
+
*
|
|
27
|
+
* For hardware architectures:
|
|
28
|
+
*
|
|
29
|
+
* - Arm: NEON
|
|
30
|
+
* - x86: Haswell, Skylake
* - WebAssembly: Relaxed SIMD (V128Relaxed)
* - RISC-V: RVV
|
|
31
|
+
*
|
|
32
|
+
* @section haversine_similarity Low-Accuracy High-Performance Haversine Similarity
|
|
33
|
+
*
|
|
34
|
+
* In most cases, for distance computations, we don't need the exact Haversine formula.
|
|
35
|
+
* The very last part of the computation applies the non-linear `asin(√x)` transformation.
|
|
36
|
+
* Both `asin` and `sqrt` are monotonically increasing functions, so their composition is also
|
|
37
|
+
* monotonically increasing. This means, for relative similarity/closeness computation we
|
|
38
|
+
* can avoid that expensive last step.
|
|
39
|
+
*
|
|
40
|
+
* @section trig_approximations Trigonometric Approximations & SIMD Vectorization
|
|
41
|
+
*
|
|
42
|
+
* The trigonometric functions (sin, cos, atan2) use polynomial approximations with SLEEF-level
|
|
43
|
+
* error bounds (~3.5 ULP). For f64, this translates to ~1e-15 absolute error; for f32, ~1e-7.
|
|
44
|
+
*
|
|
45
|
+
* @section accuracy_comparison Accuracy Comparison: Haversine vs Vincenty
|
|
46
|
+
*
|
|
47
|
+
* Both algorithms compute geodesic distances, but with different Earth models:
|
|
48
|
+
*
|
|
49
|
+
* - Haversine: Sphere (R=6335km), 0.3% - 0.6% vs WGS-84, fast approximation, ranking
|
|
50
|
+
* - Vincenty: WGS-84 Ellipsoid, 0.01% - 0.2% vs WGS-84, high-precision navigation
|
|
51
|
+
*
|
|
52
|
+
* Vincenty is ~3-20x more accurate than Haversine for most routes. The improvement is most
|
|
53
|
+
* significant for long-distance routes and near-polar paths where Earth's oblateness matters.
|
|
54
|
+
*
|
|
55
|
+
* @note SIMD implementations may have slightly different results than serial due to
|
|
56
|
+
* floating-point ordering in iterative algorithms. For Vincenty, expect <0.001%
|
|
57
|
+
* difference between SIMD and serial implementations.
|
|
58
|
+
*
|
|
59
|
+
* @section vincenty_precision High-Precision Vincenty's Formulae & Earth Ellipsoid
|
|
60
|
+
*
|
|
61
|
+
* Several approximations of the Earth Ellipsoid exist, each defined by the Equatorial radius (m),
|
|
62
|
+
* Polar radius (m), and Inverse flattening. The earliest ones date back to 1738, when Pierre Louis
|
|
63
|
+
* Maupertuis in France suggested a shape that is only 0.3% different from the most accurate modern
|
|
64
|
+
* estimates by the International Earth Rotation and Reference Systems Service (IERS).
|
|
65
|
+
* The Global Positioning System (GPS) uses the World Geodetic System's (WGS) WGS-84 standard.
|
|
66
|
+
* NumKong uses the newer & more accurate @b IERS-2003 standard, but allows overriding default parameters:
|
|
67
|
+
*
|
|
68
|
+
* #define NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS (6378136.6)
|
|
69
|
+
* #define NK_EARTH_ELLIPSOID_POLAR_RADIUS (6356751.9)
|
|
70
|
+
* #define NK_EARTH_ELLIPSOID_INVERSE_FLATTENING (298.25642)
|
|
71
|
+
*
|
|
72
|
+
* To revert from oblate spheroids to spheres, use `NK_EARTH_MEDIATORIAL_RADIUS`.
|
|
73
|
+
*
|
|
74
|
+
* @section x86_instructions Relevant x86 Instructions
|
|
75
|
+
*
|
|
76
|
+
* Haversine and Vincenty formulas require sqrt for the final distance calculation and division
|
|
77
|
+
* for Vincenty's iterative convergence. These are the most expensive operations (12-23 cycles)
|
|
78
|
+
* but only execute once per point-pair. The polynomial trig approximations use FMA chains.
|
|
79
|
+
* Note: ZMM sqrt is faster on Genoa (15c) than Ice Lake (19c) due to better 512-bit support.
|
|
80
|
+
*
|
|
81
|
+
* Intrinsic Instruction Ice Genoa
|
|
82
|
+
* _mm256_sqrt_ps VSQRTPS (YMM, YMM) 12c @ p0 15c @ p01
|
|
83
|
+
* _mm256_sqrt_pd VSQRTPD (YMM, YMM) 13c @ p0 21c @ p01
|
|
84
|
+
* _mm512_sqrt_ps VSQRTPS (ZMM, ZMM) 19c @ p05 15c @ p01
|
|
85
|
+
* _mm512_sqrt_pd VSQRTPD (ZMM, ZMM) 23c @ p05 21c @ p01
|
|
86
|
+
* _mm256_div_ps VDIVPS (YMM, YMM, YMM) 11c @ p0 11c @ p01
|
|
87
|
+
* _mm256_div_pd VDIVPD (YMM, YMM, YMM) 13c @ p0 13c @ p01
|
|
88
|
+
* _mm256_fmadd_ps VFMADD231PS (YMM, YMM, YMM) 4c @ p01 4c @ p01
|
|
89
|
+
* _mm256_fmadd_pd VFMADD231PD (YMM, YMM, YMM) 4c @ p01 4c @ p01
|
|
90
|
+
*
|
|
91
|
+
* @section arm_instructions Relevant ARM NEON/SVE Instructions
|
|
92
|
+
*
|
|
93
|
+
* ARM sqrt (FSQRT) has low throughput as it uses a dedicated V02 execution unit. This is
|
|
94
|
+
* acceptable since sqrt only appears once per distance calculation. FMA chains for trig
|
|
95
|
+
* polynomial evaluation pipeline well across all 4 V-units.
|
|
96
|
+
*
|
|
97
|
+
* Intrinsic Instruction M1 Firestorm Graviton 3 Graviton 4
|
|
98
|
+
* vfmaq_f32 FMLA.S (vec) 4c @ V0123 4c @ V0123 4c @ V0123
|
|
99
|
+
* vfmaq_f64 FMLA.D (vec) 4c @ V0123 4c @ V0123 4c @ V0123
|
|
100
|
+
* vsqrtq_f32 FSQRT.S (vec) 10c @ V02 10c @ V02 9c @ V02
|
|
101
|
+
* vsqrtq_f64 FSQRT.D (vec) 13c @ V02 16c @ V02 16c @ V02
|
|
102
|
+
*
|
|
103
|
+
* @section references References
|
|
104
|
+
*
|
|
105
|
+
* - x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/
|
|
106
|
+
* - Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics/
|
|
107
|
+
* - Earth Ellipsoid: https://en.wikipedia.org/wiki/Earth_ellipsoid
|
|
108
|
+
* - Oblate Spheroid Geodesic: https://mathworld.wolfram.com/OblateSpheroidGeodesic.html
|
|
109
|
+
* - Staging experiments: https://github.com/ashvardanian/HaversineMathKong
|
|
110
|
+
* - Speeding up atan2f by 50x: https://mazzo.li/posts/vectorized-atan2.html
|
|
111
|
+
* - Simplifying the GNU C Sine Function: https://www.awelm.com/posts/simplifying-the-gnu-c-sine-function/
|
|
112
|
+
*
|
|
113
|
+
*/
|
|
114
|
+
#ifndef NK_GEOSPATIAL_H
|
|
115
|
+
#define NK_GEOSPATIAL_H
|
|
116
|
+
|
|
117
|
+
#include "numkong/types.h"
|
|
118
|
+
#include "numkong/trigonometry.h"
|
|
119
|
+
|
|
120
|
+
/* Earth Ellipsoid Constants
|
|
121
|
+
* The default values use the IERS-2003 standard, but can be overridden before including this header.
|
|
122
|
+
*/
|
|
123
|
+
#ifndef NK_EARTH_MEDIATORIAL_RADIUS
|
|
124
|
+
#define NK_EARTH_MEDIATORIAL_RADIUS (6335439.0)
|
|
125
|
+
#endif
|
|
126
|
+
#ifndef NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS
|
|
127
|
+
#define NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS (6378136.6)
|
|
128
|
+
#endif
|
|
129
|
+
#ifndef NK_EARTH_ELLIPSOID_POLAR_RADIUS
|
|
130
|
+
#define NK_EARTH_ELLIPSOID_POLAR_RADIUS (6356751.9)
|
|
131
|
+
#endif
|
|
132
|
+
#ifndef NK_EARTH_ELLIPSOID_INVERSE_FLATTENING
|
|
133
|
+
#define NK_EARTH_ELLIPSOID_INVERSE_FLATTENING (298.25642)
|
|
134
|
+
#endif
|
|
135
|
+
#ifndef NK_VINCENTY_MAX_ITERATIONS
|
|
136
|
+
#define NK_VINCENTY_MAX_ITERATIONS 100
|
|
137
|
+
#endif
|
|
138
|
+
#ifndef NK_VINCENTY_CONVERGENCE_THRESHOLD_F64
|
|
139
|
+
#define NK_VINCENTY_CONVERGENCE_THRESHOLD_F64 1e-12
|
|
140
|
+
#endif
|
|
141
|
+
#ifndef NK_VINCENTY_CONVERGENCE_THRESHOLD_F32
|
|
142
|
+
#define NK_VINCENTY_CONVERGENCE_THRESHOLD_F32 1e-7f
|
|
143
|
+
#endif
|
|
144
|
+
|
|
145
|
+
#if defined(__cplusplus)
|
|
146
|
+
extern "C" {
|
|
147
|
+
#endif
|
|
148
|
+
|
|
149
|
+
/**
|
|
150
|
+
* @brief Haversine distance between two arrays of points on a sphere.
|
|
151
|
+
*
|
|
152
|
+
* @param[in] a_lats Latitudes of the first points, in radians.
|
|
153
|
+
* @param[in] a_lons Longitudes of the first points, in radians.
|
|
154
|
+
* @param[in] b_lats Latitudes of the second points, in radians.
|
|
155
|
+
* @param[in] b_lons Longitudes of the second points, in radians.
|
|
156
|
+
* @param[in] n The number of point pairs.
|
|
157
|
+
* @param[out] results Output distances in meters, length `n`.
|
|
158
|
+
*
|
|
159
|
+
* @note Inputs are in radians and outputs are in meters.
|
|
160
|
+
*/
|
|
161
|
+
NK_DYNAMIC void nk_haversine_f64( //
|
|
162
|
+
nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
|
|
163
|
+
nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
|
|
164
|
+
nk_size_t n, nk_f64_t *results);
|
|
165
|
+
|
|
166
|
+
/** @copydoc nk_haversine_f64 */
|
|
167
|
+
NK_DYNAMIC void nk_haversine_f32( //
|
|
168
|
+
nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
|
|
169
|
+
nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
|
|
170
|
+
nk_size_t n, nk_f32_t *results);
|
|
171
|
+
|
|
172
|
+
/**
|
|
173
|
+
* @brief Vincenty distance between two arrays of points on an oblate spheroid.
|
|
174
|
+
*
|
|
175
|
+
* @param[in] a_lats Latitudes of the first points, in radians.
|
|
176
|
+
* @param[in] a_lons Longitudes of the first points, in radians.
|
|
177
|
+
* @param[in] b_lats Latitudes of the second points, in radians.
|
|
178
|
+
* @param[in] b_lons Longitudes of the second points, in radians.
|
|
179
|
+
* @param[in] n The number of point pairs.
|
|
180
|
+
* @param[out] results Output distances in meters, length `n`.
|
|
181
|
+
*
|
|
182
|
+
* @note Inputs are in radians and outputs are in meters.
|
|
183
|
+
* @note Uses the Earth ellipsoid parameters configured via `NK_EARTH_ELLIPSOID_*`.
|
|
184
|
+
*/
|
|
185
|
+
NK_DYNAMIC void nk_vincenty_f64( //
|
|
186
|
+
nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
|
|
187
|
+
nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
|
|
188
|
+
nk_size_t n, nk_f64_t *results);
|
|
189
|
+
|
|
190
|
+
/** @copydoc nk_vincenty_f64 */
|
|
191
|
+
NK_DYNAMIC void nk_vincenty_f32( //
|
|
192
|
+
nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
|
|
193
|
+
nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
|
|
194
|
+
nk_size_t n, nk_f32_t *results);
|
|
195
|
+
|
|
196
|
+
/** @copydoc nk_haversine_f64 */
|
|
197
|
+
NK_PUBLIC void nk_haversine_f64_serial( //
|
|
198
|
+
nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
|
|
199
|
+
nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
|
|
200
|
+
nk_size_t n, nk_f64_t *results);
|
|
201
|
+
/** @copydoc nk_vincenty_f64 */
|
|
202
|
+
NK_PUBLIC void nk_vincenty_f64_serial( //
|
|
203
|
+
nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
|
|
204
|
+
nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
|
|
205
|
+
nk_size_t n, nk_f64_t *results);
|
|
206
|
+
/** @copydoc nk_haversine_f32 */
|
|
207
|
+
NK_PUBLIC void nk_haversine_f32_serial( //
|
|
208
|
+
nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
|
|
209
|
+
nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
|
|
210
|
+
nk_size_t n, nk_f32_t *results);
|
|
211
|
+
/** @copydoc nk_vincenty_f32 */
|
|
212
|
+
NK_PUBLIC void nk_vincenty_f32_serial( //
|
|
213
|
+
nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
|
|
214
|
+
nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
|
|
215
|
+
nk_size_t n, nk_f32_t *results);
|
|
216
|
+
|
|
217
|
+
#if NK_TARGET_NEON
|
|
218
|
+
/** @copydoc nk_haversine_f64 */
|
|
219
|
+
NK_PUBLIC void nk_haversine_f64_neon( //
|
|
220
|
+
nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
|
|
221
|
+
nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
|
|
222
|
+
nk_size_t n, nk_f64_t *results);
|
|
223
|
+
/** @copydoc nk_vincenty_f64 */
|
|
224
|
+
NK_PUBLIC void nk_vincenty_f64_neon( //
|
|
225
|
+
nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
|
|
226
|
+
nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
|
|
227
|
+
nk_size_t n, nk_f64_t *results);
|
|
228
|
+
/** @copydoc nk_haversine_f32 */
|
|
229
|
+
NK_PUBLIC void nk_haversine_f32_neon( //
|
|
230
|
+
nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
|
|
231
|
+
nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
|
|
232
|
+
nk_size_t n, nk_f32_t *results);
|
|
233
|
+
/** @copydoc nk_vincenty_f32 */
|
|
234
|
+
NK_PUBLIC void nk_vincenty_f32_neon( //
|
|
235
|
+
nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
|
|
236
|
+
nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
|
|
237
|
+
nk_size_t n, nk_f32_t *results);
|
|
238
|
+
#endif // NK_TARGET_NEON
|
|
239
|
+
|
|
240
|
+
#if NK_TARGET_HASWELL
|
|
241
|
+
/** @copydoc nk_haversine_f64 */
|
|
242
|
+
NK_PUBLIC void nk_haversine_f64_haswell( //
|
|
243
|
+
nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
|
|
244
|
+
nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
|
|
245
|
+
nk_size_t n, nk_f64_t *results);
|
|
246
|
+
/** @copydoc nk_vincenty_f64 */
|
|
247
|
+
NK_PUBLIC void nk_vincenty_f64_haswell( //
|
|
248
|
+
nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
|
|
249
|
+
nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
|
|
250
|
+
nk_size_t n, nk_f64_t *results);
|
|
251
|
+
/** @copydoc nk_haversine_f32 */
|
|
252
|
+
NK_PUBLIC void nk_haversine_f32_haswell( //
|
|
253
|
+
nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
|
|
254
|
+
nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
|
|
255
|
+
nk_size_t n, nk_f32_t *results);
|
|
256
|
+
/** @copydoc nk_vincenty_f32 */
|
|
257
|
+
NK_PUBLIC void nk_vincenty_f32_haswell( //
|
|
258
|
+
nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
|
|
259
|
+
nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
|
|
260
|
+
nk_size_t n, nk_f32_t *results);
|
|
261
|
+
#endif // NK_TARGET_HASWELL
|
|
262
|
+
|
|
263
|
+
#if NK_TARGET_SKYLAKE
|
|
264
|
+
/** @copydoc nk_haversine_f64 */
|
|
265
|
+
NK_PUBLIC void nk_haversine_f64_skylake( //
|
|
266
|
+
nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
|
|
267
|
+
nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
|
|
268
|
+
nk_size_t n, nk_f64_t *results);
|
|
269
|
+
/** @copydoc nk_vincenty_f64 */
|
|
270
|
+
NK_PUBLIC void nk_vincenty_f64_skylake( //
|
|
271
|
+
nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
|
|
272
|
+
nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
|
|
273
|
+
nk_size_t n, nk_f64_t *results);
|
|
274
|
+
/** @copydoc nk_haversine_f32 */
|
|
275
|
+
NK_PUBLIC void nk_haversine_f32_skylake( //
|
|
276
|
+
nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
|
|
277
|
+
nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
|
|
278
|
+
nk_size_t n, nk_f32_t *results);
|
|
279
|
+
/** @copydoc nk_vincenty_f32 */
|
|
280
|
+
NK_PUBLIC void nk_vincenty_f32_skylake( //
|
|
281
|
+
nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
|
|
282
|
+
nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
|
|
283
|
+
nk_size_t n, nk_f32_t *results);
|
|
284
|
+
#endif // NK_TARGET_SKYLAKE
|
|
285
|
+
|
|
286
|
+
#if NK_TARGET_V128RELAXED
|
|
287
|
+
/** @copydoc nk_haversine_f64 */
|
|
288
|
+
NK_PUBLIC void nk_haversine_f64_v128relaxed( //
|
|
289
|
+
nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
|
|
290
|
+
nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
|
|
291
|
+
nk_size_t n, nk_f64_t *results);
|
|
292
|
+
/** @copydoc nk_vincenty_f64 */
|
|
293
|
+
NK_PUBLIC void nk_vincenty_f64_v128relaxed( //
|
|
294
|
+
nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
|
|
295
|
+
nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
|
|
296
|
+
nk_size_t n, nk_f64_t *results);
|
|
297
|
+
/** @copydoc nk_haversine_f32 */
|
|
298
|
+
NK_PUBLIC void nk_haversine_f32_v128relaxed( //
|
|
299
|
+
nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
|
|
300
|
+
nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
|
|
301
|
+
nk_size_t n, nk_f32_t *results);
|
|
302
|
+
/** @copydoc nk_vincenty_f32 */
|
|
303
|
+
NK_PUBLIC void nk_vincenty_f32_v128relaxed( //
|
|
304
|
+
nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
|
|
305
|
+
nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
|
|
306
|
+
nk_size_t n, nk_f32_t *results);
|
|
307
|
+
#endif // NK_TARGET_V128RELAXED
|
|
308
|
+
|
|
309
|
+
#if NK_TARGET_RVV
/* Declarations below are only visible when `NK_TARGET_RVV` evaluates to non-zero. */

/** @copydoc nk_haversine_f64 */
NK_PUBLIC void nk_haversine_f64_rvv(nk_f64_t const *a_lats, nk_f64_t const *a_lons, nk_f64_t const *b_lats,
                                    nk_f64_t const *b_lons, nk_size_t n, nk_f64_t *results);

/** @copydoc nk_vincenty_f64 */
NK_PUBLIC void nk_vincenty_f64_rvv(nk_f64_t const *a_lats, nk_f64_t const *a_lons, nk_f64_t const *b_lats,
                                   nk_f64_t const *b_lons, nk_size_t n, nk_f64_t *results);

/** @copydoc nk_haversine_f32 */
NK_PUBLIC void nk_haversine_f32_rvv(nk_f32_t const *a_lats, nk_f32_t const *a_lons, nk_f32_t const *b_lats,
                                    nk_f32_t const *b_lons, nk_size_t n, nk_f32_t *results);

/** @copydoc nk_vincenty_f32 */
NK_PUBLIC void nk_vincenty_f32_rvv(nk_f32_t const *a_lats, nk_f32_t const *a_lons, nk_f32_t const *b_lats,
                                   nk_f32_t const *b_lons, nk_size_t n, nk_f32_t *results);

#endif // NK_TARGET_RVV
|
|
331
|
+
|
|
332
|
+
/**
|
|
333
|
+
* @brief Returns the output dtype for Haversine distance.
|
|
334
|
+
*/
|
|
335
|
+
NK_INTERNAL nk_dtype_t nk_haversine_output_dtype(nk_dtype_t dtype) {
|
|
336
|
+
switch (dtype) {
|
|
337
|
+
case nk_f64_k: return nk_f64_k;
|
|
338
|
+
case nk_f32_k: return nk_f32_k;
|
|
339
|
+
default: return nk_dtype_unknown_k;
|
|
340
|
+
}
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
/**
|
|
344
|
+
* @brief Returns the output dtype for Vincenty distance.
|
|
345
|
+
*/
|
|
346
|
+
NK_INTERNAL nk_dtype_t nk_vincenty_output_dtype(nk_dtype_t dtype) {
|
|
347
|
+
switch (dtype) {
|
|
348
|
+
case nk_f64_k: return nk_f64_k;
|
|
349
|
+
case nk_f32_k: return nk_f32_k;
|
|
350
|
+
default: return nk_dtype_unknown_k;
|
|
351
|
+
}
|
|
352
|
+
}
|
|
353
|
+
|
|
354
|
+
#if defined(__cplusplus)
|
|
355
|
+
} // extern "C"
|
|
356
|
+
#endif
|
|
357
|
+
|
|
358
|
+
#include "numkong/geospatial/serial.h"
|
|
359
|
+
#include "numkong/geospatial/neon.h"
|
|
360
|
+
#include "numkong/geospatial/haswell.h"
|
|
361
|
+
#include "numkong/geospatial/skylake.h"
|
|
362
|
+
#include "numkong/geospatial/v128relaxed.h"
|
|
363
|
+
#include "numkong/geospatial/rvv.h"
|
|
364
|
+
|
|
365
|
+
#if defined(__cplusplus)
|
|
366
|
+
extern "C" {
|
|
367
|
+
#endif
|
|
368
|
+
|
|
369
|
+
#if !NK_DYNAMIC_DISPATCH
|
|
370
|
+
|
|
371
|
+
NK_PUBLIC void nk_haversine_f64( //
|
|
372
|
+
nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
|
|
373
|
+
nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
|
|
374
|
+
nk_size_t n, nk_f64_t *results) {
|
|
375
|
+
#if NK_TARGET_SKYLAKE
|
|
376
|
+
nk_haversine_f64_skylake(a_lats, a_lons, b_lats, b_lons, n, results);
|
|
377
|
+
#elif NK_TARGET_HASWELL
|
|
378
|
+
nk_haversine_f64_haswell(a_lats, a_lons, b_lats, b_lons, n, results);
|
|
379
|
+
#elif NK_TARGET_NEON
|
|
380
|
+
nk_haversine_f64_neon(a_lats, a_lons, b_lats, b_lons, n, results);
|
|
381
|
+
#elif NK_TARGET_V128RELAXED
|
|
382
|
+
nk_haversine_f64_v128relaxed(a_lats, a_lons, b_lats, b_lons, n, results);
|
|
383
|
+
#elif NK_TARGET_RVV
|
|
384
|
+
nk_haversine_f64_rvv(a_lats, a_lons, b_lats, b_lons, n, results);
|
|
385
|
+
#else
|
|
386
|
+
nk_haversine_f64_serial(a_lats, a_lons, b_lats, b_lons, n, results);
|
|
387
|
+
#endif
|
|
388
|
+
}
|
|
389
|
+
|
|
390
|
+
NK_PUBLIC void nk_haversine_f32( //
|
|
391
|
+
nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
|
|
392
|
+
nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
|
|
393
|
+
nk_size_t n, nk_f32_t *results) {
|
|
394
|
+
#if NK_TARGET_SKYLAKE
|
|
395
|
+
nk_haversine_f32_skylake(a_lats, a_lons, b_lats, b_lons, n, results);
|
|
396
|
+
#elif NK_TARGET_HASWELL
|
|
397
|
+
nk_haversine_f32_haswell(a_lats, a_lons, b_lats, b_lons, n, results);
|
|
398
|
+
#elif NK_TARGET_NEON
|
|
399
|
+
nk_haversine_f32_neon(a_lats, a_lons, b_lats, b_lons, n, results);
|
|
400
|
+
#elif NK_TARGET_V128RELAXED
|
|
401
|
+
nk_haversine_f32_v128relaxed(a_lats, a_lons, b_lats, b_lons, n, results);
|
|
402
|
+
#elif NK_TARGET_RVV
|
|
403
|
+
nk_haversine_f32_rvv(a_lats, a_lons, b_lats, b_lons, n, results);
|
|
404
|
+
#else
|
|
405
|
+
nk_haversine_f32_serial(a_lats, a_lons, b_lats, b_lons, n, results);
|
|
406
|
+
#endif
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
NK_PUBLIC void nk_vincenty_f64( //
|
|
410
|
+
nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
|
|
411
|
+
nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
|
|
412
|
+
nk_size_t n, nk_f64_t *results) {
|
|
413
|
+
#if NK_TARGET_SKYLAKE
|
|
414
|
+
nk_vincenty_f64_skylake(a_lats, a_lons, b_lats, b_lons, n, results);
|
|
415
|
+
#elif NK_TARGET_HASWELL
|
|
416
|
+
nk_vincenty_f64_haswell(a_lats, a_lons, b_lats, b_lons, n, results);
|
|
417
|
+
#elif NK_TARGET_NEON
|
|
418
|
+
nk_vincenty_f64_neon(a_lats, a_lons, b_lats, b_lons, n, results);
|
|
419
|
+
#elif NK_TARGET_V128RELAXED
|
|
420
|
+
nk_vincenty_f64_v128relaxed(a_lats, a_lons, b_lats, b_lons, n, results);
|
|
421
|
+
#elif NK_TARGET_RVV
|
|
422
|
+
nk_vincenty_f64_rvv(a_lats, a_lons, b_lats, b_lons, n, results);
|
|
423
|
+
#else
|
|
424
|
+
nk_vincenty_f64_serial(a_lats, a_lons, b_lats, b_lons, n, results);
|
|
425
|
+
#endif
|
|
426
|
+
}
|
|
427
|
+
|
|
428
|
+
NK_PUBLIC void nk_vincenty_f32( //
|
|
429
|
+
nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
|
|
430
|
+
nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
|
|
431
|
+
nk_size_t n, nk_f32_t *results) {
|
|
432
|
+
#if NK_TARGET_SKYLAKE
|
|
433
|
+
nk_vincenty_f32_skylake(a_lats, a_lons, b_lats, b_lons, n, results);
|
|
434
|
+
#elif NK_TARGET_HASWELL
|
|
435
|
+
nk_vincenty_f32_haswell(a_lats, a_lons, b_lats, b_lons, n, results);
|
|
436
|
+
#elif NK_TARGET_NEON
|
|
437
|
+
nk_vincenty_f32_neon(a_lats, a_lons, b_lats, b_lons, n, results);
|
|
438
|
+
#elif NK_TARGET_V128RELAXED
|
|
439
|
+
nk_vincenty_f32_v128relaxed(a_lats, a_lons, b_lats, b_lons, n, results);
|
|
440
|
+
#elif NK_TARGET_RVV
|
|
441
|
+
nk_vincenty_f32_rvv(a_lats, a_lons, b_lats, b_lons, n, results);
|
|
442
|
+
#else
|
|
443
|
+
nk_vincenty_f32_serial(a_lats, a_lons, b_lats, b_lons, n, results);
|
|
444
|
+
#endif
|
|
445
|
+
}
|
|
446
|
+
|
|
447
|
+
#endif // !NK_DYNAMIC_DISPATCH
|
|
448
|
+
|
|
449
|
+
#if defined(__cplusplus)
|
|
450
|
+
} // extern "C"
|
|
451
|
+
#endif
|
|
452
|
+
|
|
453
|
+
#endif
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief Geospatial kernels: haversine, vincenty.
|
|
3
|
+
* @file include/numkong/geospatial.hpp
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date February 5, 2026
|
|
6
|
+
*/
|
|
7
|
+
#ifndef NK_GEOSPATIAL_HPP
|
|
8
|
+
#define NK_GEOSPATIAL_HPP
|
|
9
|
+
|
|
10
|
+
#include <cstdint> // `std::uint32_t`
|
|
11
|
+
#include <type_traits> // `std::is_same_v`
|
|
12
|
+
|
|
13
|
+
#include "numkong/geospatial.h"
|
|
14
|
+
|
|
15
|
+
#include "numkong/types.hpp"
|
|
16
|
+
|
|
17
|
+
namespace ashvardanian::numkong {
|
|
18
|
+
|
|
19
|
+
/**
|
|
20
|
+
* @brief Batched Haversine: 2R × arcsin(√(sin²(Δφ/2) + cos φ₁ × cos φ₂ × sin²(Δλ/2)))
|
|
21
|
+
* @param[in] a_lats,a_lons Arrays of latitudes/longitudes for first points (radians)
|
|
22
|
+
* @param[in] b_lats,b_lons Arrays of latitudes/longitudes for second points (radians)
|
|
23
|
+
* @param[in] d Number of point pairs
|
|
24
|
+
* @param[out] results Output array of distances (meters)
|
|
25
|
+
*
|
|
26
|
+
* @tparam in_type_ Input coordinate type (f32_t, f64_t)
|
|
27
|
+
* @tparam precision_type_ Precision type for scalar fallback computations, defaults to `in_type_`
|
|
28
|
+
* @tparam allow_simd_ Enable SIMD kernel dispatch when `prefer_simd_k`
|
|
29
|
+
*
|
|
30
|
+
* @note Uses spherical Earth model with mediatorial radius (6335439.0 m)
|
|
31
|
+
* @note Accuracy: 0.3-0.6% vs WGS-84, suitable for ranking/similarity
|
|
32
|
+
*/
|
|
33
|
+
template <numeric_dtype in_type_, numeric_dtype precision_type_ = in_type_, allow_simd_t allow_simd_ = prefer_simd_k>
|
|
34
|
+
void haversine(in_type_ const *a_lats, in_type_ const *a_lons, in_type_ const *b_lats, in_type_ const *b_lons,
|
|
35
|
+
std::size_t d, in_type_ *results) noexcept {
|
|
36
|
+
constexpr bool simd = allow_simd_ == prefer_simd_k && std::is_same_v<in_type_, precision_type_>;
|
|
37
|
+
|
|
38
|
+
if constexpr (std::is_same_v<in_type_, f64_t> && simd)
|
|
39
|
+
nk_haversine_f64(&a_lats->raw_, &a_lons->raw_, &b_lats->raw_, &b_lons->raw_, d, &results->raw_);
|
|
40
|
+
else if constexpr (std::is_same_v<in_type_, f32_t> && simd)
|
|
41
|
+
nk_haversine_f32(&a_lats->raw_, &a_lons->raw_, &b_lats->raw_, &b_lons->raw_, d, &results->raw_);
|
|
42
|
+
// Scalar fallback
|
|
43
|
+
else {
|
|
44
|
+
precision_type_ const earth_radius = precision_type_(6335439.0); // mediatorial radius in meters
|
|
45
|
+
|
|
46
|
+
for (std::size_t i = 0; i < d; i++) {
|
|
47
|
+
precision_type_ first_latitude = precision_type_(a_lats[i]);
|
|
48
|
+
precision_type_ first_longitude = precision_type_(a_lons[i]);
|
|
49
|
+
precision_type_ second_latitude = precision_type_(b_lats[i]);
|
|
50
|
+
precision_type_ second_longitude = precision_type_(b_lons[i]);
|
|
51
|
+
|
|
52
|
+
precision_type_ latitude_delta = second_latitude - first_latitude;
|
|
53
|
+
precision_type_ longitude_delta = second_longitude - first_longitude;
|
|
54
|
+
|
|
55
|
+
// Haversine formula: a = sin²(Δlat/2) + cos(lat1)×cos(lat2)×sin²(Δlon/2)
|
|
56
|
+
precision_type_ sin_latitude_delta_half = (latitude_delta * precision_type_(0.5)).sin();
|
|
57
|
+
precision_type_ sin_longitude_delta_half = (longitude_delta * precision_type_(0.5)).sin();
|
|
58
|
+
precision_type_ cos_first_latitude = first_latitude.cos();
|
|
59
|
+
precision_type_ cos_second_latitude = second_latitude.cos();
|
|
60
|
+
|
|
61
|
+
precision_type_ haversine_term = sin_latitude_delta_half * sin_latitude_delta_half +
|
|
62
|
+
cos_first_latitude * cos_second_latitude * sin_longitude_delta_half *
|
|
63
|
+
sin_longitude_delta_half;
|
|
64
|
+
|
|
65
|
+
// Central angle: c = 2 * atan2(sqrt(a), sqrt(1-a))
|
|
66
|
+
precision_type_ sqrt_haversine = haversine_term.sqrt();
|
|
67
|
+
precision_type_ sqrt_complement = (precision_type_(1.0) - haversine_term).sqrt();
|
|
68
|
+
precision_type_ central_angle = precision_type_(2.0) * sqrt_haversine.atan2(sqrt_complement);
|
|
69
|
+
|
|
70
|
+
results[i] = in_type_(static_cast<double>(earth_radius * central_angle));
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
/**
|
|
76
|
+
* @brief Batched Vincenty distance (geodesic on WGS-84 ellipsoid)
|
|
77
|
+
* @param[in] a_lats,a_lons Arrays of latitudes/longitudes for first points (radians)
|
|
78
|
+
* @param[in] b_lats,b_lons Arrays of latitudes/longitudes for second points (radians)
|
|
79
|
+
* @param[in] d Number of point pairs
|
|
80
|
+
* @param[out] results Output array of distances (meters)
|
|
81
|
+
*
|
|
82
|
+
* @tparam in_type_ Input coordinate type (f32_t, f64_t)
|
|
83
|
+
* @tparam precision_type_ Precision type for scalar fallback computations, defaults to `in_type_`
|
|
84
|
+
* @tparam allow_simd_ Enable SIMD kernel dispatch when `prefer_simd_k`
|
|
85
|
+
*
|
|
86
|
+
* @note Uses WGS-84/IERS-2003 ellipsoid model
|
|
87
|
+
* @note Accuracy: 0.01-0.2% vs WGS-84, 3-20x more accurate than Haversine
|
|
88
|
+
* @note Iterative algorithm with max 100 iterations
|
|
89
|
+
*/
|
|
90
|
+
template <numeric_dtype in_type_, numeric_dtype precision_type_ = in_type_, allow_simd_t allow_simd_ = prefer_simd_k>
|
|
91
|
+
void vincenty(in_type_ const *a_lats, in_type_ const *a_lons, in_type_ const *b_lats, in_type_ const *b_lons,
|
|
92
|
+
std::size_t d, in_type_ *results) noexcept {
|
|
93
|
+
constexpr bool simd = allow_simd_ == prefer_simd_k && std::is_same_v<in_type_, precision_type_>;
|
|
94
|
+
|
|
95
|
+
if constexpr (std::is_same_v<in_type_, f64_t> && simd)
|
|
96
|
+
nk_vincenty_f64(&a_lats->raw_, &a_lons->raw_, &b_lats->raw_, &b_lons->raw_, d, &results->raw_);
|
|
97
|
+
else if constexpr (std::is_same_v<in_type_, f32_t> && simd)
|
|
98
|
+
nk_vincenty_f32(&a_lats->raw_, &a_lons->raw_, &b_lats->raw_, &b_lons->raw_, d, &results->raw_);
|
|
99
|
+
// Scalar fallback
|
|
100
|
+
else {
|
|
101
|
+
precision_type_ const equatorial_radius = precision_type_(6378136.6);
|
|
102
|
+
precision_type_ const polar_radius = precision_type_(6356751.9);
|
|
103
|
+
precision_type_ const flattening = precision_type_(1.0) / precision_type_(298.25642);
|
|
104
|
+
precision_type_ const convergence_threshold = precision_type_(1e-12);
|
|
105
|
+
constexpr int max_iterations = 100;
|
|
106
|
+
|
|
107
|
+
for (std::size_t i = 0; i < d; i++) {
|
|
108
|
+
precision_type_ first_latitude = precision_type_(a_lats[i]);
|
|
109
|
+
precision_type_ second_latitude = precision_type_(b_lats[i]);
|
|
110
|
+
precision_type_ longitude_difference = precision_type_(b_lons[i]) - precision_type_(a_lons[i]);
|
|
111
|
+
|
|
112
|
+
// Reduced latitudes on the auxiliary sphere
|
|
113
|
+
precision_type_ tan_reduced_first = (precision_type_(1.0) - flattening) * first_latitude.tan();
|
|
114
|
+
precision_type_ tan_reduced_second = (precision_type_(1.0) - flattening) * second_latitude.tan();
|
|
115
|
+
precision_type_ cos_reduced_first = precision_type_(1.0) /
|
|
116
|
+
(precision_type_(1.0) + tan_reduced_first * tan_reduced_first).sqrt();
|
|
117
|
+
precision_type_ sin_reduced_first = tan_reduced_first * cos_reduced_first;
|
|
118
|
+
precision_type_ cos_reduced_second =
|
|
119
|
+
precision_type_(1.0) / (precision_type_(1.0) + tan_reduced_second * tan_reduced_second).sqrt();
|
|
120
|
+
precision_type_ sin_reduced_second = tan_reduced_second * cos_reduced_second;
|
|
121
|
+
|
|
122
|
+
// Iterative convergence of lambda (difference in longitude on auxiliary sphere)
|
|
123
|
+
precision_type_ lambda = longitude_difference;
|
|
124
|
+
precision_type_ lambda_previous = longitude_difference;
|
|
125
|
+
precision_type_ sin_angular_distance, cos_angular_distance, angular_distance;
|
|
126
|
+
precision_type_ sin_azimuth, cos_squared_azimuth, cos_double_angular_midpoint;
|
|
127
|
+
bool coincident = false;
|
|
128
|
+
|
|
129
|
+
for (unsigned int iteration = 0; iteration < max_iterations; iteration++) {
|
|
130
|
+
precision_type_ sin_lambda = lambda.sin();
|
|
131
|
+
precision_type_ cos_lambda = lambda.cos();
|
|
132
|
+
|
|
133
|
+
precision_type_ cross_term = cos_reduced_second * sin_lambda;
|
|
134
|
+
precision_type_ mixed_term = cos_reduced_first * sin_reduced_second -
|
|
135
|
+
sin_reduced_first * cos_reduced_second * cos_lambda;
|
|
136
|
+
sin_angular_distance = (cross_term * cross_term + mixed_term * mixed_term).sqrt();
|
|
137
|
+
|
|
138
|
+
if (sin_angular_distance == precision_type_(0.0)) {
|
|
139
|
+
coincident = true;
|
|
140
|
+
break;
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
cos_angular_distance = sin_reduced_first * sin_reduced_second +
|
|
144
|
+
cos_reduced_first * cos_reduced_second * cos_lambda;
|
|
145
|
+
angular_distance = sin_angular_distance.atan2(cos_angular_distance);
|
|
146
|
+
|
|
147
|
+
sin_azimuth = cos_reduced_first * cos_reduced_second * sin_lambda / sin_angular_distance;
|
|
148
|
+
cos_squared_azimuth = precision_type_(1.0) - sin_azimuth * sin_azimuth;
|
|
149
|
+
|
|
150
|
+
// Handle equatorial geodesic case
|
|
151
|
+
cos_double_angular_midpoint = (cos_squared_azimuth != precision_type_(0.0))
|
|
152
|
+
? cos_angular_distance - precision_type_(2.0) * sin_reduced_first *
|
|
153
|
+
sin_reduced_second / cos_squared_azimuth
|
|
154
|
+
: precision_type_(0.0);
|
|
155
|
+
|
|
156
|
+
precision_type_ correction_factor =
|
|
157
|
+
flattening / precision_type_(16.0) * cos_squared_azimuth *
|
|
158
|
+
(precision_type_(4.0) +
|
|
159
|
+
flattening * (precision_type_(4.0) - precision_type_(3.0) * cos_squared_azimuth));
|
|
160
|
+
|
|
161
|
+
lambda_previous = lambda;
|
|
162
|
+
lambda = longitude_difference +
|
|
163
|
+
(precision_type_(1.0) - correction_factor) * flattening * sin_azimuth *
|
|
164
|
+
(angular_distance +
|
|
165
|
+
correction_factor * sin_angular_distance *
|
|
166
|
+
(cos_double_angular_midpoint +
|
|
167
|
+
correction_factor * cos_angular_distance *
|
|
168
|
+
(precision_type_(-1.0) + precision_type_(2.0) * cos_double_angular_midpoint *
|
|
169
|
+
cos_double_angular_midpoint)));
|
|
170
|
+
|
|
171
|
+
if ((lambda - lambda_previous).abs() < convergence_threshold) break;
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
if (coincident) {
|
|
175
|
+
results[i] = in_type_(0.0);
|
|
176
|
+
continue;
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
// Final distance calculation
|
|
180
|
+
precision_type_ u_squared = cos_squared_azimuth *
|
|
181
|
+
(equatorial_radius * equatorial_radius - polar_radius * polar_radius) /
|
|
182
|
+
(polar_radius * polar_radius);
|
|
183
|
+
precision_type_ series_a =
|
|
184
|
+
precision_type_(1.0) +
|
|
185
|
+
u_squared / precision_type_(16384.0) *
|
|
186
|
+
(precision_type_(4096.0) +
|
|
187
|
+
u_squared * (precision_type_(-768.0) +
|
|
188
|
+
u_squared * (precision_type_(320.0) - precision_type_(175.0) * u_squared)));
|
|
189
|
+
precision_type_ series_b = u_squared / precision_type_(1024.0) *
|
|
190
|
+
(precision_type_(256.0) +
|
|
191
|
+
u_squared *
|
|
192
|
+
(precision_type_(-128.0) +
|
|
193
|
+
u_squared * (precision_type_(74.0) - precision_type_(47.0) * u_squared)));
|
|
194
|
+
|
|
195
|
+
precision_type_ angular_correction =
|
|
196
|
+
series_b * sin_angular_distance *
|
|
197
|
+
(cos_double_angular_midpoint +
|
|
198
|
+
series_b / precision_type_(4.0) *
|
|
199
|
+
(cos_angular_distance *
|
|
200
|
+
(precision_type_(-1.0) +
|
|
201
|
+
precision_type_(2.0) * cos_double_angular_midpoint * cos_double_angular_midpoint) -
|
|
202
|
+
series_b / precision_type_(6.0) * cos_double_angular_midpoint *
|
|
203
|
+
(precision_type_(-3.0) + precision_type_(4.0) * sin_angular_distance * sin_angular_distance) *
|
|
204
|
+
(precision_type_(-3.0) +
|
|
205
|
+
precision_type_(4.0) * cos_double_angular_midpoint * cos_double_angular_midpoint)));
|
|
206
|
+
|
|
207
|
+
results[i] = in_type_(
|
|
208
|
+
static_cast<double>(polar_radius * series_a * (angular_distance - angular_correction)));
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
} // namespace ashvardanian::numkong
|
|
214
|
+
|
|
215
|
+
#include "numkong/tensor.hpp"
|
|
216
|
+
|
|
217
|
+
namespace ashvardanian::numkong {
|
|
218
|
+
|
|
219
|
+
template <numeric_dtype in_type_, numeric_dtype precision_type_ = in_type_, allow_simd_t allow_simd_ = prefer_simd_k>
|
|
220
|
+
void haversine(vector_view<in_type_> a_lats, vector_view<in_type_> a_lons, vector_view<in_type_> b_lats,
|
|
221
|
+
vector_view<in_type_> b_lons, in_type_ *results) noexcept {
|
|
222
|
+
haversine<in_type_, precision_type_, allow_simd_>(a_lats.data(), a_lons.data(), b_lats.data(), b_lons.data(),
|
|
223
|
+
a_lats.size(), results);
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
template <numeric_dtype in_type_, numeric_dtype precision_type_ = in_type_, allow_simd_t allow_simd_ = prefer_simd_k>
|
|
227
|
+
void vincenty(vector_view<in_type_> a_lats, vector_view<in_type_> a_lons, vector_view<in_type_> b_lats,
|
|
228
|
+
vector_view<in_type_> b_lons, in_type_ *results) noexcept {
|
|
229
|
+
vincenty<in_type_, precision_type_, allow_simd_>(a_lats.data(), a_lons.data(), b_lats.data(), b_lons.data(),
|
|
230
|
+
a_lats.size(), results);
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
} // namespace ashvardanian::numkong
|
|
234
|
+
|
|
235
|
+
#endif // NK_GEOSPATIAL_HPP
|