numkong 7.0.0 → 7.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +239 -122
- package/binding.gyp +25 -491
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
|
@@ -8,11 +8,11 @@
|
|
|
8
8
|
*
|
|
9
9
|
* @section geospatial_neon_instructions Key NEON Geospatial Instructions
|
|
10
10
|
*
|
|
11
|
-
* Intrinsic
|
|
12
|
-
* vfmaq_f32
|
|
13
|
-
* vfmaq_f64
|
|
14
|
-
* vsqrtq_f32
|
|
15
|
-
* vsqrtq_f64
|
|
11
|
+
* Intrinsic Instruction M1 Firestorm Graviton 3 Graviton 4
|
|
12
|
+
* vfmaq_f32 FMLA.S (vec) 4cy @ V0123 4cy @ V0123 4cy @ V0123
|
|
13
|
+
* vfmaq_f64 FMLA.D (vec) 4cy @ V0123 4cy @ V0123 4cy @ V0123
|
|
14
|
+
* vsqrtq_f32 FSQRT.S (vec) 10cy @ V02 10cy @ V02 9cy @ V02
|
|
15
|
+
* vsqrtq_f64 FSQRT.D (vec) 13cy @ V02 16cy @ V02 16cy @ V02
|
|
16
16
|
*/
|
|
17
17
|
#ifndef NK_GEOSPATIAL_NEON_H
|
|
18
18
|
#define NK_GEOSPATIAL_NEON_H
|
|
@@ -38,44 +38,48 @@ extern "C" {
|
|
|
38
38
|
* These require NEON trigonometric kernels from trigonometry/neon.h.
|
|
39
39
|
*/
|
|
40
40
|
|
|
41
|
-
NK_INTERNAL float64x2_t nk_haversine_f64x2_neon_(
|
|
42
|
-
float64x2_t
|
|
43
|
-
float64x2_t
|
|
41
|
+
NK_INTERNAL float64x2_t nk_haversine_f64x2_neon_( //
|
|
42
|
+
float64x2_t first_latitudes_f64x2, float64x2_t first_longitudes_f64x2, //
|
|
43
|
+
float64x2_t second_latitudes_f64x2, float64x2_t second_longitudes_f64x2) {
|
|
44
44
|
|
|
45
|
-
float64x2_t const
|
|
46
|
-
float64x2_t const
|
|
47
|
-
float64x2_t const
|
|
48
|
-
float64x2_t const
|
|
45
|
+
float64x2_t const earth_radius_f64x2 = vdupq_n_f64(NK_EARTH_MEDIATORIAL_RADIUS);
|
|
46
|
+
float64x2_t const half_f64x2 = vdupq_n_f64(0.5);
|
|
47
|
+
float64x2_t const one_f64x2 = vdupq_n_f64(1.0);
|
|
48
|
+
float64x2_t const two_f64x2 = vdupq_n_f64(2.0);
|
|
49
49
|
|
|
50
|
-
float64x2_t
|
|
51
|
-
float64x2_t
|
|
50
|
+
float64x2_t latitude_delta_f64x2 = vsubq_f64(second_latitudes_f64x2, first_latitudes_f64x2);
|
|
51
|
+
float64x2_t longitude_delta_f64x2 = vsubq_f64(second_longitudes_f64x2, first_longitudes_f64x2);
|
|
52
52
|
|
|
53
53
|
// Haversine terms: sin²(Δ/2)
|
|
54
|
-
float64x2_t
|
|
55
|
-
float64x2_t
|
|
56
|
-
float64x2_t
|
|
57
|
-
float64x2_t
|
|
58
|
-
float64x2_t
|
|
59
|
-
|
|
54
|
+
float64x2_t latitude_delta_half_f64x2 = vmulq_f64(latitude_delta_f64x2, half_f64x2);
|
|
55
|
+
float64x2_t longitude_delta_half_f64x2 = vmulq_f64(longitude_delta_f64x2, half_f64x2);
|
|
56
|
+
float64x2_t sin_latitude_delta_half_f64x2 = nk_sin_f64x2_neon_(latitude_delta_half_f64x2);
|
|
57
|
+
float64x2_t sin_longitude_delta_half_f64x2 = nk_sin_f64x2_neon_(longitude_delta_half_f64x2);
|
|
58
|
+
float64x2_t sin_squared_latitude_delta_half_f64x2 = vmulq_f64(sin_latitude_delta_half_f64x2,
|
|
59
|
+
sin_latitude_delta_half_f64x2);
|
|
60
|
+
float64x2_t sin_squared_longitude_delta_half_f64x2 = vmulq_f64(sin_longitude_delta_half_f64x2,
|
|
61
|
+
sin_longitude_delta_half_f64x2);
|
|
60
62
|
|
|
61
63
|
// Latitude cosine product
|
|
62
|
-
float64x2_t
|
|
63
|
-
float64x2_t
|
|
64
|
-
float64x2_t
|
|
64
|
+
float64x2_t cos_first_latitude_f64x2 = nk_cos_f64x2_neon_(first_latitudes_f64x2);
|
|
65
|
+
float64x2_t cos_second_latitude_f64x2 = nk_cos_f64x2_neon_(second_latitudes_f64x2);
|
|
66
|
+
float64x2_t cos_latitude_product_f64x2 = vmulq_f64(cos_first_latitude_f64x2, cos_second_latitude_f64x2);
|
|
65
67
|
|
|
66
68
|
// a = sin²(Δlat/2) + cos(lat1) × cos(lat2) × sin²(Δlon/2)
|
|
67
|
-
float64x2_t
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
69
|
+
float64x2_t haversine_term_f64x2 = vaddq_f64(
|
|
70
|
+
sin_squared_latitude_delta_half_f64x2,
|
|
71
|
+
vmulq_f64(cos_latitude_product_f64x2, sin_squared_longitude_delta_half_f64x2));
|
|
72
|
+
// Clamp haversine_term_f64x2 to [0, 1] to prevent NaN from sqrt of negative values
|
|
73
|
+
float64x2_t zero_f64x2 = vdupq_n_f64(0.0);
|
|
74
|
+
haversine_term_f64x2 = vmaxq_f64(zero_f64x2, vminq_f64(one_f64x2, haversine_term_f64x2));
|
|
72
75
|
|
|
73
76
|
// Central angle: c = 2 × atan2(√a, √(1-a))
|
|
74
|
-
float64x2_t
|
|
75
|
-
float64x2_t
|
|
76
|
-
float64x2_t
|
|
77
|
+
float64x2_t sqrt_haversine_f64x2 = vsqrtq_f64(haversine_term_f64x2);
|
|
78
|
+
float64x2_t sqrt_complement_f64x2 = vsqrtq_f64(vsubq_f64(one_f64x2, haversine_term_f64x2));
|
|
79
|
+
float64x2_t central_angle_f64x2 = vmulq_f64(two_f64x2,
|
|
80
|
+
nk_atan2_f64x2_neon_(sqrt_haversine_f64x2, sqrt_complement_f64x2));
|
|
77
81
|
|
|
78
|
-
return vmulq_f64(
|
|
82
|
+
return vmulq_f64(earth_radius_f64x2, central_angle_f64x2);
|
|
79
83
|
}
|
|
80
84
|
|
|
81
85
|
NK_PUBLIC void nk_haversine_f64_neon( //
|
|
@@ -84,14 +88,14 @@ NK_PUBLIC void nk_haversine_f64_neon( //
|
|
|
84
88
|
nk_size_t n, nk_f64_t *results) {
|
|
85
89
|
|
|
86
90
|
while (n >= 2) {
|
|
87
|
-
float64x2_t
|
|
88
|
-
float64x2_t
|
|
89
|
-
float64x2_t
|
|
90
|
-
float64x2_t
|
|
91
|
+
float64x2_t first_latitudes_f64x2 = vld1q_f64(a_lats);
|
|
92
|
+
float64x2_t first_longitudes_f64x2 = vld1q_f64(a_lons);
|
|
93
|
+
float64x2_t second_latitudes_f64x2 = vld1q_f64(b_lats);
|
|
94
|
+
float64x2_t second_longitudes_f64x2 = vld1q_f64(b_lons);
|
|
91
95
|
|
|
92
|
-
float64x2_t
|
|
93
|
-
|
|
94
|
-
vst1q_f64(results,
|
|
96
|
+
float64x2_t distances_f64x2 = nk_haversine_f64x2_neon_(first_latitudes_f64x2, first_longitudes_f64x2,
|
|
97
|
+
second_latitudes_f64x2, second_longitudes_f64x2);
|
|
98
|
+
vst1q_f64(results, distances_f64x2);
|
|
95
99
|
|
|
96
100
|
a_lats += 2, a_lons += 2, b_lats += 2, b_lons += 2, results += 2, n -= 2;
|
|
97
101
|
}
|
|
@@ -103,52 +107,56 @@ NK_PUBLIC void nk_haversine_f64_neon( //
|
|
|
103
107
|
nk_partial_load_b64x2_serial_(a_lons, &a_lon_vec, n);
|
|
104
108
|
nk_partial_load_b64x2_serial_(b_lats, &b_lat_vec, n);
|
|
105
109
|
nk_partial_load_b64x2_serial_(b_lons, &b_lon_vec, n);
|
|
106
|
-
float64x2_t
|
|
107
|
-
|
|
108
|
-
result_vec.f64x2 =
|
|
110
|
+
float64x2_t distances_f64x2 = nk_haversine_f64x2_neon_(a_lat_vec.f64x2, a_lon_vec.f64x2, b_lat_vec.f64x2,
|
|
111
|
+
b_lon_vec.f64x2);
|
|
112
|
+
result_vec.f64x2 = distances_f64x2;
|
|
109
113
|
nk_partial_store_b64x2_serial_(&result_vec, results, n);
|
|
110
114
|
}
|
|
111
115
|
}
|
|
112
116
|
|
|
113
|
-
NK_INTERNAL float32x4_t nk_haversine_f32x4_neon_(
|
|
114
|
-
float32x4_t
|
|
115
|
-
float32x4_t
|
|
117
|
+
NK_INTERNAL float32x4_t nk_haversine_f32x4_neon_( //
|
|
118
|
+
float32x4_t first_latitudes_f32x4, float32x4_t first_longitudes_f32x4, //
|
|
119
|
+
float32x4_t second_latitudes_f32x4, float32x4_t second_longitudes_f32x4) {
|
|
116
120
|
|
|
117
|
-
float32x4_t const
|
|
118
|
-
float32x4_t const
|
|
119
|
-
float32x4_t const
|
|
120
|
-
float32x4_t const
|
|
121
|
+
float32x4_t const earth_radius_f32x4 = vdupq_n_f32((float)NK_EARTH_MEDIATORIAL_RADIUS);
|
|
122
|
+
float32x4_t const half_f32x4 = vdupq_n_f32(0.5f);
|
|
123
|
+
float32x4_t const one_f32x4 = vdupq_n_f32(1.0f);
|
|
124
|
+
float32x4_t const two_f32x4 = vdupq_n_f32(2.0f);
|
|
121
125
|
|
|
122
|
-
float32x4_t
|
|
123
|
-
float32x4_t
|
|
126
|
+
float32x4_t latitude_delta_f32x4 = vsubq_f32(second_latitudes_f32x4, first_latitudes_f32x4);
|
|
127
|
+
float32x4_t longitude_delta_f32x4 = vsubq_f32(second_longitudes_f32x4, first_longitudes_f32x4);
|
|
124
128
|
|
|
125
129
|
// Haversine terms: sin²(Δ/2)
|
|
126
|
-
float32x4_t
|
|
127
|
-
float32x4_t
|
|
128
|
-
float32x4_t
|
|
129
|
-
float32x4_t
|
|
130
|
-
float32x4_t
|
|
131
|
-
|
|
130
|
+
float32x4_t latitude_delta_half_f32x4 = vmulq_f32(latitude_delta_f32x4, half_f32x4);
|
|
131
|
+
float32x4_t longitude_delta_half_f32x4 = vmulq_f32(longitude_delta_f32x4, half_f32x4);
|
|
132
|
+
float32x4_t sin_latitude_delta_half_f32x4 = nk_sin_f32x4_neon_(latitude_delta_half_f32x4);
|
|
133
|
+
float32x4_t sin_longitude_delta_half_f32x4 = nk_sin_f32x4_neon_(longitude_delta_half_f32x4);
|
|
134
|
+
float32x4_t sin_squared_latitude_delta_half_f32x4 = vmulq_f32(sin_latitude_delta_half_f32x4,
|
|
135
|
+
sin_latitude_delta_half_f32x4);
|
|
136
|
+
float32x4_t sin_squared_longitude_delta_half_f32x4 = vmulq_f32(sin_longitude_delta_half_f32x4,
|
|
137
|
+
sin_longitude_delta_half_f32x4);
|
|
132
138
|
|
|
133
139
|
// Latitude cosine product
|
|
134
|
-
float32x4_t
|
|
135
|
-
float32x4_t
|
|
136
|
-
float32x4_t
|
|
140
|
+
float32x4_t cos_first_latitude_f32x4 = nk_cos_f32x4_neon_(first_latitudes_f32x4);
|
|
141
|
+
float32x4_t cos_second_latitude_f32x4 = nk_cos_f32x4_neon_(second_latitudes_f32x4);
|
|
142
|
+
float32x4_t cos_latitude_product_f32x4 = vmulq_f32(cos_first_latitude_f32x4, cos_second_latitude_f32x4);
|
|
137
143
|
|
|
138
144
|
// a = sin²(Δlat/2) + cos(lat1) × cos(lat2) × sin²(Δlon/2)
|
|
139
|
-
float32x4_t
|
|
140
|
-
|
|
145
|
+
float32x4_t haversine_term_f32x4 = vaddq_f32(
|
|
146
|
+
sin_squared_latitude_delta_half_f32x4,
|
|
147
|
+
vmulq_f32(cos_latitude_product_f32x4, sin_squared_longitude_delta_half_f32x4));
|
|
141
148
|
|
|
142
149
|
// Clamp to [0, 1] to avoid NaN from sqrt of negative numbers (due to floating point errors)
|
|
143
|
-
float32x4_t
|
|
144
|
-
|
|
150
|
+
float32x4_t zero_f32x4 = vdupq_n_f32(0.0f);
|
|
151
|
+
haversine_term_f32x4 = vmaxq_f32(zero_f32x4, vminq_f32(one_f32x4, haversine_term_f32x4));
|
|
145
152
|
|
|
146
153
|
// Central angle: c = 2 × atan2(√a, √(1-a))
|
|
147
|
-
float32x4_t
|
|
148
|
-
float32x4_t
|
|
149
|
-
float32x4_t
|
|
154
|
+
float32x4_t sqrt_haversine_f32x4 = vsqrtq_f32(haversine_term_f32x4);
|
|
155
|
+
float32x4_t sqrt_complement_f32x4 = vsqrtq_f32(vsubq_f32(one_f32x4, haversine_term_f32x4));
|
|
156
|
+
float32x4_t central_angle_f32x4 = vmulq_f32(two_f32x4,
|
|
157
|
+
nk_atan2_f32x4_neon_(sqrt_haversine_f32x4, sqrt_complement_f32x4));
|
|
150
158
|
|
|
151
|
-
return vmulq_f32(
|
|
159
|
+
return vmulq_f32(earth_radius_f32x4, central_angle_f32x4);
|
|
152
160
|
}
|
|
153
161
|
|
|
154
162
|
NK_PUBLIC void nk_haversine_f32_neon( //
|
|
@@ -157,14 +165,14 @@ NK_PUBLIC void nk_haversine_f32_neon( //
|
|
|
157
165
|
nk_size_t n, nk_f32_t *results) {
|
|
158
166
|
|
|
159
167
|
while (n >= 4) {
|
|
160
|
-
float32x4_t
|
|
161
|
-
float32x4_t
|
|
162
|
-
float32x4_t
|
|
163
|
-
float32x4_t
|
|
168
|
+
float32x4_t first_latitudes_f32x4 = vld1q_f32(a_lats);
|
|
169
|
+
float32x4_t first_longitudes_f32x4 = vld1q_f32(a_lons);
|
|
170
|
+
float32x4_t second_latitudes_f32x4 = vld1q_f32(b_lats);
|
|
171
|
+
float32x4_t second_longitudes_f32x4 = vld1q_f32(b_lons);
|
|
164
172
|
|
|
165
|
-
float32x4_t
|
|
166
|
-
|
|
167
|
-
vst1q_f32(results,
|
|
173
|
+
float32x4_t distances_f32x4 = nk_haversine_f32x4_neon_(first_latitudes_f32x4, first_longitudes_f32x4,
|
|
174
|
+
second_latitudes_f32x4, second_longitudes_f32x4);
|
|
175
|
+
vst1q_f32(results, distances_f32x4);
|
|
168
176
|
|
|
169
177
|
a_lats += 4, a_lons += 4, b_lats += 4, b_lons += 4, results += 4, n -= 4;
|
|
170
178
|
}
|
|
@@ -176,9 +184,9 @@ NK_PUBLIC void nk_haversine_f32_neon( //
|
|
|
176
184
|
nk_partial_load_b32x4_serial_(a_lons, &a_lon_vec, n);
|
|
177
185
|
nk_partial_load_b32x4_serial_(b_lats, &b_lat_vec, n);
|
|
178
186
|
nk_partial_load_b32x4_serial_(b_lons, &b_lon_vec, n);
|
|
179
|
-
float32x4_t
|
|
180
|
-
|
|
181
|
-
result_vec.f32x4 =
|
|
187
|
+
float32x4_t distances_f32x4 = nk_haversine_f32x4_neon_(a_lat_vec.f32x4, a_lon_vec.f32x4, b_lat_vec.f32x4,
|
|
188
|
+
b_lon_vec.f32x4);
|
|
189
|
+
result_vec.f32x4 = distances_f32x4;
|
|
182
190
|
nk_partial_store_b32x4_serial_(&result_vec, results, n);
|
|
183
191
|
}
|
|
184
192
|
}
|
|
@@ -187,158 +195,176 @@ NK_PUBLIC void nk_haversine_f32_neon( //
|
|
|
187
195
|
* @brief NEON helper for Vincenty's geodesic distance on 2 f64 point pairs.
|
|
188
196
|
* @note This is a true SIMD implementation using masked convergence tracking via blending.
|
|
189
197
|
*/
|
|
190
|
-
NK_INTERNAL float64x2_t nk_vincenty_f64x2_neon_(
|
|
191
|
-
float64x2_t
|
|
192
|
-
float64x2_t
|
|
193
|
-
|
|
194
|
-
float64x2_t const
|
|
195
|
-
float64x2_t const
|
|
196
|
-
float64x2_t const
|
|
197
|
-
float64x2_t const
|
|
198
|
-
float64x2_t const
|
|
199
|
-
float64x2_t const
|
|
200
|
-
float64x2_t const
|
|
201
|
-
float64x2_t const
|
|
202
|
-
float64x2_t const
|
|
203
|
-
float64x2_t const
|
|
204
|
-
float64x2_t const
|
|
198
|
+
NK_INTERNAL float64x2_t nk_vincenty_f64x2_neon_( //
|
|
199
|
+
float64x2_t first_latitudes_f64x2, float64x2_t first_longitudes_f64x2, //
|
|
200
|
+
float64x2_t second_latitudes_f64x2, float64x2_t second_longitudes_f64x2) {
|
|
201
|
+
|
|
202
|
+
float64x2_t const equatorial_radius_f64x2 = vdupq_n_f64(NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
|
|
203
|
+
float64x2_t const polar_radius_f64x2 = vdupq_n_f64(NK_EARTH_ELLIPSOID_POLAR_RADIUS);
|
|
204
|
+
float64x2_t const flattening_f64x2 = vdupq_n_f64(1.0 / NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
|
|
205
|
+
float64x2_t const convergence_threshold_f64x2 = vdupq_n_f64(NK_VINCENTY_CONVERGENCE_THRESHOLD_F64);
|
|
206
|
+
float64x2_t const one_f64x2 = vdupq_n_f64(1.0);
|
|
207
|
+
float64x2_t const two_f64x2 = vdupq_n_f64(2.0);
|
|
208
|
+
float64x2_t const three_f64x2 = vdupq_n_f64(3.0);
|
|
209
|
+
float64x2_t const four_f64x2 = vdupq_n_f64(4.0);
|
|
210
|
+
float64x2_t const six_f64x2 = vdupq_n_f64(6.0);
|
|
211
|
+
float64x2_t const sixteen_f64x2 = vdupq_n_f64(16.0);
|
|
212
|
+
float64x2_t const epsilon_f64x2 = vdupq_n_f64(1e-15);
|
|
205
213
|
|
|
206
214
|
// Longitude difference
|
|
207
|
-
float64x2_t
|
|
215
|
+
float64x2_t longitude_difference_f64x2 = vsubq_f64(second_longitudes_f64x2, first_longitudes_f64x2);
|
|
208
216
|
|
|
209
217
|
// Reduced latitudes: tan(U) = (1-f) * tan(lat)
|
|
210
|
-
float64x2_t
|
|
211
|
-
float64x2_t
|
|
212
|
-
|
|
213
|
-
float64x2_t
|
|
214
|
-
|
|
218
|
+
float64x2_t one_minus_f_f64x2 = vsubq_f64(one_f64x2, flattening_f64x2);
|
|
219
|
+
float64x2_t tan_first_f64x2 = vdivq_f64(nk_sin_f64x2_neon_(first_latitudes_f64x2),
|
|
220
|
+
nk_cos_f64x2_neon_(first_latitudes_f64x2));
|
|
221
|
+
float64x2_t tan_second_f64x2 = vdivq_f64(nk_sin_f64x2_neon_(second_latitudes_f64x2),
|
|
222
|
+
nk_cos_f64x2_neon_(second_latitudes_f64x2));
|
|
223
|
+
float64x2_t tan_reduced_first_f64x2 = vmulq_f64(one_minus_f_f64x2, tan_first_f64x2);
|
|
224
|
+
float64x2_t tan_reduced_second_f64x2 = vmulq_f64(one_minus_f_f64x2, tan_second_f64x2);
|
|
215
225
|
|
|
216
226
|
// cos(U) = 1/√(1 + tan²(U)), sin(U) = tan(U) × cos(U)
|
|
217
|
-
float64x2_t
|
|
218
|
-
|
|
219
|
-
float64x2_t
|
|
220
|
-
float64x2_t
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
float64x2_t
|
|
227
|
+
float64x2_t cos_reduced_first_f64x2 = vdivq_f64(
|
|
228
|
+
one_f64x2, vsqrtq_f64(vfmaq_f64(one_f64x2, tan_reduced_first_f64x2, tan_reduced_first_f64x2)));
|
|
229
|
+
float64x2_t sin_reduced_first_f64x2 = vmulq_f64(tan_reduced_first_f64x2, cos_reduced_first_f64x2);
|
|
230
|
+
float64x2_t cos_reduced_second_f64x2 = vdivq_f64(
|
|
231
|
+
one_f64x2, vsqrtq_f64(vfmaq_f64(one_f64x2, tan_reduced_second_f64x2, tan_reduced_second_f64x2)));
|
|
232
|
+
float64x2_t sin_reduced_second_f64x2 = vmulq_f64(tan_reduced_second_f64x2, cos_reduced_second_f64x2);
|
|
233
|
+
|
|
234
|
+
// Initialize lambda_f64x2 and tracking variables
|
|
235
|
+
float64x2_t lambda_f64x2 = longitude_difference_f64x2;
|
|
236
|
+
float64x2_t sin_angular_distance_f64x2, cos_angular_distance_f64x2, angular_distance_f64x2;
|
|
237
|
+
float64x2_t sin_azimuth_f64x2, cos_squared_azimuth_f64x2, cos_double_angular_midpoint_f64x2;
|
|
226
238
|
|
|
227
239
|
// Track convergence and coincident points using masks
|
|
228
|
-
uint64x2_t
|
|
229
|
-
uint64x2_t
|
|
240
|
+
uint64x2_t converged_mask_u64x2 = vdupq_n_u64(0);
|
|
241
|
+
uint64x2_t coincident_mask_u64x2 = vdupq_n_u64(0);
|
|
230
242
|
|
|
231
243
|
for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS; ++iteration) {
|
|
232
244
|
// Check if all lanes converged
|
|
233
|
-
|
|
245
|
+
nk_u64_t converged_bits = vgetq_lane_u64(converged_mask_u64x2, 0) & vgetq_lane_u64(converged_mask_u64x2, 1);
|
|
234
246
|
if (converged_bits) break;
|
|
235
247
|
|
|
236
|
-
float64x2_t
|
|
237
|
-
float64x2_t
|
|
248
|
+
float64x2_t sin_lambda_f64x2 = nk_sin_f64x2_neon_(lambda_f64x2);
|
|
249
|
+
float64x2_t cos_lambda_f64x2 = nk_cos_f64x2_neon_(lambda_f64x2);
|
|
238
250
|
|
|
239
|
-
// sin²(
|
|
240
|
-
float64x2_t
|
|
241
|
-
float64x2_t
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
251
|
+
// sin²(angular_distance_f64x2) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
|
|
252
|
+
float64x2_t cross_term_f64x2 = vmulq_f64(cos_reduced_second_f64x2, sin_lambda_f64x2);
|
|
253
|
+
float64x2_t mixed_term_f64x2 = vsubq_f64(
|
|
254
|
+
vmulq_f64(cos_reduced_first_f64x2, sin_reduced_second_f64x2),
|
|
255
|
+
vmulq_f64(vmulq_f64(sin_reduced_first_f64x2, cos_reduced_second_f64x2), cos_lambda_f64x2));
|
|
256
|
+
float64x2_t sin_angular_dist_sq_f64x2 = vfmaq_f64(vmulq_f64(mixed_term_f64x2, mixed_term_f64x2),
|
|
257
|
+
cross_term_f64x2, cross_term_f64x2);
|
|
258
|
+
sin_angular_distance_f64x2 = vsqrtq_f64(sin_angular_dist_sq_f64x2);
|
|
245
259
|
|
|
246
|
-
// Check for coincident points (
|
|
247
|
-
|
|
260
|
+
// Check for coincident points (sin_angular_distance_f64x2 ≈ 0)
|
|
261
|
+
coincident_mask_u64x2 = vcltq_f64(sin_angular_distance_f64x2, epsilon_f64x2);
|
|
248
262
|
|
|
249
|
-
// cos(
|
|
250
|
-
|
|
251
|
-
|
|
263
|
+
// cos(angular_distance_f64x2) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
|
|
264
|
+
cos_angular_distance_f64x2 = vfmaq_f64(vmulq_f64(sin_reduced_first_f64x2, sin_reduced_second_f64x2),
|
|
265
|
+
vmulq_f64(cos_reduced_first_f64x2, cos_reduced_second_f64x2),
|
|
266
|
+
cos_lambda_f64x2);
|
|
252
267
|
|
|
253
|
-
//
|
|
254
|
-
|
|
268
|
+
// angular_distance_f64x2 = atan2(sin, cos)
|
|
269
|
+
angular_distance_f64x2 = nk_atan2_f64x2_neon_(sin_angular_distance_f64x2, cos_angular_distance_f64x2);
|
|
255
270
|
|
|
256
|
-
// sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(
|
|
271
|
+
// sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance_f64x2)
|
|
257
272
|
// Avoid division by zero by using blending
|
|
258
|
-
float64x2_t
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
273
|
+
float64x2_t safe_sin_angular_f64x2 = vbslq_f64(coincident_mask_u64x2, one_f64x2, sin_angular_distance_f64x2);
|
|
274
|
+
sin_azimuth_f64x2 = vdivq_f64(
|
|
275
|
+
vmulq_f64(vmulq_f64(cos_reduced_first_f64x2, cos_reduced_second_f64x2), sin_lambda_f64x2),
|
|
276
|
+
safe_sin_angular_f64x2);
|
|
277
|
+
cos_squared_azimuth_f64x2 = vsubq_f64(one_f64x2, vmulq_f64(sin_azimuth_f64x2, sin_azimuth_f64x2));
|
|
262
278
|
|
|
263
279
|
// Handle equatorial case: cos²α ≈ 0
|
|
264
|
-
uint64x2_t
|
|
265
|
-
float64x2_t
|
|
280
|
+
uint64x2_t equatorial_mask_u64x2 = vcltq_f64(cos_squared_azimuth_f64x2, epsilon_f64x2);
|
|
281
|
+
float64x2_t safe_cos_sq_azimuth_f64x2 = vbslq_f64(equatorial_mask_u64x2, one_f64x2, cos_squared_azimuth_f64x2);
|
|
266
282
|
|
|
267
283
|
// cos(2σₘ) = cos(σ) - 2 × sin(U₁) × sin(U₂) / cos²(α)
|
|
268
|
-
float64x2_t
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
284
|
+
float64x2_t sin_product_f64x2 = vmulq_f64(sin_reduced_first_f64x2, sin_reduced_second_f64x2);
|
|
285
|
+
cos_double_angular_midpoint_f64x2 = vsubq_f64(
|
|
286
|
+
cos_angular_distance_f64x2, vdivq_f64(vmulq_f64(two_f64x2, sin_product_f64x2), safe_cos_sq_azimuth_f64x2));
|
|
287
|
+
cos_double_angular_midpoint_f64x2 = vbslq_f64(equatorial_mask_u64x2, vdupq_n_f64(0.0),
|
|
288
|
+
cos_double_angular_midpoint_f64x2);
|
|
272
289
|
|
|
273
290
|
// C = f/16 * cos²α * (4 + f*(4 - 3*cos²α))
|
|
274
|
-
float64x2_t
|
|
275
|
-
vdivq_f64(
|
|
276
|
-
vmulq_f64(
|
|
291
|
+
float64x2_t correction_factor_f64x2 = vmulq_f64(
|
|
292
|
+
vdivq_f64(flattening_f64x2, sixteen_f64x2),
|
|
293
|
+
vmulq_f64(cos_squared_azimuth_f64x2,
|
|
294
|
+
vfmaq_f64(four_f64x2, flattening_f64x2,
|
|
295
|
+
vfmsq_f64(four_f64x2, three_f64x2, cos_squared_azimuth_f64x2))));
|
|
277
296
|
|
|
278
297
|
// λ' = L + (1-C) × f × sin(α) × (σ + C × sin(σ) × (cos(2σₘ) + C × cos(σ) × (-1 + 2 × cos²(2σₘ))))
|
|
279
|
-
float64x2_t
|
|
280
|
-
//
|
|
281
|
-
float64x2_t
|
|
282
|
-
//
|
|
283
|
-
float64x2_t
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
298
|
+
float64x2_t cos_2sm_sq_f64x2 = vmulq_f64(cos_double_angular_midpoint_f64x2, cos_double_angular_midpoint_f64x2);
|
|
299
|
+
// innermost_f64x2 = -1 + 2 × cos²(2σₘ)
|
|
300
|
+
float64x2_t innermost_f64x2 = vfmaq_f64(vdupq_n_f64(-1.0), two_f64x2, cos_2sm_sq_f64x2);
|
|
301
|
+
// middle_f64x2 = cos(2σₘ) + C × cos(σ) × innermost_f64x2
|
|
302
|
+
float64x2_t middle_f64x2 = vfmaq_f64(cos_double_angular_midpoint_f64x2,
|
|
303
|
+
vmulq_f64(correction_factor_f64x2, cos_angular_distance_f64x2),
|
|
304
|
+
innermost_f64x2);
|
|
305
|
+
// inner_f64x2 = C × sin(σ) × middle_f64x2
|
|
306
|
+
float64x2_t inner_f64x2 = vmulq_f64(vmulq_f64(correction_factor_f64x2, sin_angular_distance_f64x2),
|
|
307
|
+
middle_f64x2);
|
|
308
|
+
|
|
309
|
+
// λ' = L + (1-C) * f * sin_α * (σ + inner_f64x2)
|
|
310
|
+
float64x2_t lambda_new_f64x2 = vfmaq_f64(
|
|
311
|
+
longitude_difference_f64x2,
|
|
312
|
+
vmulq_f64(vmulq_f64(vsubq_f64(one_f64x2, correction_factor_f64x2), flattening_f64x2), sin_azimuth_f64x2),
|
|
313
|
+
vaddq_f64(angular_distance_f64x2, inner_f64x2));
|
|
292
314
|
|
|
293
315
|
// Check convergence: |λ - λ'| < threshold
|
|
294
|
-
float64x2_t
|
|
295
|
-
float64x2_t
|
|
296
|
-
uint64x2_t
|
|
297
|
-
|
|
316
|
+
float64x2_t lambda_diff_f64x2 = vsubq_f64(lambda_new_f64x2, lambda_f64x2);
|
|
317
|
+
float64x2_t lambda_diff_abs_f64x2 = vabsq_f64(lambda_diff_f64x2);
|
|
318
|
+
uint64x2_t newly_converged_u64x2 = vcltq_f64(lambda_diff_abs_f64x2, convergence_threshold_f64x2);
|
|
319
|
+
converged_mask_u64x2 = vorrq_u64(converged_mask_u64x2, newly_converged_u64x2);
|
|
298
320
|
|
|
299
|
-
// Only update
|
|
300
|
-
|
|
321
|
+
// Only update lambda_f64x2 for non-converged lanes
|
|
322
|
+
lambda_f64x2 = vbslq_f64(converged_mask_u64x2, lambda_f64x2, lambda_new_f64x2);
|
|
301
323
|
}
|
|
302
324
|
|
|
303
325
|
// Final distance calculation
|
|
304
326
|
// u² = cos²α * (a² - b²) / b²
|
|
305
|
-
float64x2_t
|
|
306
|
-
float64x2_t
|
|
307
|
-
float64x2_t
|
|
327
|
+
float64x2_t a_sq_f64x2 = vmulq_f64(equatorial_radius_f64x2, equatorial_radius_f64x2);
|
|
328
|
+
float64x2_t b_sq_f64x2 = vmulq_f64(polar_radius_f64x2, polar_radius_f64x2);
|
|
329
|
+
float64x2_t u_squared_f64x2 = vdivq_f64(vmulq_f64(cos_squared_azimuth_f64x2, vsubq_f64(a_sq_f64x2, b_sq_f64x2)),
|
|
330
|
+
b_sq_f64x2);
|
|
308
331
|
|
|
309
332
|
// A = 1 + u²/16384 * (4096 + u²*(-768 + u²*(320 - 175*u²)))
|
|
310
|
-
float64x2_t
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
333
|
+
float64x2_t series_a_f64x2 = vfmaq_f64(vdupq_n_f64(320.0), u_squared_f64x2, vdupq_n_f64(-175.0));
|
|
334
|
+
series_a_f64x2 = vfmaq_f64(vdupq_n_f64(-768.0), u_squared_f64x2, series_a_f64x2);
|
|
335
|
+
series_a_f64x2 = vfmaq_f64(vdupq_n_f64(4096.0), u_squared_f64x2, series_a_f64x2);
|
|
336
|
+
series_a_f64x2 = vfmaq_f64(one_f64x2, vdivq_f64(u_squared_f64x2, vdupq_n_f64(16384.0)), series_a_f64x2);
|
|
314
337
|
|
|
315
338
|
// B = u²/1024 * (256 + u²*(-128 + u²*(74 - 47*u²)))
|
|
316
|
-
float64x2_t
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
339
|
+
float64x2_t series_b_f64x2 = vfmaq_f64(vdupq_n_f64(74.0), u_squared_f64x2, vdupq_n_f64(-47.0));
|
|
340
|
+
series_b_f64x2 = vfmaq_f64(vdupq_n_f64(-128.0), u_squared_f64x2, series_b_f64x2);
|
|
341
|
+
series_b_f64x2 = vfmaq_f64(vdupq_n_f64(256.0), u_squared_f64x2, series_b_f64x2);
|
|
342
|
+
series_b_f64x2 = vmulq_f64(vdivq_f64(u_squared_f64x2, vdupq_n_f64(1024.0)), series_b_f64x2);
|
|
320
343
|
|
|
321
344
|
// Δσ = B × sin(σ) × (cos(2σₘ) + B/4 × (cos(σ) × (-1 + 2 × cos²(2σₘ)) - B/6 × cos(2σₘ) × (-3 + 4 × sin²(σ)) × (-3 +
|
|
322
345
|
// 4 × cos²(2σₘ))))
|
|
323
|
-
float64x2_t
|
|
324
|
-
float64x2_t
|
|
325
|
-
float64x2_t
|
|
326
|
-
|
|
327
|
-
float64x2_t
|
|
328
|
-
float64x2_t
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
346
|
+
float64x2_t cos_2sm_sq_f64x2 = vmulq_f64(cos_double_angular_midpoint_f64x2, cos_double_angular_midpoint_f64x2);
|
|
347
|
+
float64x2_t sin_sq_f64x2 = vmulq_f64(sin_angular_distance_f64x2, sin_angular_distance_f64x2);
|
|
348
|
+
float64x2_t term1_f64x2 = vfmaq_f64(vdupq_n_f64(-1.0), two_f64x2, cos_2sm_sq_f64x2);
|
|
349
|
+
term1_f64x2 = vmulq_f64(cos_angular_distance_f64x2, term1_f64x2);
|
|
350
|
+
float64x2_t term2_f64x2 = vfmaq_f64(vdupq_n_f64(-3.0), four_f64x2, sin_sq_f64x2);
|
|
351
|
+
float64x2_t term3_f64x2 = vfmaq_f64(vdupq_n_f64(-3.0), four_f64x2, cos_2sm_sq_f64x2);
|
|
352
|
+
term2_f64x2 = vmulq_f64(vmulq_f64(vdivq_f64(series_b_f64x2, six_f64x2), cos_double_angular_midpoint_f64x2),
|
|
353
|
+
vmulq_f64(term2_f64x2, term3_f64x2));
|
|
354
|
+
float64x2_t delta_sigma_f64x2 = vmulq_f64(
|
|
355
|
+
series_b_f64x2,
|
|
356
|
+
vmulq_f64(sin_angular_distance_f64x2,
|
|
357
|
+
vaddq_f64(cos_double_angular_midpoint_f64x2,
|
|
358
|
+
vmulq_f64(vdivq_f64(series_b_f64x2, four_f64x2), vsubq_f64(term1_f64x2, term2_f64x2)))));
|
|
334
359
|
|
|
335
360
|
// s = b * A * (σ - Δσ)
|
|
336
|
-
float64x2_t
|
|
361
|
+
float64x2_t distances_f64x2 = vmulq_f64(vmulq_f64(polar_radius_f64x2, series_a_f64x2),
|
|
362
|
+
vsubq_f64(angular_distance_f64x2, delta_sigma_f64x2));
|
|
337
363
|
|
|
338
364
|
// Set coincident points to zero
|
|
339
|
-
|
|
365
|
+
distances_f64x2 = vbslq_f64(coincident_mask_u64x2, vdupq_n_f64(0.0), distances_f64x2);
|
|
340
366
|
|
|
341
|
-
return
|
|
367
|
+
return distances_f64x2;
|
|
342
368
|
}
|
|
343
369
|
|
|
344
370
|
NK_PUBLIC void nk_vincenty_f64_neon( //
|
|
@@ -347,14 +373,14 @@ NK_PUBLIC void nk_vincenty_f64_neon( //
|
|
|
347
373
|
nk_size_t n, nk_f64_t *results) {
|
|
348
374
|
|
|
349
375
|
while (n >= 2) {
|
|
350
|
-
float64x2_t
|
|
351
|
-
float64x2_t
|
|
352
|
-
float64x2_t
|
|
353
|
-
float64x2_t
|
|
376
|
+
float64x2_t first_latitudes_f64x2 = vld1q_f64(a_lats);
|
|
377
|
+
float64x2_t first_longitudes_f64x2 = vld1q_f64(a_lons);
|
|
378
|
+
float64x2_t second_latitudes_f64x2 = vld1q_f64(b_lats);
|
|
379
|
+
float64x2_t second_longitudes_f64x2 = vld1q_f64(b_lons);
|
|
354
380
|
|
|
355
|
-
float64x2_t
|
|
356
|
-
|
|
357
|
-
vst1q_f64(results,
|
|
381
|
+
float64x2_t distances_f64x2 = nk_vincenty_f64x2_neon_(first_latitudes_f64x2, first_longitudes_f64x2,
|
|
382
|
+
second_latitudes_f64x2, second_longitudes_f64x2);
|
|
383
|
+
vst1q_f64(results, distances_f64x2);
|
|
358
384
|
|
|
359
385
|
a_lats += 2, a_lons += 2, b_lats += 2, b_lons += 2, results += 2, n -= 2;
|
|
360
386
|
}
|
|
@@ -366,9 +392,9 @@ NK_PUBLIC void nk_vincenty_f64_neon( //
|
|
|
366
392
|
nk_partial_load_b64x2_serial_(a_lons, &a_lon_vec, n);
|
|
367
393
|
nk_partial_load_b64x2_serial_(b_lats, &b_lat_vec, n);
|
|
368
394
|
nk_partial_load_b64x2_serial_(b_lons, &b_lon_vec, n);
|
|
369
|
-
float64x2_t
|
|
370
|
-
|
|
371
|
-
result_vec.f64x2 =
|
|
395
|
+
float64x2_t distances_f64x2 = nk_vincenty_f64x2_neon_(a_lat_vec.f64x2, a_lon_vec.f64x2, b_lat_vec.f64x2,
|
|
396
|
+
b_lon_vec.f64x2);
|
|
397
|
+
result_vec.f64x2 = distances_f64x2;
|
|
372
398
|
nk_partial_store_b64x2_serial_(&result_vec, results, n);
|
|
373
399
|
}
|
|
374
400
|
}
|
|
@@ -377,151 +403,169 @@ NK_PUBLIC void nk_vincenty_f64_neon( //
|
|
|
377
403
|
* @brief NEON helper for Vincenty's geodesic distance on 4 f32 point pairs.
|
|
378
404
|
* @note This is a true SIMD implementation using masked convergence tracking via blending.
|
|
379
405
|
*/
|
|
380
|
-
NK_INTERNAL float32x4_t nk_vincenty_f32x4_neon_(
|
|
381
|
-
float32x4_t
|
|
382
|
-
float32x4_t
|
|
383
|
-
|
|
384
|
-
float32x4_t const
|
|
385
|
-
float32x4_t const
|
|
386
|
-
float32x4_t const
|
|
387
|
-
float32x4_t const
|
|
388
|
-
float32x4_t const
|
|
389
|
-
float32x4_t const
|
|
390
|
-
float32x4_t const
|
|
391
|
-
float32x4_t const
|
|
392
|
-
float32x4_t const
|
|
393
|
-
float32x4_t const
|
|
394
|
-
float32x4_t const
|
|
406
|
+
NK_INTERNAL float32x4_t nk_vincenty_f32x4_neon_(                            //
    float32x4_t first_latitudes_f32x4, float32x4_t first_longitudes_f32x4,  //
    float32x4_t second_latitudes_f32x4, float32x4_t second_longitudes_f32x4) {

    // Vincenty's inverse geodesic solution evaluated on four point pairs at once.
    // Lanes that have converged keep their λ through a bit-select, so every lane runs the
    // same instruction stream until all of them are done or the iteration cap is reached.
    // NOTE(review): the coordinates go straight into the sine/cosine helpers, so they are
    // presumably radians; the result carries the unit of the radius macros - confirm both.

    // Ellipsoid parameters and thresholds, broadcast to every lane.
    float32x4_t const semi_major_f32x4 = vdupq_n_f32((float)NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
    float32x4_t const semi_minor_f32x4 = vdupq_n_f32((float)NK_EARTH_ELLIPSOID_POLAR_RADIUS);
    float32x4_t const f_f32x4 = vdupq_n_f32(1.0f / (float)NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
    float32x4_t const lambda_tolerance_f32x4 = vdupq_n_f32(NK_VINCENTY_CONVERGENCE_THRESHOLD_F32);
    float32x4_t const tiny_f32x4 = vdupq_n_f32(1e-7f);

    // Small numeric literals used by the series below.
    float32x4_t const zero_f32x4 = vdupq_n_f32(0.0f);
    float32x4_t const k1_f32x4 = vdupq_n_f32(1.0f);
    float32x4_t const k2_f32x4 = vdupq_n_f32(2.0f);
    float32x4_t const k3_f32x4 = vdupq_n_f32(3.0f);
    float32x4_t const k4_f32x4 = vdupq_n_f32(4.0f);
    float32x4_t const k6_f32x4 = vdupq_n_f32(6.0f);
    float32x4_t const k16_f32x4 = vdupq_n_f32(16.0f);
    float32x4_t const minus1_f32x4 = vdupq_n_f32(-1.0f);
    float32x4_t const minus3_f32x4 = vdupq_n_f32(-3.0f);

    // L: difference in longitude, also the starting guess for λ.
    float32x4_t const delta_longitude_f32x4 = vsubq_f32(second_longitudes_f32x4, first_longitudes_f32x4);

    // Reduced latitudes: tan(U) = (1 - f) × tan(latitude), with tan = sin / cos.
    float32x4_t const one_minus_f_f32x4 = vsubq_f32(k1_f32x4, f_f32x4);
    float32x4_t const sin_lat1_f32x4 = nk_sin_f32x4_neon_(first_latitudes_f32x4);
    float32x4_t const cos_lat1_f32x4 = nk_cos_f32x4_neon_(first_latitudes_f32x4);
    float32x4_t const tan_lat1_f32x4 = vdivq_f32(sin_lat1_f32x4, cos_lat1_f32x4);
    float32x4_t const sin_lat2_f32x4 = nk_sin_f32x4_neon_(second_latitudes_f32x4);
    float32x4_t const cos_lat2_f32x4 = nk_cos_f32x4_neon_(second_latitudes_f32x4);
    float32x4_t const tan_lat2_f32x4 = vdivq_f32(sin_lat2_f32x4, cos_lat2_f32x4);
    float32x4_t const tan_u1_f32x4 = vmulq_f32(one_minus_f_f32x4, tan_lat1_f32x4);
    float32x4_t const tan_u2_f32x4 = vmulq_f32(one_minus_f_f32x4, tan_lat2_f32x4);

    // cos(U) = 1 / √(1 + tan²(U)), sin(U) = tan(U) × cos(U)
    float32x4_t const cos_u1_f32x4 = vdivq_f32(k1_f32x4,
                                               vsqrtq_f32(vfmaq_f32(k1_f32x4, tan_u1_f32x4, tan_u1_f32x4)));
    float32x4_t const sin_u1_f32x4 = vmulq_f32(tan_u1_f32x4, cos_u1_f32x4);
    float32x4_t const cos_u2_f32x4 = vdivq_f32(k1_f32x4,
                                               vsqrtq_f32(vfmaq_f32(k1_f32x4, tan_u2_f32x4, tan_u2_f32x4)));
    float32x4_t const sin_u2_f32x4 = vmulq_f32(tan_u2_f32x4, cos_u2_f32x4);

    // Products of the reduced-latitude terms never change across iterations, so name them once.
    float32x4_t const sin_u1_sin_u2_f32x4 = vmulq_f32(sin_u1_f32x4, sin_u2_f32x4);
    float32x4_t const cos_u1_cos_u2_f32x4 = vmulq_f32(cos_u1_f32x4, cos_u2_f32x4);
    float32x4_t const cos_u1_sin_u2_f32x4 = vmulq_f32(cos_u1_f32x4, sin_u2_f32x4);
    float32x4_t const sin_u1_cos_u2_f32x4 = vmulq_f32(sin_u1_f32x4, cos_u2_f32x4);
    float32x4_t const f_over_16_f32x4 = vdivq_f32(f_f32x4, k16_f32x4);

    // State carried out of the iteration and consumed by the closing series.
    // NOTE(review): these are assigned only inside the loop; both masks start cleared, so the
    // body runs at least once whenever NK_VINCENTY_MAX_ITERATIONS is non-zero.
    float32x4_t lambda_f32x4 = delta_longitude_f32x4;
    float32x4_t sin_sigma_f32x4, cos_sigma_f32x4, sigma_f32x4;
    float32x4_t sin_alpha_f32x4, cos_sq_alpha_f32x4, cos_2sigma_m_f32x4;

    // All-ones lanes mark "λ has converged" and "points coincide" respectively.
    uint32x4_t done_mask_u32x4 = vdupq_n_u32(0);
    uint32x4_t same_point_mask_u32x4 = vdupq_n_u32(0);

    // The horizontal minimum equals 0xFFFFFFFF only once every lane's mask is fully set.
    nk_u32_t iteration = 0;
    while (iteration < NK_VINCENTY_MAX_ITERATIONS && vminvq_u32(done_mask_u32x4) != 0xFFFFFFFF) {
        float32x4_t const sin_lambda_f32x4 = nk_sin_f32x4_neon_(lambda_f32x4);
        float32x4_t const cos_lambda_f32x4 = nk_cos_f32x4_neon_(lambda_f32x4);

        // sin²(σ) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
        float32x4_t const east_part_f32x4 = vmulq_f32(cos_u2_f32x4, sin_lambda_f32x4);
        float32x4_t const north_part_f32x4 = vsubq_f32(cos_u1_sin_u2_f32x4,
                                                       vmulq_f32(sin_u1_cos_u2_f32x4, cos_lambda_f32x4));
        float32x4_t const north_part_sq_f32x4 = vmulq_f32(north_part_f32x4, north_part_f32x4);
        float32x4_t const sin_sigma_sq_f32x4 = vfmaq_f32(north_part_sq_f32x4, east_part_f32x4, east_part_f32x4);
        sin_sigma_f32x4 = vsqrtq_f32(sin_sigma_sq_f32x4);

        // sin(σ) ≈ 0 flags coincident points; refreshed on every pass.
        same_point_mask_u32x4 = vcltq_f32(sin_sigma_f32x4, tiny_f32x4);

        // cos(σ) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
        cos_sigma_f32x4 = vfmaq_f32(sin_u1_sin_u2_f32x4, cos_u1_cos_u2_f32x4, cos_lambda_f32x4);

        // σ = atan2(sin(σ), cos(σ))
        sigma_f32x4 = nk_atan2_f32x4_neon_(sin_sigma_f32x4, cos_sigma_f32x4);

        // sin(α) = cos(U₁) × cos(U₂) × sin(λ) / sin(σ); coincident lanes divide by one instead of ~0.
        float32x4_t const sin_sigma_safe_f32x4 = vbslq_f32(same_point_mask_u32x4, k1_f32x4, sin_sigma_f32x4);
        float32x4_t const sin_alpha_numerator_f32x4 = vmulq_f32(cos_u1_cos_u2_f32x4, sin_lambda_f32x4);
        sin_alpha_f32x4 = vdivq_f32(sin_alpha_numerator_f32x4, sin_sigma_safe_f32x4);
        cos_sq_alpha_f32x4 = vsubq_f32(k1_f32x4, vmulq_f32(sin_alpha_f32x4, sin_alpha_f32x4));

        // Equatorial case: cos²(α) ≈ 0, so swap in one as the divisor and force cos(2σₘ) to zero.
        uint32x4_t const on_equator_mask_u32x4 = vcltq_f32(cos_sq_alpha_f32x4, tiny_f32x4);
        float32x4_t const cos_sq_alpha_safe_f32x4 = vbslq_f32(on_equator_mask_u32x4, k1_f32x4, cos_sq_alpha_f32x4);

        // cos(2σₘ) = cos(σ) - 2 × sin(U₁) × sin(U₂) / cos²(α)
        float32x4_t const twice_sin_product_f32x4 = vmulq_f32(k2_f32x4, sin_u1_sin_u2_f32x4);
        float32x4_t const midpoint_shift_f32x4 = vdivq_f32(twice_sin_product_f32x4, cos_sq_alpha_safe_f32x4);
        float32x4_t const cos_2sigma_m_raw_f32x4 = vsubq_f32(cos_sigma_f32x4, midpoint_shift_f32x4);
        cos_2sigma_m_f32x4 = vbslq_f32(on_equator_mask_u32x4, zero_f32x4, cos_2sigma_m_raw_f32x4);

        // C = f/16 × cos²(α) × (4 + f × (4 - 3 × cos²(α)))
        float32x4_t const four_minus_3cos_f32x4 = vfmsq_f32(k4_f32x4, k3_f32x4, cos_sq_alpha_f32x4);
        float32x4_t const c_bracket_f32x4 = vfmaq_f32(k4_f32x4, f_f32x4, four_minus_3cos_f32x4);
        float32x4_t const c_f32x4 = vmulq_f32(f_over_16_f32x4, vmulq_f32(cos_sq_alpha_f32x4, c_bracket_f32x4));

        // λ' = L + (1-C) × f × sin(α) × (σ + C × sin(σ) × (cos(2σₘ) + C × cos(σ) × (-1 + 2 × cos²(2σₘ))))
        float32x4_t const cos_2sigma_m_sq_f32x4 = vmulq_f32(cos_2sigma_m_f32x4, cos_2sigma_m_f32x4);
        float32x4_t const level3_f32x4 = vfmaq_f32(minus1_f32x4, k2_f32x4, cos_2sigma_m_sq_f32x4);
        float32x4_t const level2_f32x4 = vfmaq_f32(cos_2sigma_m_f32x4, vmulq_f32(c_f32x4, cos_sigma_f32x4),
                                                   level3_f32x4);
        float32x4_t const level1_f32x4 = vmulq_f32(vmulq_f32(c_f32x4, sin_sigma_f32x4), level2_f32x4);
        float32x4_t const lambda_scale_f32x4 = vmulq_f32(vmulq_f32(vsubq_f32(k1_f32x4, c_f32x4), f_f32x4),
                                                         sin_alpha_f32x4);
        float32x4_t const lambda_next_f32x4 = vfmaq_f32(delta_longitude_f32x4, lambda_scale_f32x4,
                                                        vaddq_f32(sigma_f32x4, level1_f32x4));

        // A lane is done once |λ' - λ| drops below the tolerance; the flag is sticky.
        float32x4_t const lambda_step_f32x4 = vabsq_f32(vsubq_f32(lambda_next_f32x4, lambda_f32x4));
        uint32x4_t const just_done_u32x4 = vcltq_f32(lambda_step_f32x4, lambda_tolerance_f32x4);
        done_mask_u32x4 = vorrq_u32(done_mask_u32x4, just_done_u32x4);

        // Finished lanes keep their current λ, the rest advance to λ'.
        lambda_f32x4 = vbslq_f32(done_mask_u32x4, lambda_f32x4, lambda_next_f32x4);
        ++iteration;
    }

    // u² = cos²(α) × (a² - b²) / b²
    float32x4_t const a_squared_f32x4 = vmulq_f32(semi_major_f32x4, semi_major_f32x4);
    float32x4_t const b_squared_f32x4 = vmulq_f32(semi_minor_f32x4, semi_minor_f32x4);
    float32x4_t const u_sq_f32x4 = vdivq_f32(
        vmulq_f32(cos_sq_alpha_f32x4, vsubq_f32(a_squared_f32x4, b_squared_f32x4)), b_squared_f32x4);

    // A = 1 + u²/16384 × (4096 + u² × (-768 + u² × (320 - 175 × u²))), built innermost-first.
    float32x4_t coeff_a_f32x4 = vfmaq_f32(vdupq_n_f32(320.0f), u_sq_f32x4, vdupq_n_f32(-175.0f));
    coeff_a_f32x4 = vfmaq_f32(vdupq_n_f32(-768.0f), u_sq_f32x4, coeff_a_f32x4);
    coeff_a_f32x4 = vfmaq_f32(vdupq_n_f32(4096.0f), u_sq_f32x4, coeff_a_f32x4);
    coeff_a_f32x4 = vfmaq_f32(k1_f32x4, vdivq_f32(u_sq_f32x4, vdupq_n_f32(16384.0f)), coeff_a_f32x4);

    // B = u²/1024 × (256 + u² × (-128 + u² × (74 - 47 × u²)))
    float32x4_t coeff_b_f32x4 = vfmaq_f32(vdupq_n_f32(74.0f), u_sq_f32x4, vdupq_n_f32(-47.0f));
    coeff_b_f32x4 = vfmaq_f32(vdupq_n_f32(-128.0f), u_sq_f32x4, coeff_b_f32x4);
    coeff_b_f32x4 = vfmaq_f32(vdupq_n_f32(256.0f), u_sq_f32x4, coeff_b_f32x4);
    coeff_b_f32x4 = vmulq_f32(vdivq_f32(u_sq_f32x4, vdupq_n_f32(1024.0f)), coeff_b_f32x4);

    // Δσ = B × sin(σ) × (cos(2σₘ) + B/4 × (cos(σ) × (-1 + 2 × cos²(2σₘ))
    //      - B/6 × cos(2σₘ) × (-3 + 4 × sin²(σ)) × (-3 + 4 × cos²(2σₘ))))
    float32x4_t const cos_2sigma_m_sq_f32x4 = vmulq_f32(cos_2sigma_m_f32x4, cos_2sigma_m_f32x4);
    float32x4_t const sin_sigma_sq_f32x4 = vmulq_f32(sin_sigma_f32x4, sin_sigma_f32x4);
    float32x4_t const double_angle_f32x4 = vfmaq_f32(minus1_f32x4, k2_f32x4, cos_2sigma_m_sq_f32x4);
    float32x4_t const leading_f32x4 = vmulq_f32(cos_sigma_f32x4, double_angle_f32x4);
    float32x4_t const sin_factor_f32x4 = vfmaq_f32(minus3_f32x4, k4_f32x4, sin_sigma_sq_f32x4);
    float32x4_t const cos_factor_f32x4 = vfmaq_f32(minus3_f32x4, k4_f32x4, cos_2sigma_m_sq_f32x4);
    float32x4_t const trailing_f32x4 = vmulq_f32(
        vmulq_f32(vdivq_f32(coeff_b_f32x4, k6_f32x4), cos_2sigma_m_f32x4),
        vmulq_f32(sin_factor_f32x4, cos_factor_f32x4));
    float32x4_t const quarter_b_term_f32x4 = vmulq_f32(vdivq_f32(coeff_b_f32x4, k4_f32x4),
                                                       vsubq_f32(leading_f32x4, trailing_f32x4));
    float32x4_t const delta_sigma_f32x4 = vmulq_f32(
        coeff_b_f32x4, vmulq_f32(sin_sigma_f32x4, vaddq_f32(cos_2sigma_m_f32x4, quarter_b_term_f32x4)));

    // s = b × A × (σ - Δσ)
    float32x4_t const raw_distances_f32x4 = vmulq_f32(vmulq_f32(semi_minor_f32x4, coeff_a_f32x4),
                                                      vsubq_f32(sigma_f32x4, delta_sigma_f32x4));

    // Lanes flagged as coincident report a distance of exactly zero.
    return vbslq_f32(same_point_mask_u32x4, zero_f32x4, raw_distances_f32x4);
}
|
|
526
570
|
|
|
527
571
|
NK_PUBLIC void nk_vincenty_f32_neon( //
|
|
@@ -530,14 +574,14 @@ NK_PUBLIC void nk_vincenty_f32_neon( //
|
|
|
530
574
|
nk_size_t n, nk_f32_t *results) {
|
|
531
575
|
|
|
532
576
|
while (n >= 4) {
|
|
533
|
-
float32x4_t
|
|
534
|
-
float32x4_t
|
|
535
|
-
float32x4_t
|
|
536
|
-
float32x4_t
|
|
577
|
+
float32x4_t first_latitudes_f32x4 = vld1q_f32(a_lats);
|
|
578
|
+
float32x4_t first_longitudes_f32x4 = vld1q_f32(a_lons);
|
|
579
|
+
float32x4_t second_latitudes_f32x4 = vld1q_f32(b_lats);
|
|
580
|
+
float32x4_t second_longitudes_f32x4 = vld1q_f32(b_lons);
|
|
537
581
|
|
|
538
|
-
float32x4_t
|
|
539
|
-
|
|
540
|
-
vst1q_f32(results,
|
|
582
|
+
float32x4_t distances_f32x4 = nk_vincenty_f32x4_neon_(first_latitudes_f32x4, first_longitudes_f32x4,
|
|
583
|
+
second_latitudes_f32x4, second_longitudes_f32x4);
|
|
584
|
+
vst1q_f32(results, distances_f32x4);
|
|
541
585
|
|
|
542
586
|
a_lats += 4, a_lons += 4, b_lats += 4, b_lons += 4, results += 4, n -= 4;
|
|
543
587
|
}
|
|
@@ -549,9 +593,9 @@ NK_PUBLIC void nk_vincenty_f32_neon( //
|
|
|
549
593
|
nk_partial_load_b32x4_serial_(a_lons, &a_lon_vec, n);
|
|
550
594
|
nk_partial_load_b32x4_serial_(b_lats, &b_lat_vec, n);
|
|
551
595
|
nk_partial_load_b32x4_serial_(b_lons, &b_lon_vec, n);
|
|
552
|
-
float32x4_t
|
|
553
|
-
|
|
554
|
-
result_vec.f32x4 =
|
|
596
|
+
float32x4_t distances_f32x4 = nk_vincenty_f32x4_neon_(a_lat_vec.f32x4, a_lon_vec.f32x4, b_lat_vec.f32x4,
|
|
597
|
+
b_lon_vec.f32x4);
|
|
598
|
+
result_vec.f32x4 = distances_f32x4;
|
|
555
599
|
nk_partial_store_b32x4_serial_(&result_vec, results, n);
|
|
556
600
|
}
|
|
557
601
|
}
|