numkong 7.0.0 → 7.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +239 -122
- package/binding.gyp +25 -491
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
|
@@ -46,46 +46,50 @@ extern "C" {
|
|
|
46
46
|
* These require WASM trigonometric kernels from trigonometry/v128relaxed.h.
|
|
47
47
|
*/
|
|
48
48
|
|
|
49
|
-
NK_INTERNAL v128_t nk_haversine_f64x2_v128relaxed_(
|
|
50
|
-
v128_t
|
|
51
|
-
v128_t
|
|
49
|
+
NK_INTERNAL v128_t nk_haversine_f64x2_v128relaxed_( //
|
|
50
|
+
v128_t first_latitudes_f64x2, v128_t first_longitudes_f64x2, //
|
|
51
|
+
v128_t second_latitudes_f64x2, v128_t second_longitudes_f64x2) {
|
|
52
52
|
|
|
53
|
-
v128_t const
|
|
54
|
-
v128_t const
|
|
55
|
-
v128_t const
|
|
56
|
-
v128_t const
|
|
53
|
+
v128_t const earth_radius_f64x2 = wasm_f64x2_splat(NK_EARTH_MEDIATORIAL_RADIUS);
|
|
54
|
+
v128_t const half_f64x2 = wasm_f64x2_splat(0.5);
|
|
55
|
+
v128_t const one_f64x2 = wasm_f64x2_splat(1.0);
|
|
56
|
+
v128_t const two_f64x2 = wasm_f64x2_splat(2.0);
|
|
57
57
|
|
|
58
|
-
v128_t
|
|
59
|
-
v128_t
|
|
58
|
+
v128_t latitude_delta_f64x2 = wasm_f64x2_sub(second_latitudes_f64x2, first_latitudes_f64x2);
|
|
59
|
+
v128_t longitude_delta_f64x2 = wasm_f64x2_sub(second_longitudes_f64x2, first_longitudes_f64x2);
|
|
60
60
|
|
|
61
61
|
// Haversine terms: sin^2(delta/2)
|
|
62
|
-
v128_t
|
|
63
|
-
v128_t
|
|
64
|
-
v128_t
|
|
65
|
-
v128_t
|
|
66
|
-
v128_t
|
|
67
|
-
|
|
62
|
+
v128_t latitude_delta_half_f64x2 = wasm_f64x2_mul(latitude_delta_f64x2, half_f64x2);
|
|
63
|
+
v128_t longitude_delta_half_f64x2 = wasm_f64x2_mul(longitude_delta_f64x2, half_f64x2);
|
|
64
|
+
v128_t sin_latitude_delta_half_f64x2 = nk_f64x2_sin_v128relaxed_(latitude_delta_half_f64x2);
|
|
65
|
+
v128_t sin_longitude_delta_half_f64x2 = nk_f64x2_sin_v128relaxed_(longitude_delta_half_f64x2);
|
|
66
|
+
v128_t sin_squared_latitude_delta_half_f64x2 = wasm_f64x2_mul(sin_latitude_delta_half_f64x2,
|
|
67
|
+
sin_latitude_delta_half_f64x2);
|
|
68
|
+
v128_t sin_squared_longitude_delta_half_f64x2 = wasm_f64x2_mul(sin_longitude_delta_half_f64x2,
|
|
69
|
+
sin_longitude_delta_half_f64x2);
|
|
68
70
|
|
|
69
71
|
// Latitude cosine product
|
|
70
|
-
v128_t
|
|
71
|
-
v128_t
|
|
72
|
-
v128_t
|
|
72
|
+
v128_t cos_first_latitude_f64x2 = nk_f64x2_cos_v128relaxed_(first_latitudes_f64x2);
|
|
73
|
+
v128_t cos_second_latitude_f64x2 = nk_f64x2_cos_v128relaxed_(second_latitudes_f64x2);
|
|
74
|
+
v128_t cos_latitude_product_f64x2 = wasm_f64x2_mul(cos_first_latitude_f64x2, cos_second_latitude_f64x2);
|
|
73
75
|
|
|
74
76
|
// a = sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2)
|
|
75
|
-
v128_t
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
//
|
|
79
|
-
//
|
|
80
|
-
|
|
81
|
-
|
|
77
|
+
v128_t haversine_term_f64x2 = wasm_f64x2_add(
|
|
78
|
+
sin_squared_latitude_delta_half_f64x2,
|
|
79
|
+
wasm_f64x2_mul(cos_latitude_product_f64x2, sin_squared_longitude_delta_half_f64x2));
|
|
80
|
+
// Clamp haversine_term_f64x2 to [0, 1] to prevent NaN from sqrt of negative values
|
|
81
|
+
// relaxed_min/max: 1 instruction (minpd/maxpd) vs 6-9 (with NaN/signed-zero fixup) on x86.
|
|
82
|
+
// Safe because haversine_term_f64x2 is a sum of products of finite sin/cos values — NaN is impossible.
|
|
83
|
+
v128_t zero_f64x2 = wasm_f64x2_splat(0.0);
|
|
84
|
+
haversine_term_f64x2 = wasm_f64x2_relaxed_max(zero_f64x2, wasm_f64x2_relaxed_min(one_f64x2, haversine_term_f64x2));
|
|
82
85
|
|
|
83
86
|
// Central angle: c = 2 * atan2(sqrt(a), sqrt(1-a))
|
|
84
|
-
v128_t
|
|
85
|
-
v128_t
|
|
86
|
-
v128_t
|
|
87
|
+
v128_t sqrt_haversine_f64x2 = wasm_f64x2_sqrt(haversine_term_f64x2);
|
|
88
|
+
v128_t sqrt_complement_f64x2 = wasm_f64x2_sqrt(wasm_f64x2_sub(one_f64x2, haversine_term_f64x2));
|
|
89
|
+
v128_t central_angle_f64x2 = wasm_f64x2_mul(
|
|
90
|
+
two_f64x2, nk_f64x2_atan2_v128relaxed_(sqrt_haversine_f64x2, sqrt_complement_f64x2));
|
|
87
91
|
|
|
88
|
-
return wasm_f64x2_mul(
|
|
92
|
+
return wasm_f64x2_mul(earth_radius_f64x2, central_angle_f64x2);
|
|
89
93
|
}
|
|
90
94
|
|
|
91
95
|
NK_PUBLIC void nk_haversine_f64_v128relaxed( //
|
|
@@ -94,14 +98,14 @@ NK_PUBLIC void nk_haversine_f64_v128relaxed( //
|
|
|
94
98
|
nk_size_t n, nk_f64_t *results) {
|
|
95
99
|
|
|
96
100
|
while (n >= 2) {
|
|
97
|
-
v128_t
|
|
98
|
-
v128_t
|
|
99
|
-
v128_t
|
|
100
|
-
v128_t
|
|
101
|
+
v128_t first_latitudes_f64x2 = wasm_v128_load(a_lats);
|
|
102
|
+
v128_t first_longitudes_f64x2 = wasm_v128_load(a_lons);
|
|
103
|
+
v128_t second_latitudes_f64x2 = wasm_v128_load(b_lats);
|
|
104
|
+
v128_t second_longitudes_f64x2 = wasm_v128_load(b_lons);
|
|
101
105
|
|
|
102
|
-
v128_t
|
|
103
|
-
|
|
104
|
-
wasm_v128_store(results,
|
|
106
|
+
v128_t distances_f64x2 = nk_haversine_f64x2_v128relaxed_(first_latitudes_f64x2, first_longitudes_f64x2,
|
|
107
|
+
second_latitudes_f64x2, second_longitudes_f64x2);
|
|
108
|
+
wasm_v128_store(results, distances_f64x2);
|
|
105
109
|
|
|
106
110
|
a_lats += 2, a_lons += 2, b_lats += 2, b_lons += 2, results += 2, n -= 2;
|
|
107
111
|
}
|
|
@@ -113,54 +117,58 @@ NK_PUBLIC void nk_haversine_f64_v128relaxed( //
|
|
|
113
117
|
nk_partial_load_b64x2_serial_(a_lons, &a_lon_vec, n);
|
|
114
118
|
nk_partial_load_b64x2_serial_(b_lats, &b_lat_vec, n);
|
|
115
119
|
nk_partial_load_b64x2_serial_(b_lons, &b_lon_vec, n);
|
|
116
|
-
v128_t
|
|
117
|
-
|
|
118
|
-
result_vec.v128 =
|
|
120
|
+
v128_t distances_f64x2 = nk_haversine_f64x2_v128relaxed_(a_lat_vec.v128, a_lon_vec.v128, b_lat_vec.v128,
|
|
121
|
+
b_lon_vec.v128);
|
|
122
|
+
result_vec.v128 = distances_f64x2;
|
|
119
123
|
nk_partial_store_b64x2_serial_(&result_vec, results, n);
|
|
120
124
|
}
|
|
121
125
|
}
|
|
122
126
|
|
|
123
|
-
NK_INTERNAL v128_t nk_haversine_f32x4_v128relaxed_(
|
|
124
|
-
v128_t
|
|
125
|
-
v128_t
|
|
127
|
+
NK_INTERNAL v128_t nk_haversine_f32x4_v128relaxed_( //
|
|
128
|
+
v128_t first_latitudes_f32x4, v128_t first_longitudes_f32x4, //
|
|
129
|
+
v128_t second_latitudes_f32x4, v128_t second_longitudes_f32x4) {
|
|
126
130
|
|
|
127
|
-
v128_t const
|
|
128
|
-
v128_t const
|
|
129
|
-
v128_t const
|
|
130
|
-
v128_t const
|
|
131
|
+
v128_t const earth_radius_f32x4 = wasm_f32x4_splat((float)NK_EARTH_MEDIATORIAL_RADIUS);
|
|
132
|
+
v128_t const half_f32x4 = wasm_f32x4_splat(0.5f);
|
|
133
|
+
v128_t const one_f32x4 = wasm_f32x4_splat(1.0f);
|
|
134
|
+
v128_t const two_f32x4 = wasm_f32x4_splat(2.0f);
|
|
131
135
|
|
|
132
|
-
v128_t
|
|
133
|
-
v128_t
|
|
136
|
+
v128_t latitude_delta_f32x4 = wasm_f32x4_sub(second_latitudes_f32x4, first_latitudes_f32x4);
|
|
137
|
+
v128_t longitude_delta_f32x4 = wasm_f32x4_sub(second_longitudes_f32x4, first_longitudes_f32x4);
|
|
134
138
|
|
|
135
139
|
// Haversine terms: sin^2(delta/2)
|
|
136
|
-
v128_t
|
|
137
|
-
v128_t
|
|
138
|
-
v128_t
|
|
139
|
-
v128_t
|
|
140
|
-
v128_t
|
|
141
|
-
|
|
140
|
+
v128_t latitude_delta_half_f32x4 = wasm_f32x4_mul(latitude_delta_f32x4, half_f32x4);
|
|
141
|
+
v128_t longitude_delta_half_f32x4 = wasm_f32x4_mul(longitude_delta_f32x4, half_f32x4);
|
|
142
|
+
v128_t sin_latitude_delta_half_f32x4 = nk_f32x4_sin_v128relaxed_(latitude_delta_half_f32x4);
|
|
143
|
+
v128_t sin_longitude_delta_half_f32x4 = nk_f32x4_sin_v128relaxed_(longitude_delta_half_f32x4);
|
|
144
|
+
v128_t sin_squared_latitude_delta_half_f32x4 = wasm_f32x4_mul(sin_latitude_delta_half_f32x4,
|
|
145
|
+
sin_latitude_delta_half_f32x4);
|
|
146
|
+
v128_t sin_squared_longitude_delta_half_f32x4 = wasm_f32x4_mul(sin_longitude_delta_half_f32x4,
|
|
147
|
+
sin_longitude_delta_half_f32x4);
|
|
142
148
|
|
|
143
149
|
// Latitude cosine product
|
|
144
|
-
v128_t
|
|
145
|
-
v128_t
|
|
146
|
-
v128_t
|
|
150
|
+
v128_t cos_first_latitude_f32x4 = nk_f32x4_cos_v128relaxed_(first_latitudes_f32x4);
|
|
151
|
+
v128_t cos_second_latitude_f32x4 = nk_f32x4_cos_v128relaxed_(second_latitudes_f32x4);
|
|
152
|
+
v128_t cos_latitude_product_f32x4 = wasm_f32x4_mul(cos_first_latitude_f32x4, cos_second_latitude_f32x4);
|
|
147
153
|
|
|
148
154
|
// a = sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2)
|
|
149
|
-
v128_t
|
|
150
|
-
|
|
155
|
+
v128_t haversine_term_f32x4 = wasm_f32x4_add(
|
|
156
|
+
sin_squared_latitude_delta_half_f32x4,
|
|
157
|
+
wasm_f32x4_mul(cos_latitude_product_f32x4, sin_squared_longitude_delta_half_f32x4));
|
|
151
158
|
|
|
152
159
|
// Clamp to [0, 1] to avoid NaN from sqrt of negative numbers (due to floating point errors)
|
|
153
|
-
// relaxed_min/max: 1 instruction (minps/maxps) vs 6-9 (with NaN/signed-
|
|
154
|
-
// Safe because
|
|
155
|
-
v128_t
|
|
156
|
-
|
|
160
|
+
// relaxed_min/max: 1 instruction (minps/maxps) vs 6-9 (with NaN/signed-zero fixup) on x86.
|
|
161
|
+
// Safe because haversine_term_f32x4 is a sum of products of finite sin/cos values — NaN is impossible.
|
|
162
|
+
v128_t zero_f32x4 = wasm_f32x4_splat(0.0f);
|
|
163
|
+
haversine_term_f32x4 = wasm_f32x4_relaxed_max(zero_f32x4, wasm_f32x4_relaxed_min(one_f32x4, haversine_term_f32x4));
|
|
157
164
|
|
|
158
165
|
// Central angle: c = 2 * atan2(sqrt(a), sqrt(1-a))
|
|
159
|
-
v128_t
|
|
160
|
-
v128_t
|
|
161
|
-
v128_t
|
|
166
|
+
v128_t sqrt_haversine_f32x4 = wasm_f32x4_sqrt(haversine_term_f32x4);
|
|
167
|
+
v128_t sqrt_complement_f32x4 = wasm_f32x4_sqrt(wasm_f32x4_sub(one_f32x4, haversine_term_f32x4));
|
|
168
|
+
v128_t central_angle_f32x4 = wasm_f32x4_mul(
|
|
169
|
+
two_f32x4, nk_f32x4_atan2_v128relaxed_(sqrt_haversine_f32x4, sqrt_complement_f32x4));
|
|
162
170
|
|
|
163
|
-
return wasm_f32x4_mul(
|
|
171
|
+
return wasm_f32x4_mul(earth_radius_f32x4, central_angle_f32x4);
|
|
164
172
|
}
|
|
165
173
|
|
|
166
174
|
NK_PUBLIC void nk_haversine_f32_v128relaxed( //
|
|
@@ -169,14 +177,14 @@ NK_PUBLIC void nk_haversine_f32_v128relaxed( //
|
|
|
169
177
|
nk_size_t n, nk_f32_t *results) {
|
|
170
178
|
|
|
171
179
|
while (n >= 4) {
|
|
172
|
-
v128_t
|
|
173
|
-
v128_t
|
|
174
|
-
v128_t
|
|
175
|
-
v128_t
|
|
180
|
+
v128_t first_latitudes_f32x4 = wasm_v128_load(a_lats);
|
|
181
|
+
v128_t first_longitudes_f32x4 = wasm_v128_load(a_lons);
|
|
182
|
+
v128_t second_latitudes_f32x4 = wasm_v128_load(b_lats);
|
|
183
|
+
v128_t second_longitudes_f32x4 = wasm_v128_load(b_lons);
|
|
176
184
|
|
|
177
|
-
v128_t
|
|
178
|
-
|
|
179
|
-
wasm_v128_store(results,
|
|
185
|
+
v128_t distances_f32x4 = nk_haversine_f32x4_v128relaxed_(first_latitudes_f32x4, first_longitudes_f32x4,
|
|
186
|
+
second_latitudes_f32x4, second_longitudes_f32x4);
|
|
187
|
+
wasm_v128_store(results, distances_f32x4);
|
|
180
188
|
|
|
181
189
|
a_lats += 4, a_lons += 4, b_lats += 4, b_lons += 4, results += 4, n -= 4;
|
|
182
190
|
}
|
|
@@ -188,9 +196,9 @@ NK_PUBLIC void nk_haversine_f32_v128relaxed( //
|
|
|
188
196
|
nk_partial_load_b32x4_serial_(a_lons, &a_lon_vec, n);
|
|
189
197
|
nk_partial_load_b32x4_serial_(b_lats, &b_lat_vec, n);
|
|
190
198
|
nk_partial_load_b32x4_serial_(b_lons, &b_lon_vec, n);
|
|
191
|
-
v128_t
|
|
192
|
-
|
|
193
|
-
result_vec.v128 =
|
|
199
|
+
v128_t distances_f32x4 = nk_haversine_f32x4_v128relaxed_(a_lat_vec.v128, a_lon_vec.v128, b_lat_vec.v128,
|
|
200
|
+
b_lon_vec.v128);
|
|
201
|
+
result_vec.v128 = distances_f32x4;
|
|
194
202
|
nk_partial_store_b32x4_serial_(&result_vec, results, n);
|
|
195
203
|
}
|
|
196
204
|
}
|
|
@@ -199,174 +207,189 @@ NK_PUBLIC void nk_haversine_f32_v128relaxed( //
|
|
|
199
207
|
* @brief WASM Relaxed SIMD helper for Vincenty's geodesic distance on 2 f64 point pairs.
|
|
200
208
|
* @note This is a true SIMD implementation using masked convergence tracking via blending.
|
|
201
209
|
*/
|
|
202
|
-
NK_INTERNAL v128_t nk_vincenty_f64x2_v128relaxed_(
|
|
203
|
-
v128_t
|
|
204
|
-
v128_t
|
|
205
|
-
|
|
206
|
-
v128_t const
|
|
207
|
-
v128_t const
|
|
208
|
-
v128_t const
|
|
209
|
-
v128_t const
|
|
210
|
-
v128_t const
|
|
211
|
-
v128_t const
|
|
212
|
-
v128_t const
|
|
213
|
-
v128_t const
|
|
214
|
-
v128_t const
|
|
215
|
-
v128_t const
|
|
216
|
-
v128_t const
|
|
210
|
+
NK_INTERNAL v128_t nk_vincenty_f64x2_v128relaxed_( //
|
|
211
|
+
v128_t first_latitudes_f64x2, v128_t first_longitudes_f64x2, //
|
|
212
|
+
v128_t second_latitudes_f64x2, v128_t second_longitudes_f64x2) {
|
|
213
|
+
|
|
214
|
+
v128_t const equatorial_radius_f64x2 = wasm_f64x2_splat(NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
|
|
215
|
+
v128_t const polar_radius_f64x2 = wasm_f64x2_splat(NK_EARTH_ELLIPSOID_POLAR_RADIUS);
|
|
216
|
+
v128_t const flattening_f64x2 = wasm_f64x2_splat(1.0 / NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
|
|
217
|
+
v128_t const convergence_threshold_f64x2 = wasm_f64x2_splat(NK_VINCENTY_CONVERGENCE_THRESHOLD_F64);
|
|
218
|
+
v128_t const one_f64x2 = wasm_f64x2_splat(1.0);
|
|
219
|
+
v128_t const two_f64x2 = wasm_f64x2_splat(2.0);
|
|
220
|
+
v128_t const three_f64x2 = wasm_f64x2_splat(3.0);
|
|
221
|
+
v128_t const four_f64x2 = wasm_f64x2_splat(4.0);
|
|
222
|
+
v128_t const six_f64x2 = wasm_f64x2_splat(6.0);
|
|
223
|
+
v128_t const sixteen_f64x2 = wasm_f64x2_splat(16.0);
|
|
224
|
+
v128_t const epsilon_f64x2 = wasm_f64x2_splat(1e-15);
|
|
217
225
|
|
|
218
226
|
// Longitude difference
|
|
219
|
-
v128_t
|
|
227
|
+
v128_t longitude_difference_f64x2 = wasm_f64x2_sub(second_longitudes_f64x2, first_longitudes_f64x2);
|
|
220
228
|
|
|
221
229
|
// Reduced latitudes: tan(U) = (1-f) * tan(lat)
|
|
222
|
-
v128_t
|
|
223
|
-
v128_t
|
|
224
|
-
|
|
225
|
-
v128_t
|
|
226
|
-
|
|
227
|
-
v128_t
|
|
228
|
-
v128_t
|
|
230
|
+
v128_t one_minus_f_f64x2 = wasm_f64x2_sub(one_f64x2, flattening_f64x2);
|
|
231
|
+
v128_t tan_first_f64x2 = wasm_f64x2_div(nk_f64x2_sin_v128relaxed_(first_latitudes_f64x2),
|
|
232
|
+
nk_f64x2_cos_v128relaxed_(first_latitudes_f64x2));
|
|
233
|
+
v128_t tan_second_f64x2 = wasm_f64x2_div(nk_f64x2_sin_v128relaxed_(second_latitudes_f64x2),
|
|
234
|
+
nk_f64x2_cos_v128relaxed_(second_latitudes_f64x2));
|
|
235
|
+
v128_t tan_reduced_first_f64x2 = wasm_f64x2_mul(one_minus_f_f64x2, tan_first_f64x2);
|
|
236
|
+
v128_t tan_reduced_second_f64x2 = wasm_f64x2_mul(one_minus_f_f64x2, tan_second_f64x2);
|
|
229
237
|
|
|
230
238
|
// cos(U) = 1/sqrt(1 + tan^2(U)), sin(U) = tan(U) * cos(U)
|
|
231
|
-
v128_t
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
v128_t
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
v128_t
|
|
239
|
+
v128_t cos_reduced_first_f64x2 = wasm_f64x2_div(
|
|
240
|
+
one_f64x2,
|
|
241
|
+
wasm_f64x2_sqrt(wasm_f64x2_relaxed_madd(tan_reduced_first_f64x2, tan_reduced_first_f64x2, one_f64x2)));
|
|
242
|
+
v128_t sin_reduced_first_f64x2 = wasm_f64x2_mul(tan_reduced_first_f64x2, cos_reduced_first_f64x2);
|
|
243
|
+
v128_t cos_reduced_second_f64x2 = wasm_f64x2_div(
|
|
244
|
+
one_f64x2,
|
|
245
|
+
wasm_f64x2_sqrt(wasm_f64x2_relaxed_madd(tan_reduced_second_f64x2, tan_reduced_second_f64x2, one_f64x2)));
|
|
246
|
+
v128_t sin_reduced_second_f64x2 = wasm_f64x2_mul(tan_reduced_second_f64x2, cos_reduced_second_f64x2);
|
|
247
|
+
|
|
248
|
+
// Initialize lambda_f64x2 and tracking variables
|
|
249
|
+
v128_t lambda_f64x2 = longitude_difference_f64x2;
|
|
250
|
+
v128_t sin_angular_distance_f64x2, cos_angular_distance_f64x2, angular_distance_f64x2;
|
|
251
|
+
v128_t sin_azimuth_f64x2, cos_squared_azimuth_f64x2, cos_double_angular_midpoint_f64x2;
|
|
242
252
|
|
|
243
253
|
// Track convergence and coincident points using masks
|
|
244
|
-
v128_t
|
|
245
|
-
v128_t
|
|
254
|
+
v128_t converged_mask_i64x2 = wasm_i64x2_splat(0);
|
|
255
|
+
v128_t coincident_mask_i64x2 = wasm_i64x2_splat(0);
|
|
246
256
|
|
|
247
257
|
for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS; ++iteration) {
|
|
248
258
|
// Check if all lanes converged
|
|
249
|
-
if (wasm_i8x16_all_true(
|
|
259
|
+
if (wasm_i8x16_all_true(converged_mask_i64x2)) break;
|
|
250
260
|
|
|
251
|
-
v128_t
|
|
252
|
-
v128_t
|
|
261
|
+
v128_t sin_lambda_f64x2 = nk_f64x2_sin_v128relaxed_(lambda_f64x2);
|
|
262
|
+
v128_t cos_lambda_f64x2 = nk_f64x2_cos_v128relaxed_(lambda_f64x2);
|
|
253
263
|
|
|
254
|
-
// sin^2(
|
|
255
|
-
v128_t
|
|
256
|
-
v128_t
|
|
257
|
-
wasm_f64x2_mul(
|
|
258
|
-
wasm_f64x2_mul(wasm_f64x2_mul(
|
|
259
|
-
v128_t
|
|
260
|
-
|
|
261
|
-
|
|
264
|
+
// sin^2(angular_distance_f64x2) = (cos(U2) * sin(l))^2 + (cos(U1) * sin(U2) - sin(U1) * cos(U2) * cos(l))^2
|
|
265
|
+
v128_t cross_term_f64x2 = wasm_f64x2_mul(cos_reduced_second_f64x2, sin_lambda_f64x2);
|
|
266
|
+
v128_t mixed_term_f64x2 = wasm_f64x2_sub(
|
|
267
|
+
wasm_f64x2_mul(cos_reduced_first_f64x2, sin_reduced_second_f64x2),
|
|
268
|
+
wasm_f64x2_mul(wasm_f64x2_mul(sin_reduced_first_f64x2, cos_reduced_second_f64x2), cos_lambda_f64x2));
|
|
269
|
+
v128_t sin_angular_dist_sq_f64x2 = wasm_f64x2_relaxed_madd(cross_term_f64x2, cross_term_f64x2,
|
|
270
|
+
wasm_f64x2_mul(mixed_term_f64x2, mixed_term_f64x2));
|
|
271
|
+
sin_angular_distance_f64x2 = wasm_f64x2_sqrt(sin_angular_dist_sq_f64x2);
|
|
262
272
|
|
|
263
|
-
// Check for coincident points (
|
|
264
|
-
|
|
273
|
+
// Check for coincident points (sin_angular_distance_f64x2 ~ 0)
|
|
274
|
+
coincident_mask_i64x2 = wasm_f64x2_lt(sin_angular_distance_f64x2, epsilon_f64x2);
|
|
265
275
|
|
|
266
|
-
// cos(
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
276
|
+
// cos(angular_distance_f64x2) = sin(U1) * sin(U2) + cos(U1) * cos(U2) * cos(l)
|
|
277
|
+
cos_angular_distance_f64x2 = wasm_f64x2_relaxed_madd(
|
|
278
|
+
wasm_f64x2_mul(cos_reduced_first_f64x2, cos_reduced_second_f64x2), cos_lambda_f64x2,
|
|
279
|
+
wasm_f64x2_mul(sin_reduced_first_f64x2, sin_reduced_second_f64x2));
|
|
270
280
|
|
|
271
|
-
//
|
|
272
|
-
|
|
281
|
+
// angular_distance_f64x2 = atan2(sin, cos)
|
|
282
|
+
angular_distance_f64x2 = nk_f64x2_atan2_v128relaxed_(sin_angular_distance_f64x2, cos_angular_distance_f64x2);
|
|
273
283
|
|
|
274
|
-
// sin(azimuth) = cos(U1) * cos(U2) * sin(l) / sin(
|
|
284
|
+
// sin(azimuth) = cos(U1) * cos(U2) * sin(l) / sin(angular_distance_f64x2)
|
|
275
285
|
// Avoid division by zero by using blending
|
|
276
286
|
// relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
|
|
277
287
|
// Safe because mask is from comparison (all-ones or all-zeros per lane).
|
|
278
|
-
v128_t
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
288
|
+
v128_t safe_sin_angular_i64x2 = wasm_i64x2_relaxed_laneselect(one_f64x2, sin_angular_distance_f64x2,
|
|
289
|
+
coincident_mask_i64x2);
|
|
290
|
+
sin_azimuth_f64x2 = wasm_f64x2_div(
|
|
291
|
+
wasm_f64x2_mul(wasm_f64x2_mul(cos_reduced_first_f64x2, cos_reduced_second_f64x2), sin_lambda_f64x2),
|
|
292
|
+
safe_sin_angular_i64x2);
|
|
293
|
+
cos_squared_azimuth_f64x2 = wasm_f64x2_relaxed_nmadd(sin_azimuth_f64x2, sin_azimuth_f64x2, one_f64x2);
|
|
282
294
|
|
|
283
295
|
// Handle equatorial case: cos^2(a) ~ 0
|
|
284
|
-
v128_t
|
|
285
|
-
v128_t
|
|
296
|
+
v128_t equatorial_mask_f64x2 = wasm_f64x2_lt(cos_squared_azimuth_f64x2, epsilon_f64x2);
|
|
297
|
+
v128_t safe_cos_sq_azimuth_i64x2 = wasm_i64x2_relaxed_laneselect(one_f64x2, cos_squared_azimuth_f64x2,
|
|
298
|
+
equatorial_mask_f64x2);
|
|
286
299
|
|
|
287
300
|
// cos(2sm) = cos(s) - 2 * sin(U1) * sin(U2) / cos^2(a)
|
|
288
|
-
v128_t
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
301
|
+
v128_t sin_product_f64x2 = wasm_f64x2_mul(sin_reduced_first_f64x2, sin_reduced_second_f64x2);
|
|
302
|
+
cos_double_angular_midpoint_f64x2 = wasm_f64x2_sub(
|
|
303
|
+
cos_angular_distance_f64x2,
|
|
304
|
+
wasm_f64x2_div(wasm_f64x2_mul(two_f64x2, sin_product_f64x2), safe_cos_sq_azimuth_i64x2));
|
|
305
|
+
cos_double_angular_midpoint_f64x2 = wasm_i64x2_relaxed_laneselect(
|
|
306
|
+
wasm_f64x2_splat(0.0), cos_double_angular_midpoint_f64x2, equatorial_mask_f64x2);
|
|
293
307
|
|
|
294
308
|
// C = f/16 * cos^2(a) * (4 + f*(4 - 3*cos^2(a)))
|
|
295
|
-
v128_t
|
|
296
|
-
wasm_f64x2_div(
|
|
309
|
+
v128_t correction_factor_f64x2 = wasm_f64x2_mul(
|
|
310
|
+
wasm_f64x2_div(flattening_f64x2, sixteen_f64x2),
|
|
297
311
|
wasm_f64x2_mul(
|
|
298
|
-
|
|
299
|
-
wasm_f64x2_relaxed_madd(
|
|
312
|
+
cos_squared_azimuth_f64x2,
|
|
313
|
+
wasm_f64x2_relaxed_madd(flattening_f64x2,
|
|
314
|
+
wasm_f64x2_relaxed_nmadd(three_f64x2, cos_squared_azimuth_f64x2, four_f64x2),
|
|
315
|
+
four_f64x2)));
|
|
300
316
|
|
|
301
317
|
// l' = L + (1-C) * f * sin(a) * (s + C * sin(s) * (cos(2sm) + C * cos(s) * (-1 + 2 * cos^2(2sm))))
|
|
302
|
-
v128_t
|
|
303
|
-
//
|
|
304
|
-
v128_t
|
|
305
|
-
//
|
|
306
|
-
v128_t
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
318
|
+
v128_t cos_2sm_sq_f64x2 = wasm_f64x2_mul(cos_double_angular_midpoint_f64x2, cos_double_angular_midpoint_f64x2);
|
|
319
|
+
// innermost_f64x2 = -1 + 2 * cos^2(2sm)
|
|
320
|
+
v128_t innermost_f64x2 = wasm_f64x2_relaxed_madd(two_f64x2, cos_2sm_sq_f64x2, wasm_f64x2_splat(-1.0));
|
|
321
|
+
// middle_f64x2 = cos(2sm) + C * cos(s) * innermost_f64x2
|
|
322
|
+
v128_t middle_f64x2 = wasm_f64x2_relaxed_madd(
|
|
323
|
+
wasm_f64x2_mul(correction_factor_f64x2, cos_angular_distance_f64x2), innermost_f64x2,
|
|
324
|
+
cos_double_angular_midpoint_f64x2);
|
|
325
|
+
// inner_f64x2 = C * sin(s) * middle_f64x2
|
|
326
|
+
v128_t inner_f64x2 = wasm_f64x2_mul(wasm_f64x2_mul(correction_factor_f64x2, sin_angular_distance_f64x2),
|
|
327
|
+
middle_f64x2);
|
|
328
|
+
|
|
329
|
+
// l' = L + (1-C) * f * sin_a * (s + inner_f64x2)
|
|
330
|
+
v128_t lambda_new_f64x2 = wasm_f64x2_relaxed_madd(
|
|
331
|
+
wasm_f64x2_mul(wasm_f64x2_mul(wasm_f64x2_sub(one_f64x2, correction_factor_f64x2), flattening_f64x2),
|
|
332
|
+
sin_azimuth_f64x2),
|
|
333
|
+
wasm_f64x2_add(angular_distance_f64x2, inner_f64x2), longitude_difference_f64x2);
|
|
315
334
|
|
|
316
335
|
// Check convergence: |l - l'| < threshold
|
|
317
|
-
v128_t
|
|
318
|
-
v128_t
|
|
319
|
-
v128_t
|
|
320
|
-
|
|
336
|
+
v128_t lambda_diff_f64x2 = wasm_f64x2_sub(lambda_new_f64x2, lambda_f64x2);
|
|
337
|
+
v128_t lambda_diff_abs_f64x2 = wasm_f64x2_abs(lambda_diff_f64x2);
|
|
338
|
+
v128_t newly_converged_f64x2 = wasm_f64x2_lt(lambda_diff_abs_f64x2, convergence_threshold_f64x2);
|
|
339
|
+
converged_mask_i64x2 = wasm_v128_or(converged_mask_i64x2, newly_converged_f64x2);
|
|
321
340
|
|
|
322
|
-
// Only update
|
|
341
|
+
// Only update lambda_f64x2 for non-converged lanes
|
|
323
342
|
// relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
|
|
324
343
|
// Safe because mask is from comparison (all-ones or all-zeros per lane).
|
|
325
|
-
|
|
344
|
+
lambda_f64x2 = wasm_i64x2_relaxed_laneselect(lambda_f64x2, lambda_new_f64x2, converged_mask_i64x2);
|
|
326
345
|
}
|
|
327
346
|
|
|
328
347
|
// Final distance calculation
|
|
329
348
|
// u^2 = cos^2(a) * (a^2 - b^2) / b^2
|
|
330
|
-
v128_t
|
|
331
|
-
v128_t
|
|
332
|
-
v128_t
|
|
349
|
+
v128_t a_sq_f64x2 = wasm_f64x2_mul(equatorial_radius_f64x2, equatorial_radius_f64x2);
|
|
350
|
+
v128_t b_sq_f64x2 = wasm_f64x2_mul(polar_radius_f64x2, polar_radius_f64x2);
|
|
351
|
+
v128_t u_squared_f64x2 = wasm_f64x2_div(
|
|
352
|
+
wasm_f64x2_mul(cos_squared_azimuth_f64x2, wasm_f64x2_sub(a_sq_f64x2, b_sq_f64x2)), b_sq_f64x2);
|
|
333
353
|
|
|
334
354
|
// A = 1 + u^2/16384 * (4096 + u^2*(-768 + u^2*(320 - 175*u^2)))
|
|
335
|
-
v128_t
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
355
|
+
v128_t series_a_f64x2 = wasm_f64x2_relaxed_madd(u_squared_f64x2, wasm_f64x2_splat(-175.0), wasm_f64x2_splat(320.0));
|
|
356
|
+
series_a_f64x2 = wasm_f64x2_relaxed_madd(u_squared_f64x2, series_a_f64x2, wasm_f64x2_splat(-768.0));
|
|
357
|
+
series_a_f64x2 = wasm_f64x2_relaxed_madd(u_squared_f64x2, series_a_f64x2, wasm_f64x2_splat(4096.0));
|
|
358
|
+
series_a_f64x2 = wasm_f64x2_relaxed_madd(wasm_f64x2_div(u_squared_f64x2, wasm_f64x2_splat(16384.0)), series_a_f64x2,
|
|
359
|
+
one_f64x2);
|
|
339
360
|
|
|
340
361
|
// B = u^2/1024 * (256 + u^2*(-128 + u^2*(74 - 47*u^2)))
|
|
341
|
-
v128_t
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
362
|
+
v128_t series_b_f64x2 = wasm_f64x2_relaxed_madd(u_squared_f64x2, wasm_f64x2_splat(-47.0), wasm_f64x2_splat(74.0));
|
|
363
|
+
series_b_f64x2 = wasm_f64x2_relaxed_madd(u_squared_f64x2, series_b_f64x2, wasm_f64x2_splat(-128.0));
|
|
364
|
+
series_b_f64x2 = wasm_f64x2_relaxed_madd(u_squared_f64x2, series_b_f64x2, wasm_f64x2_splat(256.0));
|
|
365
|
+
series_b_f64x2 = wasm_f64x2_mul(wasm_f64x2_div(u_squared_f64x2, wasm_f64x2_splat(1024.0)), series_b_f64x2);
|
|
345
366
|
|
|
346
367
|
// Delta-sigma calculation
|
|
347
|
-
v128_t
|
|
348
|
-
v128_t
|
|
349
|
-
v128_t
|
|
350
|
-
|
|
351
|
-
v128_t
|
|
352
|
-
v128_t
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
368
|
+
v128_t cos_2sm_sq_f64x2 = wasm_f64x2_mul(cos_double_angular_midpoint_f64x2, cos_double_angular_midpoint_f64x2);
|
|
369
|
+
v128_t sin_sq_f64x2 = wasm_f64x2_mul(sin_angular_distance_f64x2, sin_angular_distance_f64x2);
|
|
370
|
+
v128_t term1_f64x2 = wasm_f64x2_relaxed_madd(two_f64x2, cos_2sm_sq_f64x2, wasm_f64x2_splat(-1.0));
|
|
371
|
+
term1_f64x2 = wasm_f64x2_mul(cos_angular_distance_f64x2, term1_f64x2);
|
|
372
|
+
v128_t term2_f64x2 = wasm_f64x2_relaxed_madd(four_f64x2, sin_sq_f64x2, wasm_f64x2_splat(-3.0));
|
|
373
|
+
v128_t term3_f64x2 = wasm_f64x2_relaxed_madd(four_f64x2, cos_2sm_sq_f64x2, wasm_f64x2_splat(-3.0));
|
|
374
|
+
term2_f64x2 = wasm_f64x2_mul(
|
|
375
|
+
wasm_f64x2_mul(wasm_f64x2_div(series_b_f64x2, six_f64x2), cos_double_angular_midpoint_f64x2),
|
|
376
|
+
wasm_f64x2_mul(term2_f64x2, term3_f64x2));
|
|
377
|
+
v128_t delta_sigma_f64x2 = wasm_f64x2_mul(
|
|
378
|
+
series_b_f64x2, wasm_f64x2_mul(sin_angular_distance_f64x2,
|
|
379
|
+
wasm_f64x2_add(cos_double_angular_midpoint_f64x2,
|
|
380
|
+
wasm_f64x2_mul(wasm_f64x2_div(series_b_f64x2, four_f64x2),
|
|
381
|
+
wasm_f64x2_sub(term1_f64x2, term2_f64x2)))));
|
|
359
382
|
|
|
360
383
|
// s = b * A * (s - ds)
|
|
361
|
-
v128_t
|
|
362
|
-
|
|
384
|
+
v128_t distances_f64x2 = wasm_f64x2_mul(wasm_f64x2_mul(polar_radius_f64x2, series_a_f64x2),
|
|
385
|
+
wasm_f64x2_sub(angular_distance_f64x2, delta_sigma_f64x2));
|
|
363
386
|
|
|
364
387
|
// Set coincident points to zero
|
|
365
388
|
// relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
|
|
366
389
|
// Safe because mask is from comparison (all-ones or all-zeros per lane).
|
|
367
|
-
|
|
390
|
+
distances_f64x2 = wasm_i64x2_relaxed_laneselect(wasm_f64x2_splat(0.0), distances_f64x2, coincident_mask_i64x2);
|
|
368
391
|
|
|
369
|
-
return
|
|
392
|
+
return distances_f64x2;
|
|
370
393
|
}
|
|
371
394
|
|
|
372
395
|
NK_PUBLIC void nk_vincenty_f64_v128relaxed( //
|
|
@@ -375,14 +398,14 @@ NK_PUBLIC void nk_vincenty_f64_v128relaxed( //
|
|
|
375
398
|
nk_size_t n, nk_f64_t *results) {
|
|
376
399
|
|
|
377
400
|
while (n >= 2) {
|
|
378
|
-
v128_t
|
|
379
|
-
v128_t
|
|
380
|
-
v128_t
|
|
381
|
-
v128_t
|
|
401
|
+
v128_t first_latitudes_f64x2 = wasm_v128_load(a_lats);
|
|
402
|
+
v128_t first_longitudes_f64x2 = wasm_v128_load(a_lons);
|
|
403
|
+
v128_t second_latitudes_f64x2 = wasm_v128_load(b_lats);
|
|
404
|
+
v128_t second_longitudes_f64x2 = wasm_v128_load(b_lons);
|
|
382
405
|
|
|
383
|
-
v128_t
|
|
384
|
-
|
|
385
|
-
wasm_v128_store(results,
|
|
406
|
+
v128_t distances_f64x2 = nk_vincenty_f64x2_v128relaxed_(first_latitudes_f64x2, first_longitudes_f64x2,
|
|
407
|
+
second_latitudes_f64x2, second_longitudes_f64x2);
|
|
408
|
+
wasm_v128_store(results, distances_f64x2);
|
|
386
409
|
|
|
387
410
|
a_lats += 2, a_lons += 2, b_lats += 2, b_lons += 2, results += 2, n -= 2;
|
|
388
411
|
}
|
|
@@ -394,9 +417,9 @@ NK_PUBLIC void nk_vincenty_f64_v128relaxed( //
|
|
|
394
417
|
nk_partial_load_b64x2_serial_(a_lons, &a_lon_vec, n);
|
|
395
418
|
nk_partial_load_b64x2_serial_(b_lats, &b_lat_vec, n);
|
|
396
419
|
nk_partial_load_b64x2_serial_(b_lons, &b_lon_vec, n);
|
|
397
|
-
v128_t
|
|
398
|
-
|
|
399
|
-
result_vec.v128 =
|
|
420
|
+
v128_t distances_f64x2 = nk_vincenty_f64x2_v128relaxed_(a_lat_vec.v128, a_lon_vec.v128, b_lat_vec.v128,
|
|
421
|
+
b_lon_vec.v128);
|
|
422
|
+
result_vec.v128 = distances_f64x2;
|
|
400
423
|
nk_partial_store_b64x2_serial_(&result_vec, results, n);
|
|
401
424
|
}
|
|
402
425
|
}
|
|
@@ -405,168 +428,184 @@ NK_PUBLIC void nk_vincenty_f64_v128relaxed( //
|
|
|
405
428
|
* @brief WASM Relaxed SIMD helper for Vincenty's geodesic distance on 4 f32 point pairs.
|
|
406
429
|
* @note This is a true SIMD implementation using masked convergence tracking via blending.
|
|
407
430
|
*/
|
|
408
|
-
NK_INTERNAL v128_t nk_vincenty_f32x4_v128relaxed_(
|
|
409
|
-
v128_t
|
|
410
|
-
v128_t
|
|
411
|
-
|
|
412
|
-
v128_t const
|
|
413
|
-
v128_t const
|
|
414
|
-
v128_t const
|
|
415
|
-
v128_t const
|
|
416
|
-
v128_t const
|
|
417
|
-
v128_t const
|
|
418
|
-
v128_t const
|
|
419
|
-
v128_t const
|
|
420
|
-
v128_t const
|
|
421
|
-
v128_t const
|
|
422
|
-
v128_t const
|
|
431
|
+
NK_INTERNAL v128_t nk_vincenty_f32x4_v128relaxed_(               //
    v128_t first_latitudes_f32x4, v128_t first_longitudes_f32x4, //
    v128_t second_latitudes_f32x4, v128_t second_longitudes_f32x4) {
    // Computes Vincenty's inverse-formula distance for four point pairs at once, one pair per f32 lane.
    // The coordinates are passed straight to the sine/cosine helpers, so they are presumably in
    // radians - TODO confirm against callers. The result is `polar_radius * A * (sigma - delta_sigma)`,
    // i.e. expressed in whatever unit NK_EARTH_ELLIPSOID_POLAR_RADIUS uses.
    // NOTE(review): lanes whose final sin(sigma) is below epsilon are returned as 0. Near-antipodal
    // pairs also have sin(sigma) ~ 0, so they appear to be zeroed too - verify this is intended.

    v128_t const equatorial_radius_f32x4 = wasm_f32x4_splat((float)NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
    v128_t const polar_radius_f32x4 = wasm_f32x4_splat((float)NK_EARTH_ELLIPSOID_POLAR_RADIUS);
    v128_t const flattening_f32x4 = wasm_f32x4_splat(1.0f / (float)NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
    v128_t const convergence_threshold_f32x4 = wasm_f32x4_splat(NK_VINCENTY_CONVERGENCE_THRESHOLD_F32);
    v128_t const zero_f32x4 = wasm_f32x4_splat(0.0f);
    v128_t const one_f32x4 = wasm_f32x4_splat(1.0f);
    v128_t const minus_one_f32x4 = wasm_f32x4_splat(-1.0f);
    v128_t const two_f32x4 = wasm_f32x4_splat(2.0f);
    v128_t const three_f32x4 = wasm_f32x4_splat(3.0f);
    v128_t const minus_three_f32x4 = wasm_f32x4_splat(-3.0f);
    v128_t const four_f32x4 = wasm_f32x4_splat(4.0f);
    v128_t const six_f32x4 = wasm_f32x4_splat(6.0f);
    v128_t const sixteen_f32x4 = wasm_f32x4_splat(16.0f);
    v128_t const epsilon_f32x4 = wasm_f32x4_splat(1e-7f);

    // Longitude difference
    v128_t longitude_difference_f32x4 = wasm_f32x4_sub(second_longitudes_f32x4, first_longitudes_f32x4);

    // Reduced latitudes: tan(U) = (1-f) * tan(lat)
    v128_t one_minus_f_f32x4 = wasm_f32x4_sub(one_f32x4, flattening_f32x4);
    v128_t tan_first_f32x4 = wasm_f32x4_div(nk_f32x4_sin_v128relaxed_(first_latitudes_f32x4),
                                            nk_f32x4_cos_v128relaxed_(first_latitudes_f32x4));
    v128_t tan_second_f32x4 = wasm_f32x4_div(nk_f32x4_sin_v128relaxed_(second_latitudes_f32x4),
                                             nk_f32x4_cos_v128relaxed_(second_latitudes_f32x4));
    v128_t tan_reduced_first_f32x4 = wasm_f32x4_mul(one_minus_f_f32x4, tan_first_f32x4);
    v128_t tan_reduced_second_f32x4 = wasm_f32x4_mul(one_minus_f_f32x4, tan_second_f32x4);

    // cos(U) = 1/sqrt(1 + tan^2(U)), sin(U) = tan(U) * cos(U)
    v128_t cos_reduced_first_f32x4 = wasm_f32x4_div(
        one_f32x4,
        wasm_f32x4_sqrt(wasm_f32x4_relaxed_madd(tan_reduced_first_f32x4, tan_reduced_first_f32x4, one_f32x4)));
    v128_t sin_reduced_first_f32x4 = wasm_f32x4_mul(tan_reduced_first_f32x4, cos_reduced_first_f32x4);
    v128_t cos_reduced_second_f32x4 = wasm_f32x4_div(
        one_f32x4,
        wasm_f32x4_sqrt(wasm_f32x4_relaxed_madd(tan_reduced_second_f32x4, tan_reduced_second_f32x4, one_f32x4)));
    v128_t sin_reduced_second_f32x4 = wasm_f32x4_mul(tan_reduced_second_f32x4, cos_reduced_second_f32x4);

    // Products of the reduced-latitude terms do not depend on lambda, so compute them once
    // instead of on every iteration. Operands and operation order match the in-loop originals.
    v128_t cos_product_f32x4 = wasm_f32x4_mul(cos_reduced_first_f32x4, cos_reduced_second_f32x4);
    v128_t sin_product_f32x4 = wasm_f32x4_mul(sin_reduced_first_f32x4, sin_reduced_second_f32x4);
    v128_t cos_first_sin_second_f32x4 = wasm_f32x4_mul(cos_reduced_first_f32x4, sin_reduced_second_f32x4);
    v128_t sin_first_cos_second_f32x4 = wasm_f32x4_mul(sin_reduced_first_f32x4, cos_reduced_second_f32x4);
    v128_t two_sin_product_f32x4 = wasm_f32x4_mul(two_f32x4, sin_product_f32x4);
    v128_t flattening_over_sixteen_f32x4 = wasm_f32x4_div(flattening_f32x4, sixteen_f32x4);

    // Initialize lambda and the loop-carried state. Everything read after the loop starts from a
    // defined value, so the tail never touches an indeterminate vector even if the loop body never runs.
    v128_t lambda_f32x4 = longitude_difference_f32x4;
    v128_t sin_angular_distance_f32x4 = zero_f32x4;
    v128_t cos_angular_distance_f32x4 = zero_f32x4;
    v128_t angular_distance_f32x4 = zero_f32x4;
    v128_t sin_azimuth_f32x4 = zero_f32x4;
    v128_t cos_squared_azimuth_f32x4 = zero_f32x4;
    v128_t cos_double_angular_midpoint_f32x4 = zero_f32x4;

    // Track convergence and coincident points using masks
    v128_t converged_mask_i32x4 = wasm_i32x4_splat(0);
    v128_t coincident_mask_i32x4 = wasm_i32x4_splat(0);

    for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS; ++iteration) {
        // Check if all lanes converged
        if (wasm_i8x16_all_true(converged_mask_i32x4)) break;

        v128_t sin_lambda_f32x4 = nk_f32x4_sin_v128relaxed_(lambda_f32x4);
        v128_t cos_lambda_f32x4 = nk_f32x4_cos_v128relaxed_(lambda_f32x4);

        // sin^2(s) = (cos(U2) * sin(l))^2 + (cos(U1) * sin(U2) - sin(U1) * cos(U2) * cos(l))^2
        v128_t cross_term_f32x4 = wasm_f32x4_mul(cos_reduced_second_f32x4, sin_lambda_f32x4);
        v128_t mixed_term_f32x4 = wasm_f32x4_sub(cos_first_sin_second_f32x4,
                                                 wasm_f32x4_mul(sin_first_cos_second_f32x4, cos_lambda_f32x4));
        v128_t sin_angular_dist_sq_f32x4 = wasm_f32x4_relaxed_madd(cross_term_f32x4, cross_term_f32x4,
                                                                   wasm_f32x4_mul(mixed_term_f32x4, mixed_term_f32x4));
        sin_angular_distance_f32x4 = wasm_f32x4_sqrt(sin_angular_dist_sq_f32x4);

        // Check for coincident points (sin(s) ~ 0)
        coincident_mask_i32x4 = wasm_f32x4_lt(sin_angular_distance_f32x4, epsilon_f32x4);

        // cos(s) = sin(U1) * sin(U2) + cos(U1) * cos(U2) * cos(l)
        cos_angular_distance_f32x4 = wasm_f32x4_relaxed_madd(cos_product_f32x4, cos_lambda_f32x4, sin_product_f32x4);

        // s = atan2(sin(s), cos(s))
        angular_distance_f32x4 = nk_f32x4_atan2_v128relaxed_(sin_angular_distance_f32x4, cos_angular_distance_f32x4);

        // sin(azimuth) = cos(U1) * cos(U2) * sin(l) / sin(s)
        // Avoid division by zero by substituting 1 for the divisor in coincident lanes.
        // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
        // Safe because mask is from comparison (all-ones or all-zeros per lane).
        v128_t safe_sin_angular_f32x4 = wasm_i32x4_relaxed_laneselect(one_f32x4, sin_angular_distance_f32x4,
                                                                      coincident_mask_i32x4);
        sin_azimuth_f32x4 = wasm_f32x4_div(wasm_f32x4_mul(cos_product_f32x4, sin_lambda_f32x4),
                                           safe_sin_angular_f32x4);
        cos_squared_azimuth_f32x4 = wasm_f32x4_relaxed_nmadd(sin_azimuth_f32x4, sin_azimuth_f32x4, one_f32x4);

        // Handle equatorial case: cos^2(a) ~ 0
        v128_t equatorial_mask_i32x4 = wasm_f32x4_lt(cos_squared_azimuth_f32x4, epsilon_f32x4);
        v128_t safe_cos_sq_azimuth_f32x4 = wasm_i32x4_relaxed_laneselect(one_f32x4, cos_squared_azimuth_f32x4,
                                                                         equatorial_mask_i32x4);

        // cos(2sm) = cos(s) - 2 * sin(U1) * sin(U2) / cos^2(a), forced to 0 in equatorial lanes
        cos_double_angular_midpoint_f32x4 = wasm_f32x4_sub(
            cos_angular_distance_f32x4, wasm_f32x4_div(two_sin_product_f32x4, safe_cos_sq_azimuth_f32x4));
        cos_double_angular_midpoint_f32x4 = wasm_i32x4_relaxed_laneselect(
            zero_f32x4, cos_double_angular_midpoint_f32x4, equatorial_mask_i32x4);

        // C = f/16 * cos^2(a) * (4 + f*(4 - 3*cos^2(a)))
        v128_t correction_factor_f32x4 = wasm_f32x4_mul(
            flattening_over_sixteen_f32x4,
            wasm_f32x4_mul(
                cos_squared_azimuth_f32x4,
                wasm_f32x4_relaxed_madd(flattening_f32x4,
                                        wasm_f32x4_relaxed_nmadd(three_f32x4, cos_squared_azimuth_f32x4, four_f32x4),
                                        four_f32x4)));

        // l' = L + (1-C) * f * sin(a) * (s + C * sin(s) * (cos(2sm) + C * cos(s) * (-1 + 2 * cos^2(2sm))))
        v128_t cos_2sm_sq_f32x4 = wasm_f32x4_mul(cos_double_angular_midpoint_f32x4, cos_double_angular_midpoint_f32x4);
        // innermost = -1 + 2 * cos^2(2sm)
        v128_t innermost_f32x4 = wasm_f32x4_relaxed_madd(two_f32x4, cos_2sm_sq_f32x4, minus_one_f32x4);
        // middle = cos(2sm) + C * cos(s) * innermost
        v128_t middle_f32x4 = wasm_f32x4_relaxed_madd(
            wasm_f32x4_mul(correction_factor_f32x4, cos_angular_distance_f32x4), innermost_f32x4,
            cos_double_angular_midpoint_f32x4);
        // inner = C * sin(s) * middle
        v128_t inner_f32x4 = wasm_f32x4_mul(wasm_f32x4_mul(correction_factor_f32x4, sin_angular_distance_f32x4),
                                            middle_f32x4);

        // l' = L + (1-C) * f * sin(a) * (s + inner)
        v128_t lambda_new_f32x4 = wasm_f32x4_relaxed_madd(
            wasm_f32x4_mul(wasm_f32x4_mul(wasm_f32x4_sub(one_f32x4, correction_factor_f32x4), flattening_f32x4),
                           sin_azimuth_f32x4),
            wasm_f32x4_add(angular_distance_f32x4, inner_f32x4), longitude_difference_f32x4);

        // Check convergence: |l - l'| < threshold. Once a lane converges it stays converged.
        v128_t lambda_diff_f32x4 = wasm_f32x4_sub(lambda_new_f32x4, lambda_f32x4);
        v128_t lambda_diff_abs_f32x4 = wasm_f32x4_abs(lambda_diff_f32x4);
        v128_t newly_converged_i32x4 = wasm_f32x4_lt(lambda_diff_abs_f32x4, convergence_threshold_f32x4);
        converged_mask_i32x4 = wasm_v128_or(converged_mask_i32x4, newly_converged_i32x4);

        // Only update lambda for non-converged lanes
        // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
        // Safe because mask is from comparison (all-ones or all-zeros per lane).
        lambda_f32x4 = wasm_i32x4_relaxed_laneselect(lambda_f32x4, lambda_new_f32x4, converged_mask_i32x4);
    }

    // Final distance calculation
    // u^2 = cos^2(a) * (a^2 - b^2) / b^2
    v128_t a_sq_f32x4 = wasm_f32x4_mul(equatorial_radius_f32x4, equatorial_radius_f32x4);
    v128_t b_sq_f32x4 = wasm_f32x4_mul(polar_radius_f32x4, polar_radius_f32x4);
    v128_t u_squared_f32x4 = wasm_f32x4_div(
        wasm_f32x4_mul(cos_squared_azimuth_f32x4, wasm_f32x4_sub(a_sq_f32x4, b_sq_f32x4)), b_sq_f32x4);

    // A = 1 + u^2/16384 * (4096 + u^2*(-768 + u^2*(320 - 175*u^2)))
    v128_t series_a_f32x4 = wasm_f32x4_relaxed_madd(u_squared_f32x4, wasm_f32x4_splat(-175.0f),
                                                    wasm_f32x4_splat(320.0f));
    series_a_f32x4 = wasm_f32x4_relaxed_madd(u_squared_f32x4, series_a_f32x4, wasm_f32x4_splat(-768.0f));
    series_a_f32x4 = wasm_f32x4_relaxed_madd(u_squared_f32x4, series_a_f32x4, wasm_f32x4_splat(4096.0f));
    series_a_f32x4 = wasm_f32x4_relaxed_madd(wasm_f32x4_div(u_squared_f32x4, wasm_f32x4_splat(16384.0f)),
                                             series_a_f32x4, one_f32x4);

    // B = u^2/1024 * (256 + u^2*(-128 + u^2*(74 - 47*u^2)))
    v128_t series_b_f32x4 = wasm_f32x4_relaxed_madd(u_squared_f32x4, wasm_f32x4_splat(-47.0f),
                                                    wasm_f32x4_splat(74.0f));
    series_b_f32x4 = wasm_f32x4_relaxed_madd(u_squared_f32x4, series_b_f32x4, wasm_f32x4_splat(-128.0f));
    series_b_f32x4 = wasm_f32x4_relaxed_madd(u_squared_f32x4, series_b_f32x4, wasm_f32x4_splat(256.0f));
    series_b_f32x4 = wasm_f32x4_mul(wasm_f32x4_div(u_squared_f32x4, wasm_f32x4_splat(1024.0f)), series_b_f32x4);

    // Delta-sigma calculation
    // ds = B * sin(s) * (cos(2sm) + B/4 * (cos(s) * (-1 + 2*cos^2(2sm))
    //                                     - B/6 * cos(2sm) * (-3 + 4*sin^2(s)) * (-3 + 4*cos^2(2sm))))
    v128_t cos_2sm_sq_f32x4 = wasm_f32x4_mul(cos_double_angular_midpoint_f32x4, cos_double_angular_midpoint_f32x4);
    v128_t sin_sq_f32x4 = wasm_f32x4_mul(sin_angular_distance_f32x4, sin_angular_distance_f32x4);
    v128_t term1_f32x4 = wasm_f32x4_relaxed_madd(two_f32x4, cos_2sm_sq_f32x4, minus_one_f32x4);
    term1_f32x4 = wasm_f32x4_mul(cos_angular_distance_f32x4, term1_f32x4);
    v128_t term2_f32x4 = wasm_f32x4_relaxed_madd(four_f32x4, sin_sq_f32x4, minus_three_f32x4);
    v128_t term3_f32x4 = wasm_f32x4_relaxed_madd(four_f32x4, cos_2sm_sq_f32x4, minus_three_f32x4);
    term2_f32x4 = wasm_f32x4_mul(
        wasm_f32x4_mul(wasm_f32x4_div(series_b_f32x4, six_f32x4), cos_double_angular_midpoint_f32x4),
        wasm_f32x4_mul(term2_f32x4, term3_f32x4));
    v128_t delta_sigma_f32x4 = wasm_f32x4_mul(
        series_b_f32x4, wasm_f32x4_mul(sin_angular_distance_f32x4,
                                       wasm_f32x4_add(cos_double_angular_midpoint_f32x4,
                                                      wasm_f32x4_mul(wasm_f32x4_div(series_b_f32x4, four_f32x4),
                                                                     wasm_f32x4_sub(term1_f32x4, term2_f32x4)))));

    // s = b * A * (s - ds)
    v128_t distances_f32x4 = wasm_f32x4_mul(wasm_f32x4_mul(polar_radius_f32x4, series_a_f32x4),
                                            wasm_f32x4_sub(angular_distance_f32x4, delta_sigma_f32x4));

    // Set coincident points to zero
    // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
    // Safe because mask is from comparison (all-ones or all-zeros per lane).
    distances_f32x4 = wasm_i32x4_relaxed_laneselect(zero_f32x4, distances_f32x4, coincident_mask_i32x4);

    return distances_f32x4;
}
|
|
571
610
|
|
|
572
611
|
NK_PUBLIC void nk_vincenty_f32_v128relaxed( //
|
|
@@ -575,14 +614,14 @@ NK_PUBLIC void nk_vincenty_f32_v128relaxed( //
|
|
|
575
614
|
nk_size_t n, nk_f32_t *results) {
|
|
576
615
|
|
|
577
616
|
while (n >= 4) {
|
|
578
|
-
v128_t
|
|
579
|
-
v128_t
|
|
580
|
-
v128_t
|
|
581
|
-
v128_t
|
|
617
|
+
v128_t first_latitudes_f32x4 = wasm_v128_load(a_lats);
|
|
618
|
+
v128_t first_longitudes_f32x4 = wasm_v128_load(a_lons);
|
|
619
|
+
v128_t second_latitudes_f32x4 = wasm_v128_load(b_lats);
|
|
620
|
+
v128_t second_longitudes_f32x4 = wasm_v128_load(b_lons);
|
|
582
621
|
|
|
583
|
-
v128_t
|
|
584
|
-
|
|
585
|
-
wasm_v128_store(results,
|
|
622
|
+
v128_t distances_f32x4 = nk_vincenty_f32x4_v128relaxed_(first_latitudes_f32x4, first_longitudes_f32x4,
|
|
623
|
+
second_latitudes_f32x4, second_longitudes_f32x4);
|
|
624
|
+
wasm_v128_store(results, distances_f32x4);
|
|
586
625
|
|
|
587
626
|
a_lats += 4, a_lons += 4, b_lats += 4, b_lons += 4, results += 4, n -= 4;
|
|
588
627
|
}
|
|
@@ -594,9 +633,9 @@ NK_PUBLIC void nk_vincenty_f32_v128relaxed( //
|
|
|
594
633
|
nk_partial_load_b32x4_serial_(a_lons, &a_lon_vec, n);
|
|
595
634
|
nk_partial_load_b32x4_serial_(b_lats, &b_lat_vec, n);
|
|
596
635
|
nk_partial_load_b32x4_serial_(b_lons, &b_lon_vec, n);
|
|
597
|
-
v128_t
|
|
598
|
-
|
|
599
|
-
result_vec.v128 =
|
|
636
|
+
v128_t distances_f32x4 = nk_vincenty_f32x4_v128relaxed_(a_lat_vec.v128, a_lon_vec.v128, b_lat_vec.v128,
|
|
637
|
+
b_lon_vec.v128);
|
|
638
|
+
result_vec.v128 = distances_f32x4;
|
|
600
639
|
nk_partial_store_b32x4_serial_(&result_vec, results, n);
|
|
601
640
|
}
|
|
602
641
|
}
|