numkong 7.0.0 → 7.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +239 -122
- package/binding.gyp +25 -491
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
|
@@ -8,13 +8,14 @@
|
|
|
8
8
|
*
|
|
9
9
|
* @section geospatial_haswell_instructions Key AVX2 Geospatial Instructions
|
|
10
10
|
*
|
|
11
|
-
* Intrinsic
|
|
12
|
-
* _mm256_sqrt_ps
|
|
13
|
-
* _mm256_sqrt_pd
|
|
14
|
-
* _mm256_div_ps
|
|
15
|
-
* _mm256_div_pd
|
|
16
|
-
* _mm256_fmadd_ps
|
|
17
|
-
* _mm256_fmadd_pd
|
|
11
|
+
* Intrinsic Instruction Icelake Genoa
|
|
12
|
+
* _mm256_sqrt_ps VSQRTPS (YMM, YMM) 12cy @ p0 15cy @ p01
|
|
13
|
+
* _mm256_sqrt_pd VSQRTPD (YMM, YMM) 13cy @ p0 21cy @ p01
|
|
14
|
+
* _mm256_div_ps VDIVPS (YMM, YMM, YMM) 11cy @ p0 11cy @ p01
|
|
15
|
+
* _mm256_div_pd VDIVPD (YMM, YMM, YMM) 13cy @ p0 13cy @ p01
|
|
16
|
+
* _mm256_fmadd_ps VFMADD231PS (YMM, YMM, YMM) 4cy @ p01 4cy @ p01
|
|
17
|
+
* _mm256_fmadd_pd VFMADD231PD (YMM, YMM, YMM) 4cy @ p01 4cy @ p01
|
|
18
|
+
* _mm256_cmp_ps VCMPPS (YMM, YMM, YMM, I8) 3cy @ p01 3cy @ p01
|
|
18
19
|
*/
|
|
19
20
|
#ifndef NK_GEOSPATIAL_HASWELL_H
|
|
20
21
|
#define NK_GEOSPATIAL_HASWELL_H
|
|
@@ -40,44 +41,48 @@ extern "C" {
|
|
|
40
41
|
* These require AVX2 trigonometric kernels from trigonometry.h.
|
|
41
42
|
*/
|
|
42
43
|
|
|
43
|
-
NK_INTERNAL __m256d nk_haversine_f64x4_haswell_(
|
|
44
|
-
__m256d
|
|
45
|
-
__m256d
|
|
44
|
+
NK_INTERNAL __m256d nk_haversine_f64x4_haswell_( //
|
|
45
|
+
__m256d first_latitudes_f64x4, __m256d first_longitudes_f64x4, //
|
|
46
|
+
__m256d second_latitudes_f64x4, __m256d second_longitudes_f64x4) {
|
|
46
47
|
|
|
47
|
-
__m256d const
|
|
48
|
-
__m256d const
|
|
49
|
-
__m256d const
|
|
50
|
-
__m256d const
|
|
48
|
+
__m256d const earth_radius_f64x4 = _mm256_set1_pd(NK_EARTH_MEDIATORIAL_RADIUS);
|
|
49
|
+
__m256d const half_f64x4 = _mm256_set1_pd(0.5);
|
|
50
|
+
__m256d const one_f64x4 = _mm256_set1_pd(1.0);
|
|
51
|
+
__m256d const two_f64x4 = _mm256_set1_pd(2.0);
|
|
51
52
|
|
|
52
|
-
__m256d
|
|
53
|
-
__m256d
|
|
53
|
+
__m256d latitude_delta_f64x4 = _mm256_sub_pd(second_latitudes_f64x4, first_latitudes_f64x4);
|
|
54
|
+
__m256d longitude_delta_f64x4 = _mm256_sub_pd(second_longitudes_f64x4, first_longitudes_f64x4);
|
|
54
55
|
|
|
55
56
|
// Haversine terms: sin²(Δ/2)
|
|
56
|
-
__m256d
|
|
57
|
-
__m256d
|
|
58
|
-
__m256d
|
|
59
|
-
__m256d
|
|
60
|
-
__m256d
|
|
61
|
-
|
|
57
|
+
__m256d latitude_delta_half_f64x4 = _mm256_mul_pd(latitude_delta_f64x4, half_f64x4);
|
|
58
|
+
__m256d longitude_delta_half_f64x4 = _mm256_mul_pd(longitude_delta_f64x4, half_f64x4);
|
|
59
|
+
__m256d sin_latitude_delta_half_f64x4 = nk_sin_f64x4_haswell_(latitude_delta_half_f64x4);
|
|
60
|
+
__m256d sin_longitude_delta_half_f64x4 = nk_sin_f64x4_haswell_(longitude_delta_half_f64x4);
|
|
61
|
+
__m256d sin_squared_latitude_delta_half_f64x4 = _mm256_mul_pd(sin_latitude_delta_half_f64x4,
|
|
62
|
+
sin_latitude_delta_half_f64x4);
|
|
63
|
+
__m256d sin_squared_longitude_delta_half_f64x4 = _mm256_mul_pd(sin_longitude_delta_half_f64x4,
|
|
64
|
+
sin_longitude_delta_half_f64x4);
|
|
62
65
|
|
|
63
66
|
// Latitude cosine product
|
|
64
|
-
__m256d
|
|
65
|
-
__m256d
|
|
66
|
-
__m256d
|
|
67
|
+
__m256d cos_first_latitude_f64x4 = nk_cos_f64x4_haswell_(first_latitudes_f64x4);
|
|
68
|
+
__m256d cos_second_latitude_f64x4 = nk_cos_f64x4_haswell_(second_latitudes_f64x4);
|
|
69
|
+
__m256d cos_latitude_product_f64x4 = _mm256_mul_pd(cos_first_latitude_f64x4, cos_second_latitude_f64x4);
|
|
67
70
|
|
|
68
71
|
// a = sin²(Δlat/2) + cos(lat1) × cos(lat2) × sin²(Δlon/2)
|
|
69
|
-
__m256d
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
72
|
+
__m256d haversine_term_f64x4 = _mm256_add_pd(
|
|
73
|
+
sin_squared_latitude_delta_half_f64x4,
|
|
74
|
+
_mm256_mul_pd(cos_latitude_product_f64x4, sin_squared_longitude_delta_half_f64x4));
|
|
75
|
+
// Clamp haversine_term_f64x4 to [0, 1] to prevent NaN from sqrt of negative values
|
|
76
|
+
__m256d zero_f64x4 = _mm256_setzero_pd();
|
|
77
|
+
haversine_term_f64x4 = _mm256_max_pd(zero_f64x4, _mm256_min_pd(one_f64x4, haversine_term_f64x4));
|
|
74
78
|
|
|
75
79
|
// Central angle: c = 2 × atan2(√a, √(1-a))
|
|
76
|
-
__m256d
|
|
77
|
-
__m256d
|
|
78
|
-
__m256d
|
|
80
|
+
__m256d sqrt_haversine_f64x4 = _mm256_sqrt_pd(haversine_term_f64x4);
|
|
81
|
+
__m256d sqrt_complement_f64x4 = _mm256_sqrt_pd(_mm256_sub_pd(one_f64x4, haversine_term_f64x4));
|
|
82
|
+
__m256d central_angle_f64x4 = _mm256_mul_pd(two_f64x4,
|
|
83
|
+
nk_atan2_f64x4_haswell_(sqrt_haversine_f64x4, sqrt_complement_f64x4));
|
|
79
84
|
|
|
80
|
-
return _mm256_mul_pd(
|
|
85
|
+
return _mm256_mul_pd(earth_radius_f64x4, central_angle_f64x4);
|
|
81
86
|
}
|
|
82
87
|
|
|
83
88
|
NK_PUBLIC void nk_haversine_f64_haswell( //
|
|
@@ -86,14 +91,14 @@ NK_PUBLIC void nk_haversine_f64_haswell( //
|
|
|
86
91
|
nk_size_t n, nk_f64_t *results) {
|
|
87
92
|
|
|
88
93
|
while (n >= 4) {
|
|
89
|
-
__m256d
|
|
90
|
-
__m256d
|
|
91
|
-
__m256d
|
|
92
|
-
__m256d
|
|
94
|
+
__m256d first_latitudes_f64x4 = _mm256_loadu_pd(a_lats);
|
|
95
|
+
__m256d first_longitudes_f64x4 = _mm256_loadu_pd(a_lons);
|
|
96
|
+
__m256d second_latitudes_f64x4 = _mm256_loadu_pd(b_lats);
|
|
97
|
+
__m256d second_longitudes_f64x4 = _mm256_loadu_pd(b_lons);
|
|
93
98
|
|
|
94
|
-
__m256d
|
|
95
|
-
|
|
96
|
-
_mm256_storeu_pd(results,
|
|
99
|
+
__m256d distances_f64x4 = nk_haversine_f64x4_haswell_(first_latitudes_f64x4, first_longitudes_f64x4,
|
|
100
|
+
second_latitudes_f64x4, second_longitudes_f64x4);
|
|
101
|
+
_mm256_storeu_pd(results, distances_f64x4);
|
|
97
102
|
|
|
98
103
|
a_lats += 4, a_lons += 4, b_lats += 4, b_lons += 4, results += 4, n -= 4;
|
|
99
104
|
}
|
|
@@ -105,52 +110,56 @@ NK_PUBLIC void nk_haversine_f64_haswell( //
|
|
|
105
110
|
nk_partial_load_b64x4_haswell_(a_lons, &a_lon_vec, n);
|
|
106
111
|
nk_partial_load_b64x4_haswell_(b_lats, &b_lat_vec, n);
|
|
107
112
|
nk_partial_load_b64x4_haswell_(b_lons, &b_lon_vec, n);
|
|
108
|
-
__m256d
|
|
109
|
-
|
|
110
|
-
result_vec.ymm_pd =
|
|
113
|
+
__m256d distances_f64x4 = nk_haversine_f64x4_haswell_(a_lat_vec.ymm_pd, a_lon_vec.ymm_pd, b_lat_vec.ymm_pd,
|
|
114
|
+
b_lon_vec.ymm_pd);
|
|
115
|
+
result_vec.ymm_pd = distances_f64x4;
|
|
111
116
|
nk_partial_store_b64x4_haswell_(&result_vec, results, n);
|
|
112
117
|
}
|
|
113
118
|
}
|
|
114
119
|
|
|
115
|
-
NK_INTERNAL __m256 nk_haversine_f32x8_haswell_(
|
|
116
|
-
__m256
|
|
117
|
-
__m256
|
|
120
|
+
NK_INTERNAL __m256 nk_haversine_f32x8_haswell_( //
|
|
121
|
+
__m256 first_latitudes_f32x8, __m256 first_longitudes_f32x8, //
|
|
122
|
+
__m256 second_latitudes_f32x8, __m256 second_longitudes_f32x8) {
|
|
118
123
|
|
|
119
|
-
__m256 const
|
|
120
|
-
__m256 const
|
|
121
|
-
__m256 const
|
|
122
|
-
__m256 const
|
|
124
|
+
__m256 const earth_radius_f32x8 = _mm256_set1_ps((float)NK_EARTH_MEDIATORIAL_RADIUS);
|
|
125
|
+
__m256 const half_f32x8 = _mm256_set1_ps(0.5f);
|
|
126
|
+
__m256 const one_f32x8 = _mm256_set1_ps(1.0f);
|
|
127
|
+
__m256 const two_f32x8 = _mm256_set1_ps(2.0f);
|
|
123
128
|
|
|
124
|
-
__m256
|
|
125
|
-
__m256
|
|
129
|
+
__m256 latitude_delta_f32x8 = _mm256_sub_ps(second_latitudes_f32x8, first_latitudes_f32x8);
|
|
130
|
+
__m256 longitude_delta_f32x8 = _mm256_sub_ps(second_longitudes_f32x8, first_longitudes_f32x8);
|
|
126
131
|
|
|
127
132
|
// Haversine terms: sin²(Δ/2)
|
|
128
|
-
__m256
|
|
129
|
-
__m256
|
|
130
|
-
__m256
|
|
131
|
-
__m256
|
|
132
|
-
__m256
|
|
133
|
-
|
|
133
|
+
__m256 latitude_delta_half_f32x8 = _mm256_mul_ps(latitude_delta_f32x8, half_f32x8);
|
|
134
|
+
__m256 longitude_delta_half_f32x8 = _mm256_mul_ps(longitude_delta_f32x8, half_f32x8);
|
|
135
|
+
__m256 sin_latitude_delta_half_f32x8 = nk_sin_f32x8_haswell_(latitude_delta_half_f32x8);
|
|
136
|
+
__m256 sin_longitude_delta_half_f32x8 = nk_sin_f32x8_haswell_(longitude_delta_half_f32x8);
|
|
137
|
+
__m256 sin_squared_latitude_delta_half_f32x8 = _mm256_mul_ps(sin_latitude_delta_half_f32x8,
|
|
138
|
+
sin_latitude_delta_half_f32x8);
|
|
139
|
+
__m256 sin_squared_longitude_delta_half_f32x8 = _mm256_mul_ps(sin_longitude_delta_half_f32x8,
|
|
140
|
+
sin_longitude_delta_half_f32x8);
|
|
134
141
|
|
|
135
142
|
// Latitude cosine product
|
|
136
|
-
__m256
|
|
137
|
-
__m256
|
|
138
|
-
__m256
|
|
143
|
+
__m256 cos_first_latitude_f32x8 = nk_cos_f32x8_haswell_(first_latitudes_f32x8);
|
|
144
|
+
__m256 cos_second_latitude_f32x8 = nk_cos_f32x8_haswell_(second_latitudes_f32x8);
|
|
145
|
+
__m256 cos_latitude_product_f32x8 = _mm256_mul_ps(cos_first_latitude_f32x8, cos_second_latitude_f32x8);
|
|
139
146
|
|
|
140
147
|
// a = sin²(Δlat/2) + cos(lat1) × cos(lat2) × sin²(Δlon/2)
|
|
141
|
-
__m256
|
|
142
|
-
|
|
148
|
+
__m256 haversine_term_f32x8 = _mm256_add_ps(
|
|
149
|
+
sin_squared_latitude_delta_half_f32x8,
|
|
150
|
+
_mm256_mul_ps(cos_latitude_product_f32x8, sin_squared_longitude_delta_half_f32x8));
|
|
143
151
|
|
|
144
152
|
// Clamp to [0, 1] to avoid NaN from sqrt of negative numbers (due to floating point errors)
|
|
145
|
-
__m256
|
|
146
|
-
|
|
153
|
+
__m256 zero_f32x8 = _mm256_setzero_ps();
|
|
154
|
+
haversine_term_f32x8 = _mm256_max_ps(zero_f32x8, _mm256_min_ps(one_f32x8, haversine_term_f32x8));
|
|
147
155
|
|
|
148
156
|
// Central angle: c = 2 × atan2(√a, √(1-a))
|
|
149
|
-
__m256
|
|
150
|
-
__m256
|
|
151
|
-
__m256
|
|
157
|
+
__m256 sqrt_haversine_f32x8 = _mm256_sqrt_ps(haversine_term_f32x8);
|
|
158
|
+
__m256 sqrt_complement_f32x8 = _mm256_sqrt_ps(_mm256_sub_ps(one_f32x8, haversine_term_f32x8));
|
|
159
|
+
__m256 central_angle_f32x8 = _mm256_mul_ps(two_f32x8,
|
|
160
|
+
nk_atan2_f32x8_haswell_(sqrt_haversine_f32x8, sqrt_complement_f32x8));
|
|
152
161
|
|
|
153
|
-
return _mm256_mul_ps(
|
|
162
|
+
return _mm256_mul_ps(earth_radius_f32x8, central_angle_f32x8);
|
|
154
163
|
}
|
|
155
164
|
|
|
156
165
|
NK_PUBLIC void nk_haversine_f32_haswell( //
|
|
@@ -159,14 +168,14 @@ NK_PUBLIC void nk_haversine_f32_haswell( //
|
|
|
159
168
|
nk_size_t n, nk_f32_t *results) {
|
|
160
169
|
|
|
161
170
|
while (n >= 8) {
|
|
162
|
-
__m256
|
|
163
|
-
__m256
|
|
164
|
-
__m256
|
|
165
|
-
__m256
|
|
171
|
+
__m256 first_latitudes_f32x8 = _mm256_loadu_ps(a_lats);
|
|
172
|
+
__m256 first_longitudes_f32x8 = _mm256_loadu_ps(a_lons);
|
|
173
|
+
__m256 second_latitudes_f32x8 = _mm256_loadu_ps(b_lats);
|
|
174
|
+
__m256 second_longitudes_f32x8 = _mm256_loadu_ps(b_lons);
|
|
166
175
|
|
|
167
|
-
__m256
|
|
168
|
-
|
|
169
|
-
_mm256_storeu_ps(results,
|
|
176
|
+
__m256 distances_f32x8 = nk_haversine_f32x8_haswell_(first_latitudes_f32x8, first_longitudes_f32x8,
|
|
177
|
+
second_latitudes_f32x8, second_longitudes_f32x8);
|
|
178
|
+
_mm256_storeu_ps(results, distances_f32x8);
|
|
170
179
|
|
|
171
180
|
a_lats += 8, a_lons += 8, b_lats += 8, b_lons += 8, results += 8, n -= 8;
|
|
172
181
|
}
|
|
@@ -178,9 +187,9 @@ NK_PUBLIC void nk_haversine_f32_haswell( //
|
|
|
178
187
|
nk_partial_load_b32x8_serial_(a_lons, &a_lon_vec, n);
|
|
179
188
|
nk_partial_load_b32x8_serial_(b_lats, &b_lat_vec, n);
|
|
180
189
|
nk_partial_load_b32x8_serial_(b_lons, &b_lon_vec, n);
|
|
181
|
-
__m256
|
|
182
|
-
|
|
183
|
-
result_vec.ymm_ps =
|
|
190
|
+
__m256 distances_f32x8 = nk_haversine_f32x8_haswell_(a_lat_vec.ymm_ps, a_lon_vec.ymm_ps, b_lat_vec.ymm_ps,
|
|
191
|
+
b_lon_vec.ymm_ps);
|
|
192
|
+
result_vec.ymm_ps = distances_f32x8;
|
|
184
193
|
nk_partial_store_b32x8_serial_(&result_vec, results, n);
|
|
185
194
|
}
|
|
186
195
|
}
|
|
@@ -189,165 +198,180 @@ NK_PUBLIC void nk_haversine_f32_haswell( //
|
|
|
189
198
|
* @brief AVX2 helper for Vincenty's geodesic distance on 4 f64 point pairs.
|
|
190
199
|
* @note This is a true SIMD implementation using masked convergence tracking via blending.
|
|
191
200
|
*/
|
|
192
|
-
NK_INTERNAL __m256d nk_vincenty_f64x4_haswell_(
|
|
193
|
-
__m256d
|
|
194
|
-
__m256d
|
|
195
|
-
|
|
196
|
-
__m256d const
|
|
197
|
-
__m256d const
|
|
198
|
-
__m256d const
|
|
199
|
-
__m256d const
|
|
200
|
-
__m256d const
|
|
201
|
-
__m256d const
|
|
202
|
-
__m256d const
|
|
203
|
-
__m256d const
|
|
204
|
-
__m256d const
|
|
205
|
-
__m256d const
|
|
206
|
-
__m256d const
|
|
201
|
+
NK_INTERNAL __m256d nk_vincenty_f64x4_haswell_( //
|
|
202
|
+
__m256d first_latitudes_f64x4, __m256d first_longitudes_f64x4, //
|
|
203
|
+
__m256d second_latitudes_f64x4, __m256d second_longitudes_f64x4) {
|
|
204
|
+
|
|
205
|
+
__m256d const equatorial_radius_f64x4 = _mm256_set1_pd(NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
|
|
206
|
+
__m256d const polar_radius_f64x4 = _mm256_set1_pd(NK_EARTH_ELLIPSOID_POLAR_RADIUS);
|
|
207
|
+
__m256d const flattening_f64x4 = _mm256_set1_pd(1.0 / NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
|
|
208
|
+
__m256d const convergence_threshold_f64x4 = _mm256_set1_pd(NK_VINCENTY_CONVERGENCE_THRESHOLD_F64);
|
|
209
|
+
__m256d const one_f64x4 = _mm256_set1_pd(1.0);
|
|
210
|
+
__m256d const two_f64x4 = _mm256_set1_pd(2.0);
|
|
211
|
+
__m256d const three_f64x4 = _mm256_set1_pd(3.0);
|
|
212
|
+
__m256d const four_f64x4 = _mm256_set1_pd(4.0);
|
|
213
|
+
__m256d const six_f64x4 = _mm256_set1_pd(6.0);
|
|
214
|
+
__m256d const sixteen_f64x4 = _mm256_set1_pd(16.0);
|
|
215
|
+
__m256d const epsilon_f64x4 = _mm256_set1_pd(1e-15);
|
|
207
216
|
|
|
208
217
|
// Longitude difference
|
|
209
|
-
__m256d
|
|
218
|
+
__m256d longitude_difference_f64x4 = _mm256_sub_pd(second_longitudes_f64x4, first_longitudes_f64x4);
|
|
210
219
|
|
|
211
220
|
// Reduced latitudes: tan(U) = (1-f) * tan(lat)
|
|
212
|
-
__m256d
|
|
213
|
-
__m256d
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
__m256d
|
|
221
|
+
__m256d one_minus_f_f64x4 = _mm256_sub_pd(one_f64x4, flattening_f64x4);
|
|
222
|
+
__m256d tan_first_f64x4 = _mm256_div_pd(nk_sin_f64x4_haswell_(first_latitudes_f64x4),
|
|
223
|
+
nk_cos_f64x4_haswell_(first_latitudes_f64x4));
|
|
224
|
+
__m256d tan_second_f64x4 = _mm256_div_pd(nk_sin_f64x4_haswell_(second_latitudes_f64x4),
|
|
225
|
+
nk_cos_f64x4_haswell_(second_latitudes_f64x4));
|
|
226
|
+
__m256d tan_reduced_first_f64x4 = _mm256_mul_pd(one_minus_f_f64x4, tan_first_f64x4);
|
|
227
|
+
__m256d tan_reduced_second_f64x4 = _mm256_mul_pd(one_minus_f_f64x4, tan_second_f64x4);
|
|
218
228
|
|
|
219
229
|
// cos(U) = 1/√(1 + tan²(U)), sin(U) = tan(U) × cos(U)
|
|
220
|
-
__m256d
|
|
221
|
-
|
|
222
|
-
__m256d
|
|
223
|
-
__m256d
|
|
224
|
-
|
|
225
|
-
__m256d
|
|
226
|
-
|
|
227
|
-
// Initialize
|
|
228
|
-
__m256d
|
|
229
|
-
__m256d
|
|
230
|
-
__m256d
|
|
230
|
+
__m256d cos_reduced_first_f64x4 = _mm256_div_pd(
|
|
231
|
+
one_f64x4, _mm256_sqrt_pd(_mm256_fmadd_pd(tan_reduced_first_f64x4, tan_reduced_first_f64x4, one_f64x4)));
|
|
232
|
+
__m256d sin_reduced_first_f64x4 = _mm256_mul_pd(tan_reduced_first_f64x4, cos_reduced_first_f64x4);
|
|
233
|
+
__m256d cos_reduced_second_f64x4 = _mm256_div_pd(
|
|
234
|
+
one_f64x4, _mm256_sqrt_pd(_mm256_fmadd_pd(tan_reduced_second_f64x4, tan_reduced_second_f64x4, one_f64x4)));
|
|
235
|
+
__m256d sin_reduced_second_f64x4 = _mm256_mul_pd(tan_reduced_second_f64x4, cos_reduced_second_f64x4);
|
|
236
|
+
|
|
237
|
+
// Initialize lambda_f64x4 and tracking variables
|
|
238
|
+
__m256d lambda_f64x4 = longitude_difference_f64x4;
|
|
239
|
+
__m256d sin_angular_distance_f64x4, cos_angular_distance_f64x4, angular_distance_f64x4;
|
|
240
|
+
__m256d sin_azimuth_f64x4, cos_squared_azimuth_f64x4, cos_double_angular_midpoint_f64x4;
|
|
231
241
|
|
|
232
242
|
// Track convergence and coincident points using masks
|
|
233
|
-
__m256d
|
|
234
|
-
__m256d
|
|
243
|
+
__m256d converged_mask_f64x4 = _mm256_setzero_pd();
|
|
244
|
+
__m256d coincident_mask_f64x4 = _mm256_setzero_pd();
|
|
235
245
|
|
|
236
246
|
for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS; ++iteration) {
|
|
237
247
|
// Check if all lanes converged
|
|
238
|
-
int converged_bits = _mm256_movemask_pd(
|
|
248
|
+
int converged_bits = _mm256_movemask_pd(converged_mask_f64x4);
|
|
239
249
|
if (converged_bits == 0xF) break;
|
|
240
250
|
|
|
241
|
-
__m256d
|
|
242
|
-
__m256d
|
|
251
|
+
__m256d sin_lambda_f64x4 = nk_sin_f64x4_haswell_(lambda_f64x4);
|
|
252
|
+
__m256d cos_lambda_f64x4 = nk_cos_f64x4_haswell_(lambda_f64x4);
|
|
243
253
|
|
|
244
|
-
// sin²(
|
|
245
|
-
__m256d
|
|
246
|
-
__m256d
|
|
247
|
-
_mm256_mul_pd(
|
|
248
|
-
_mm256_mul_pd(_mm256_mul_pd(
|
|
249
|
-
__m256d
|
|
250
|
-
|
|
254
|
+
// sin²(angular_distance_f64x4) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
|
|
255
|
+
__m256d cross_term_f64x4 = _mm256_mul_pd(cos_reduced_second_f64x4, sin_lambda_f64x4);
|
|
256
|
+
__m256d mixed_term_f64x4 = _mm256_sub_pd(
|
|
257
|
+
_mm256_mul_pd(cos_reduced_first_f64x4, sin_reduced_second_f64x4),
|
|
258
|
+
_mm256_mul_pd(_mm256_mul_pd(sin_reduced_first_f64x4, cos_reduced_second_f64x4), cos_lambda_f64x4));
|
|
259
|
+
__m256d sin_angular_dist_sq_f64x4 = _mm256_fmadd_pd(cross_term_f64x4, cross_term_f64x4,
|
|
260
|
+
_mm256_mul_pd(mixed_term_f64x4, mixed_term_f64x4));
|
|
261
|
+
sin_angular_distance_f64x4 = _mm256_sqrt_pd(sin_angular_dist_sq_f64x4);
|
|
251
262
|
|
|
252
|
-
// Check for coincident points (
|
|
253
|
-
|
|
263
|
+
// Check for coincident points (sin_angular_distance_f64x4 ≈ 0)
|
|
264
|
+
coincident_mask_f64x4 = _mm256_cmp_pd(sin_angular_distance_f64x4, epsilon_f64x4, _CMP_LT_OS);
|
|
254
265
|
|
|
255
|
-
// cos(
|
|
256
|
-
|
|
257
|
-
|
|
266
|
+
// cos(angular_distance_f64x4) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
|
|
267
|
+
cos_angular_distance_f64x4 = _mm256_fmadd_pd(_mm256_mul_pd(cos_reduced_first_f64x4, cos_reduced_second_f64x4),
|
|
268
|
+
cos_lambda_f64x4,
|
|
269
|
+
_mm256_mul_pd(sin_reduced_first_f64x4, sin_reduced_second_f64x4));
|
|
258
270
|
|
|
259
|
-
//
|
|
260
|
-
|
|
271
|
+
// angular_distance_f64x4 = atan2(sin, cos)
|
|
272
|
+
angular_distance_f64x4 = nk_atan2_f64x4_haswell_(sin_angular_distance_f64x4, cos_angular_distance_f64x4);
|
|
261
273
|
|
|
262
|
-
// sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(
|
|
274
|
+
// sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance_f64x4)
|
|
263
275
|
// Avoid division by zero by using blending
|
|
264
|
-
__m256d
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
276
|
+
__m256d safe_sin_angular_f64x4 = _mm256_blendv_pd(sin_angular_distance_f64x4, one_f64x4, coincident_mask_f64x4);
|
|
277
|
+
sin_azimuth_f64x4 = _mm256_div_pd(
|
|
278
|
+
_mm256_mul_pd(_mm256_mul_pd(cos_reduced_first_f64x4, cos_reduced_second_f64x4), sin_lambda_f64x4),
|
|
279
|
+
safe_sin_angular_f64x4);
|
|
280
|
+
cos_squared_azimuth_f64x4 = _mm256_sub_pd(one_f64x4, _mm256_mul_pd(sin_azimuth_f64x4, sin_azimuth_f64x4));
|
|
268
281
|
|
|
269
282
|
// Handle equatorial case: cos²α ≈ 0
|
|
270
|
-
__m256d
|
|
271
|
-
__m256d
|
|
283
|
+
__m256d equatorial_mask_f64x4 = _mm256_cmp_pd(cos_squared_azimuth_f64x4, epsilon_f64x4, _CMP_LT_OS);
|
|
284
|
+
__m256d safe_cos_sq_azimuth_f64x4 = _mm256_blendv_pd(cos_squared_azimuth_f64x4, one_f64x4,
|
|
285
|
+
equatorial_mask_f64x4);
|
|
272
286
|
|
|
273
287
|
// cos(2σₘ) = cos(σ) - 2 × sin(U₁) × sin(U₂) / cos²(α)
|
|
274
|
-
__m256d
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
288
|
+
__m256d sin_product_f64x4 = _mm256_mul_pd(sin_reduced_first_f64x4, sin_reduced_second_f64x4);
|
|
289
|
+
cos_double_angular_midpoint_f64x4 = _mm256_sub_pd(
|
|
290
|
+
cos_angular_distance_f64x4,
|
|
291
|
+
_mm256_div_pd(_mm256_mul_pd(two_f64x4, sin_product_f64x4), safe_cos_sq_azimuth_f64x4));
|
|
292
|
+
cos_double_angular_midpoint_f64x4 = _mm256_blendv_pd(cos_double_angular_midpoint_f64x4, _mm256_setzero_pd(),
|
|
293
|
+
equatorial_mask_f64x4);
|
|
279
294
|
|
|
280
295
|
// C = f/16 * cos²α * (4 + f*(4 - 3*cos²α))
|
|
281
|
-
__m256d
|
|
282
|
-
_mm256_div_pd(
|
|
283
|
-
_mm256_mul_pd(
|
|
284
|
-
|
|
296
|
+
__m256d correction_factor_f64x4 = _mm256_mul_pd(
|
|
297
|
+
_mm256_div_pd(flattening_f64x4, sixteen_f64x4),
|
|
298
|
+
_mm256_mul_pd(
|
|
299
|
+
cos_squared_azimuth_f64x4,
|
|
300
|
+
_mm256_fmadd_pd(flattening_f64x4, _mm256_fnmadd_pd(three_f64x4, cos_squared_azimuth_f64x4, four_f64x4),
|
|
301
|
+
four_f64x4)));
|
|
285
302
|
|
|
286
303
|
// λ' = L + (1-C) × f × sin(α) × (σ + C × sin(σ) × (cos(2σₘ) + C × cos(σ) × (-1 + 2 × cos²(2σₘ))))
|
|
287
|
-
__m256d
|
|
288
|
-
//
|
|
289
|
-
__m256d
|
|
290
|
-
//
|
|
291
|
-
__m256d
|
|
292
|
-
|
|
293
|
-
//
|
|
294
|
-
__m256d
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
304
|
+
__m256d cos_2sm_sq_f64x4 = _mm256_mul_pd(cos_double_angular_midpoint_f64x4, cos_double_angular_midpoint_f64x4);
|
|
305
|
+
// innermost_f64x4 = -1 + 2 × cos²(2σₘ)
|
|
306
|
+
__m256d innermost_f64x4 = _mm256_fmadd_pd(two_f64x4, cos_2sm_sq_f64x4, _mm256_set1_pd(-1.0));
|
|
307
|
+
// middle_f64x4 = cos(2σₘ) + C × cos(σ) × innermost_f64x4
|
|
308
|
+
__m256d middle_f64x4 = _mm256_fmadd_pd(_mm256_mul_pd(correction_factor_f64x4, cos_angular_distance_f64x4),
|
|
309
|
+
innermost_f64x4, cos_double_angular_midpoint_f64x4);
|
|
310
|
+
// inner_f64x4 = C × sin(σ) × middle_f64x4
|
|
311
|
+
__m256d inner_f64x4 = _mm256_mul_pd(_mm256_mul_pd(correction_factor_f64x4, sin_angular_distance_f64x4),
|
|
312
|
+
middle_f64x4);
|
|
313
|
+
|
|
314
|
+
// λ' = L + (1-C) * f * sin_α * (σ + inner_f64x4)
|
|
315
|
+
__m256d lambda_new_f64x4 = _mm256_fmadd_pd(
|
|
316
|
+
_mm256_mul_pd(_mm256_mul_pd(_mm256_sub_pd(one_f64x4, correction_factor_f64x4), flattening_f64x4),
|
|
317
|
+
sin_azimuth_f64x4),
|
|
318
|
+
_mm256_add_pd(angular_distance_f64x4, inner_f64x4), longitude_difference_f64x4);
|
|
300
319
|
|
|
301
320
|
// Check convergence: |λ - λ'| < threshold
|
|
302
|
-
__m256d
|
|
303
|
-
|
|
304
|
-
|
|
321
|
+
__m256d lambda_diff_abs_f64x4 = _mm256_andnot_pd(_mm256_set1_pd(-0.0),
|
|
322
|
+
_mm256_sub_pd(lambda_new_f64x4, lambda_f64x4));
|
|
323
|
+
__m256d newly_converged_f64x4 = _mm256_cmp_pd(lambda_diff_abs_f64x4, convergence_threshold_f64x4, _CMP_LT_OS);
|
|
324
|
+
converged_mask_f64x4 = _mm256_or_pd(converged_mask_f64x4, newly_converged_f64x4);
|
|
305
325
|
|
|
306
|
-
// Only update
|
|
307
|
-
|
|
326
|
+
// Only update lambda_f64x4 for non-converged lanes
|
|
327
|
+
lambda_f64x4 = _mm256_blendv_pd(lambda_new_f64x4, lambda_f64x4, converged_mask_f64x4);
|
|
308
328
|
}
|
|
309
329
|
|
|
310
330
|
// Final distance calculation
|
|
311
331
|
// u² = cos²α * (a² - b²) / b²
|
|
312
|
-
__m256d
|
|
313
|
-
__m256d
|
|
314
|
-
__m256d
|
|
332
|
+
__m256d a_sq_f64x4 = _mm256_mul_pd(equatorial_radius_f64x4, equatorial_radius_f64x4);
|
|
333
|
+
__m256d b_sq_f64x4 = _mm256_mul_pd(polar_radius_f64x4, polar_radius_f64x4);
|
|
334
|
+
__m256d u_squared_f64x4 = _mm256_div_pd(
|
|
335
|
+
_mm256_mul_pd(cos_squared_azimuth_f64x4, _mm256_sub_pd(a_sq_f64x4, b_sq_f64x4)), b_sq_f64x4);
|
|
315
336
|
|
|
316
337
|
// A = 1 + u²/16384 * (4096 + u²*(-768 + u²*(320 - 175*u²)))
|
|
317
|
-
__m256d
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
338
|
+
__m256d series_a_f64x4 = _mm256_fmadd_pd(u_squared_f64x4, _mm256_set1_pd(-175.0), _mm256_set1_pd(320.0));
|
|
339
|
+
series_a_f64x4 = _mm256_fmadd_pd(u_squared_f64x4, series_a_f64x4, _mm256_set1_pd(-768.0));
|
|
340
|
+
series_a_f64x4 = _mm256_fmadd_pd(u_squared_f64x4, series_a_f64x4, _mm256_set1_pd(4096.0));
|
|
341
|
+
series_a_f64x4 = _mm256_fmadd_pd(_mm256_div_pd(u_squared_f64x4, _mm256_set1_pd(16384.0)), series_a_f64x4,
|
|
342
|
+
one_f64x4);
|
|
321
343
|
|
|
322
344
|
// B = u²/1024 * (256 + u²*(-128 + u²*(74 - 47*u²)))
|
|
323
|
-
__m256d
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
345
|
+
__m256d series_b_f64x4 = _mm256_fmadd_pd(u_squared_f64x4, _mm256_set1_pd(-47.0), _mm256_set1_pd(74.0));
|
|
346
|
+
series_b_f64x4 = _mm256_fmadd_pd(u_squared_f64x4, series_b_f64x4, _mm256_set1_pd(-128.0));
|
|
347
|
+
series_b_f64x4 = _mm256_fmadd_pd(u_squared_f64x4, series_b_f64x4, _mm256_set1_pd(256.0));
|
|
348
|
+
series_b_f64x4 = _mm256_mul_pd(_mm256_div_pd(u_squared_f64x4, _mm256_set1_pd(1024.0)), series_b_f64x4);
|
|
327
349
|
|
|
328
350
|
// Δσ = B × sin(σ) × (cos(2σₘ) +
|
|
329
351
|
// B/4 × (cos(σ) × (-1 + 2 × cos²(2σₘ)) - B/6 × cos(2σₘ) × (-3 + 4 × sin²(σ)) × (-3 + 4 × cos²(2σₘ))))
|
|
330
|
-
__m256d
|
|
331
|
-
__m256d
|
|
332
|
-
__m256d
|
|
333
|
-
|
|
334
|
-
__m256d
|
|
335
|
-
__m256d
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
352
|
+
__m256d cos_2sm_sq_f64x4 = _mm256_mul_pd(cos_double_angular_midpoint_f64x4, cos_double_angular_midpoint_f64x4);
|
|
353
|
+
__m256d sin_sq_f64x4 = _mm256_mul_pd(sin_angular_distance_f64x4, sin_angular_distance_f64x4);
|
|
354
|
+
__m256d term1_f64x4 = _mm256_fmadd_pd(two_f64x4, cos_2sm_sq_f64x4, _mm256_set1_pd(-1.0));
|
|
355
|
+
term1_f64x4 = _mm256_mul_pd(cos_angular_distance_f64x4, term1_f64x4);
|
|
356
|
+
__m256d term2_f64x4 = _mm256_fmadd_pd(four_f64x4, sin_sq_f64x4, _mm256_set1_pd(-3.0));
|
|
357
|
+
__m256d term3_f64x4 = _mm256_fmadd_pd(four_f64x4, cos_2sm_sq_f64x4, _mm256_set1_pd(-3.0));
|
|
358
|
+
term2_f64x4 = _mm256_mul_pd(
|
|
359
|
+
_mm256_mul_pd(_mm256_div_pd(series_b_f64x4, six_f64x4), cos_double_angular_midpoint_f64x4),
|
|
360
|
+
_mm256_mul_pd(term2_f64x4, term3_f64x4));
|
|
361
|
+
__m256d delta_sigma_f64x4 = _mm256_mul_pd(
|
|
362
|
+
series_b_f64x4, _mm256_mul_pd(sin_angular_distance_f64x4,
|
|
363
|
+
_mm256_add_pd(cos_double_angular_midpoint_f64x4,
|
|
364
|
+
_mm256_mul_pd(_mm256_div_pd(series_b_f64x4, four_f64x4),
|
|
365
|
+
_mm256_sub_pd(term1_f64x4, term2_f64x4)))));
|
|
342
366
|
|
|
343
367
|
// s = b * A * (σ - Δσ)
|
|
344
|
-
__m256d
|
|
345
|
-
|
|
368
|
+
__m256d distances_f64x4 = _mm256_mul_pd(_mm256_mul_pd(polar_radius_f64x4, series_a_f64x4),
|
|
369
|
+
_mm256_sub_pd(angular_distance_f64x4, delta_sigma_f64x4));
|
|
346
370
|
|
|
347
371
|
// Set coincident points to zero
|
|
348
|
-
|
|
372
|
+
distances_f64x4 = _mm256_blendv_pd(distances_f64x4, _mm256_setzero_pd(), coincident_mask_f64x4);
|
|
349
373
|
|
|
350
|
-
return
|
|
374
|
+
return distances_f64x4;
|
|
351
375
|
}
|
|
352
376
|
|
|
353
377
|
NK_PUBLIC void nk_vincenty_f64_haswell( //
|
|
@@ -356,14 +380,14 @@ NK_PUBLIC void nk_vincenty_f64_haswell( //
|
|
|
356
380
|
nk_size_t n, nk_f64_t *results) {
|
|
357
381
|
|
|
358
382
|
while (n >= 4) {
|
|
359
|
-
__m256d
|
|
360
|
-
__m256d
|
|
361
|
-
__m256d
|
|
362
|
-
__m256d
|
|
383
|
+
__m256d first_latitudes_f64x4 = _mm256_loadu_pd(a_lats);
|
|
384
|
+
__m256d first_longitudes_f64x4 = _mm256_loadu_pd(a_lons);
|
|
385
|
+
__m256d second_latitudes_f64x4 = _mm256_loadu_pd(b_lats);
|
|
386
|
+
__m256d second_longitudes_f64x4 = _mm256_loadu_pd(b_lons);
|
|
363
387
|
|
|
364
|
-
__m256d
|
|
365
|
-
|
|
366
|
-
_mm256_storeu_pd(results,
|
|
388
|
+
__m256d distances_f64x4 = nk_vincenty_f64x4_haswell_(first_latitudes_f64x4, first_longitudes_f64x4,
|
|
389
|
+
second_latitudes_f64x4, second_longitudes_f64x4);
|
|
390
|
+
_mm256_storeu_pd(results, distances_f64x4);
|
|
367
391
|
|
|
368
392
|
a_lats += 4, a_lons += 4, b_lats += 4, b_lons += 4, results += 4, n -= 4;
|
|
369
393
|
}
|
|
@@ -375,9 +399,9 @@ NK_PUBLIC void nk_vincenty_f64_haswell( //
|
|
|
375
399
|
nk_partial_load_b64x4_haswell_(a_lons, &a_lon_vec, n);
|
|
376
400
|
nk_partial_load_b64x4_haswell_(b_lats, &b_lat_vec, n);
|
|
377
401
|
nk_partial_load_b64x4_haswell_(b_lons, &b_lon_vec, n);
|
|
378
|
-
__m256d
|
|
379
|
-
|
|
380
|
-
result_vec.ymm_pd =
|
|
402
|
+
__m256d distances_f64x4 = nk_vincenty_f64x4_haswell_(a_lat_vec.ymm_pd, a_lon_vec.ymm_pd, b_lat_vec.ymm_pd,
|
|
403
|
+
b_lon_vec.ymm_pd);
|
|
404
|
+
result_vec.ymm_pd = distances_f64x4;
|
|
381
405
|
nk_partial_store_b64x4_haswell_(&result_vec, results, n);
|
|
382
406
|
}
|
|
383
407
|
}
|
|
@@ -386,164 +410,180 @@ NK_PUBLIC void nk_vincenty_f64_haswell( //
|
|
|
386
410
|
* @brief AVX2 helper for Vincenty's geodesic distance on 8 f32 point pairs.
|
|
387
411
|
* @note This is a true SIMD implementation using masked convergence tracking via blending.
|
|
388
412
|
*/
|
|
389
|
-
NK_INTERNAL __m256 nk_vincenty_f32x8_haswell_(
|
|
390
|
-
__m256
|
|
391
|
-
__m256
|
|
392
|
-
|
|
393
|
-
__m256 const
|
|
394
|
-
__m256 const
|
|
395
|
-
__m256 const
|
|
396
|
-
__m256 const
|
|
397
|
-
__m256 const
|
|
398
|
-
__m256 const
|
|
399
|
-
__m256 const
|
|
400
|
-
__m256 const
|
|
401
|
-
__m256 const
|
|
402
|
-
__m256 const
|
|
403
|
-
__m256 const
|
|
413
|
+
NK_INTERNAL __m256 nk_vincenty_f32x8_haswell_( //
|
|
414
|
+
__m256 first_latitudes_f32x8, __m256 first_longitudes_f32x8, //
|
|
415
|
+
__m256 second_latitudes_f32x8, __m256 second_longitudes_f32x8) {
|
|
416
|
+
|
|
417
|
+
__m256 const equatorial_radius_f32x8 = _mm256_set1_ps((float)NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
|
|
418
|
+
__m256 const polar_radius_f32x8 = _mm256_set1_ps((float)NK_EARTH_ELLIPSOID_POLAR_RADIUS);
|
|
419
|
+
__m256 const flattening_f32x8 = _mm256_set1_ps(1.0f / (float)NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
|
|
420
|
+
__m256 const convergence_threshold_f32x8 = _mm256_set1_ps(NK_VINCENTY_CONVERGENCE_THRESHOLD_F32);
|
|
421
|
+
__m256 const one_f32x8 = _mm256_set1_ps(1.0f);
|
|
422
|
+
__m256 const two_f32x8 = _mm256_set1_ps(2.0f);
|
|
423
|
+
__m256 const three_f32x8 = _mm256_set1_ps(3.0f);
|
|
424
|
+
__m256 const four_f32x8 = _mm256_set1_ps(4.0f);
|
|
425
|
+
__m256 const six_f32x8 = _mm256_set1_ps(6.0f);
|
|
426
|
+
__m256 const sixteen_f32x8 = _mm256_set1_ps(16.0f);
|
|
427
|
+
__m256 const epsilon_f32x8 = _mm256_set1_ps(1e-7f);
|
|
404
428
|
|
|
405
429
|
// Longitude difference
|
|
406
|
-
__m256
|
|
430
|
+
__m256 longitude_difference_f32x8 = _mm256_sub_ps(second_longitudes_f32x8, first_longitudes_f32x8);
|
|
407
431
|
|
|
408
432
|
// Reduced latitudes: tan(U) = (1-f) * tan(lat)
|
|
409
|
-
__m256
|
|
410
|
-
__m256
|
|
411
|
-
|
|
412
|
-
__m256
|
|
413
|
-
|
|
433
|
+
__m256 one_minus_f_f32x8 = _mm256_sub_ps(one_f32x8, flattening_f32x8);
|
|
434
|
+
__m256 tan_first_f32x8 = _mm256_div_ps(nk_sin_f32x8_haswell_(first_latitudes_f32x8),
|
|
435
|
+
nk_cos_f32x8_haswell_(first_latitudes_f32x8));
|
|
436
|
+
__m256 tan_second_f32x8 = _mm256_div_ps(nk_sin_f32x8_haswell_(second_latitudes_f32x8),
|
|
437
|
+
nk_cos_f32x8_haswell_(second_latitudes_f32x8));
|
|
438
|
+
__m256 tan_reduced_first_f32x8 = _mm256_mul_ps(one_minus_f_f32x8, tan_first_f32x8);
|
|
439
|
+
__m256 tan_reduced_second_f32x8 = _mm256_mul_ps(one_minus_f_f32x8, tan_second_f32x8);
|
|
414
440
|
|
|
415
441
|
// cos(U) = 1/√(1 + tan²(U)), sin(U) = tan(U) × cos(U)
|
|
416
|
-
__m256
|
|
417
|
-
|
|
418
|
-
__m256
|
|
419
|
-
__m256
|
|
420
|
-
|
|
421
|
-
__m256
|
|
422
|
-
|
|
423
|
-
// Initialize
|
|
424
|
-
__m256
|
|
425
|
-
__m256
|
|
426
|
-
__m256
|
|
442
|
+
__m256 cos_reduced_first_f32x8 = _mm256_div_ps(
|
|
443
|
+
one_f32x8, _mm256_sqrt_ps(_mm256_fmadd_ps(tan_reduced_first_f32x8, tan_reduced_first_f32x8, one_f32x8)));
|
|
444
|
+
__m256 sin_reduced_first_f32x8 = _mm256_mul_ps(tan_reduced_first_f32x8, cos_reduced_first_f32x8);
|
|
445
|
+
__m256 cos_reduced_second_f32x8 = _mm256_div_ps(
|
|
446
|
+
one_f32x8, _mm256_sqrt_ps(_mm256_fmadd_ps(tan_reduced_second_f32x8, tan_reduced_second_f32x8, one_f32x8)));
|
|
447
|
+
__m256 sin_reduced_second_f32x8 = _mm256_mul_ps(tan_reduced_second_f32x8, cos_reduced_second_f32x8);
|
|
448
|
+
|
|
449
|
+
// Initialize lambda_f32x8 and tracking variables
|
|
450
|
+
__m256 lambda_f32x8 = longitude_difference_f32x8;
|
|
451
|
+
__m256 sin_angular_distance_f32x8, cos_angular_distance_f32x8, angular_distance_f32x8;
|
|
452
|
+
__m256 sin_azimuth_f32x8, cos_squared_azimuth_f32x8, cos_double_angular_midpoint_f32x8;
|
|
427
453
|
|
|
428
454
|
// Track convergence and coincident points using masks
|
|
429
|
-
__m256
|
|
430
|
-
__m256
|
|
455
|
+
__m256 converged_mask_f32x8 = _mm256_setzero_ps();
|
|
456
|
+
__m256 coincident_mask_f32x8 = _mm256_setzero_ps();
|
|
431
457
|
|
|
432
458
|
for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS; ++iteration) {
|
|
433
459
|
// Check if all lanes converged
|
|
434
|
-
int converged_bits = _mm256_movemask_ps(
|
|
460
|
+
int converged_bits = _mm256_movemask_ps(converged_mask_f32x8);
|
|
435
461
|
if (converged_bits == 0xFF) break;
|
|
436
462
|
|
|
437
|
-
__m256
|
|
438
|
-
__m256
|
|
463
|
+
__m256 sin_lambda_f32x8 = nk_sin_f32x8_haswell_(lambda_f32x8);
|
|
464
|
+
__m256 cos_lambda_f32x8 = nk_cos_f32x8_haswell_(lambda_f32x8);
|
|
439
465
|
|
|
440
|
-
// sin²(
|
|
441
|
-
__m256
|
|
442
|
-
__m256
|
|
443
|
-
_mm256_mul_ps(
|
|
444
|
-
_mm256_mul_ps(_mm256_mul_ps(
|
|
445
|
-
__m256
|
|
446
|
-
|
|
466
|
+
// sin²(angular_distance_f32x8) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
|
|
467
|
+
__m256 cross_term_f32x8 = _mm256_mul_ps(cos_reduced_second_f32x8, sin_lambda_f32x8);
|
|
468
|
+
__m256 mixed_term_f32x8 = _mm256_sub_ps(
|
|
469
|
+
_mm256_mul_ps(cos_reduced_first_f32x8, sin_reduced_second_f32x8),
|
|
470
|
+
_mm256_mul_ps(_mm256_mul_ps(sin_reduced_first_f32x8, cos_reduced_second_f32x8), cos_lambda_f32x8));
|
|
471
|
+
__m256 sin_angular_dist_sq_f32x8 = _mm256_fmadd_ps(cross_term_f32x8, cross_term_f32x8,
|
|
472
|
+
_mm256_mul_ps(mixed_term_f32x8, mixed_term_f32x8));
|
|
473
|
+
sin_angular_distance_f32x8 = _mm256_sqrt_ps(sin_angular_dist_sq_f32x8);
|
|
447
474
|
|
|
448
|
-
// Check for coincident points (
|
|
449
|
-
|
|
475
|
+
// Check for coincident points (sin_angular_distance_f32x8 ≈ 0)
|
|
476
|
+
coincident_mask_f32x8 = _mm256_cmp_ps(sin_angular_distance_f32x8, epsilon_f32x8, _CMP_LT_OS);
|
|
450
477
|
|
|
451
|
-
// cos(
|
|
452
|
-
|
|
453
|
-
|
|
478
|
+
// cos(angular_distance_f32x8) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
|
|
479
|
+
cos_angular_distance_f32x8 = _mm256_fmadd_ps(_mm256_mul_ps(cos_reduced_first_f32x8, cos_reduced_second_f32x8),
|
|
480
|
+
cos_lambda_f32x8,
|
|
481
|
+
_mm256_mul_ps(sin_reduced_first_f32x8, sin_reduced_second_f32x8));
|
|
454
482
|
|
|
455
|
-
//
|
|
456
|
-
|
|
483
|
+
// angular_distance_f32x8 = atan2(sin, cos)
|
|
484
|
+
angular_distance_f32x8 = nk_atan2_f32x8_haswell_(sin_angular_distance_f32x8, cos_angular_distance_f32x8);
|
|
457
485
|
|
|
458
|
-
// sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(
|
|
486
|
+
// sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance_f32x8)
|
|
459
487
|
// Avoid division by zero by using blending
|
|
460
|
-
__m256
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
488
|
+
__m256 safe_sin_angular_f32x8 = _mm256_blendv_ps(sin_angular_distance_f32x8, one_f32x8, coincident_mask_f32x8);
|
|
489
|
+
sin_azimuth_f32x8 = _mm256_div_ps(
|
|
490
|
+
_mm256_mul_ps(_mm256_mul_ps(cos_reduced_first_f32x8, cos_reduced_second_f32x8), sin_lambda_f32x8),
|
|
491
|
+
safe_sin_angular_f32x8);
|
|
492
|
+
cos_squared_azimuth_f32x8 = _mm256_sub_ps(one_f32x8, _mm256_mul_ps(sin_azimuth_f32x8, sin_azimuth_f32x8));
|
|
464
493
|
|
|
465
494
|
// Handle equatorial case: cos²α ≈ 0
|
|
466
|
-
__m256
|
|
467
|
-
__m256
|
|
495
|
+
__m256 equatorial_mask_f32x8 = _mm256_cmp_ps(cos_squared_azimuth_f32x8, epsilon_f32x8, _CMP_LT_OS);
|
|
496
|
+
__m256 safe_cos_sq_azimuth_f32x8 = _mm256_blendv_ps(cos_squared_azimuth_f32x8, one_f32x8,
|
|
497
|
+
equatorial_mask_f32x8);
|
|
468
498
|
|
|
469
499
|
// cos(2σₘ) = cos(σ) - 2 × sin(U₁) × sin(U₂) / cos²(α)
|
|
470
|
-
__m256
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
500
|
+
__m256 sin_product_f32x8 = _mm256_mul_ps(sin_reduced_first_f32x8, sin_reduced_second_f32x8);
|
|
501
|
+
cos_double_angular_midpoint_f32x8 = _mm256_sub_ps(
|
|
502
|
+
cos_angular_distance_f32x8,
|
|
503
|
+
_mm256_div_ps(_mm256_mul_ps(two_f32x8, sin_product_f32x8), safe_cos_sq_azimuth_f32x8));
|
|
504
|
+
cos_double_angular_midpoint_f32x8 = _mm256_blendv_ps(cos_double_angular_midpoint_f32x8, _mm256_setzero_ps(),
|
|
505
|
+
equatorial_mask_f32x8);
|
|
475
506
|
|
|
476
507
|
// C = f/16 * cos²α * (4 + f*(4 - 3*cos²α))
|
|
477
|
-
__m256
|
|
478
|
-
_mm256_div_ps(
|
|
479
|
-
_mm256_mul_ps(
|
|
480
|
-
|
|
508
|
+
__m256 correction_factor_f32x8 = _mm256_mul_ps(
|
|
509
|
+
_mm256_div_ps(flattening_f32x8, sixteen_f32x8),
|
|
510
|
+
_mm256_mul_ps(
|
|
511
|
+
cos_squared_azimuth_f32x8,
|
|
512
|
+
_mm256_fmadd_ps(flattening_f32x8, _mm256_fnmadd_ps(three_f32x8, cos_squared_azimuth_f32x8, four_f32x8),
|
|
513
|
+
four_f32x8)));
|
|
481
514
|
|
|
482
515
|
// λ' = L + (1-C) × f × sin(α) × (σ + C × sin(σ) × (cos(2σₘ) + C × cos(σ) × (-1 + 2 × cos²(2σₘ))))
|
|
483
|
-
__m256
|
|
484
|
-
//
|
|
485
|
-
__m256
|
|
486
|
-
//
|
|
487
|
-
__m256
|
|
488
|
-
|
|
489
|
-
//
|
|
490
|
-
__m256
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
516
|
+
__m256 cos_2sm_sq_f32x8 = _mm256_mul_ps(cos_double_angular_midpoint_f32x8, cos_double_angular_midpoint_f32x8);
|
|
517
|
+
// innermost_f32x8 = -1 + 2 × cos²(2σₘ)
|
|
518
|
+
__m256 innermost_f32x8 = _mm256_fmadd_ps(two_f32x8, cos_2sm_sq_f32x8, _mm256_set1_ps(-1.0f));
|
|
519
|
+
// middle_f32x8 = cos(2σₘ) + C × cos(σ) × innermost_f32x8
|
|
520
|
+
__m256 middle_f32x8 = _mm256_fmadd_ps(_mm256_mul_ps(correction_factor_f32x8, cos_angular_distance_f32x8),
|
|
521
|
+
innermost_f32x8, cos_double_angular_midpoint_f32x8);
|
|
522
|
+
// inner_f32x8 = C × sin(σ) × middle_f32x8
|
|
523
|
+
__m256 inner_f32x8 = _mm256_mul_ps(_mm256_mul_ps(correction_factor_f32x8, sin_angular_distance_f32x8),
|
|
524
|
+
middle_f32x8);
|
|
525
|
+
|
|
526
|
+
// λ' = L + (1-C) * f * sin_α * (σ + inner_f32x8)
|
|
527
|
+
__m256 lambda_new_f32x8 = _mm256_fmadd_ps(
|
|
528
|
+
_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(one_f32x8, correction_factor_f32x8), flattening_f32x8),
|
|
529
|
+
sin_azimuth_f32x8),
|
|
530
|
+
_mm256_add_ps(angular_distance_f32x8, inner_f32x8), longitude_difference_f32x8);
|
|
496
531
|
|
|
497
532
|
// Check convergence: |λ - λ'| < threshold
|
|
498
|
-
__m256
|
|
499
|
-
|
|
500
|
-
|
|
533
|
+
__m256 lambda_diff_abs_f32x8 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f),
|
|
534
|
+
_mm256_sub_ps(lambda_new_f32x8, lambda_f32x8));
|
|
535
|
+
__m256 newly_converged_f32x8 = _mm256_cmp_ps(lambda_diff_abs_f32x8, convergence_threshold_f32x8, _CMP_LT_OS);
|
|
536
|
+
converged_mask_f32x8 = _mm256_or_ps(converged_mask_f32x8, newly_converged_f32x8);
|
|
501
537
|
|
|
502
|
-
// Only update
|
|
503
|
-
|
|
538
|
+
// Only update lambda_f32x8 for non-converged lanes
|
|
539
|
+
lambda_f32x8 = _mm256_blendv_ps(lambda_new_f32x8, lambda_f32x8, converged_mask_f32x8);
|
|
504
540
|
}
|
|
505
541
|
|
|
506
542
|
// Final distance calculation
|
|
507
543
|
// u² = cos²α * (a² - b²) / b²
|
|
508
|
-
__m256
|
|
509
|
-
__m256
|
|
510
|
-
__m256
|
|
544
|
+
__m256 a_sq_f32x8 = _mm256_mul_ps(equatorial_radius_f32x8, equatorial_radius_f32x8);
|
|
545
|
+
__m256 b_sq_f32x8 = _mm256_mul_ps(polar_radius_f32x8, polar_radius_f32x8);
|
|
546
|
+
__m256 u_squared_f32x8 = _mm256_div_ps(
|
|
547
|
+
_mm256_mul_ps(cos_squared_azimuth_f32x8, _mm256_sub_ps(a_sq_f32x8, b_sq_f32x8)), b_sq_f32x8);
|
|
511
548
|
|
|
512
549
|
// A = 1 + u²/16384 * (4096 + u²*(-768 + u²*(320 - 175*u²)))
|
|
513
|
-
__m256
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
550
|
+
__m256 series_a_f32x8 = _mm256_fmadd_ps(u_squared_f32x8, _mm256_set1_ps(-175.0f), _mm256_set1_ps(320.0f));
|
|
551
|
+
series_a_f32x8 = _mm256_fmadd_ps(u_squared_f32x8, series_a_f32x8, _mm256_set1_ps(-768.0f));
|
|
552
|
+
series_a_f32x8 = _mm256_fmadd_ps(u_squared_f32x8, series_a_f32x8, _mm256_set1_ps(4096.0f));
|
|
553
|
+
series_a_f32x8 = _mm256_fmadd_ps(_mm256_div_ps(u_squared_f32x8, _mm256_set1_ps(16384.0f)), series_a_f32x8,
|
|
554
|
+
one_f32x8);
|
|
517
555
|
|
|
518
556
|
// B = u²/1024 * (256 + u²*(-128 + u²*(74 - 47*u²)))
|
|
519
|
-
__m256
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
557
|
+
__m256 series_b_f32x8 = _mm256_fmadd_ps(u_squared_f32x8, _mm256_set1_ps(-47.0f), _mm256_set1_ps(74.0f));
|
|
558
|
+
series_b_f32x8 = _mm256_fmadd_ps(u_squared_f32x8, series_b_f32x8, _mm256_set1_ps(-128.0f));
|
|
559
|
+
series_b_f32x8 = _mm256_fmadd_ps(u_squared_f32x8, series_b_f32x8, _mm256_set1_ps(256.0f));
|
|
560
|
+
series_b_f32x8 = _mm256_mul_ps(_mm256_div_ps(u_squared_f32x8, _mm256_set1_ps(1024.0f)), series_b_f32x8);
|
|
523
561
|
|
|
524
562
|
// Δσ = B × sin(σ) × (cos(2σₘ) +
|
|
525
563
|
// B/4 × (cos(σ) × (-1 + 2 × cos²(2σₘ)) - B/6 × cos(2σₘ) × (-3 + 4 × sin²(σ)) × (-3 + 4 × cos²(2σₘ))))
|
|
526
|
-
__m256
|
|
527
|
-
__m256
|
|
528
|
-
__m256
|
|
529
|
-
|
|
530
|
-
__m256
|
|
531
|
-
__m256
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
564
|
+
__m256 cos_2sm_sq_f32x8 = _mm256_mul_ps(cos_double_angular_midpoint_f32x8, cos_double_angular_midpoint_f32x8);
|
|
565
|
+
__m256 sin_sq_f32x8 = _mm256_mul_ps(sin_angular_distance_f32x8, sin_angular_distance_f32x8);
|
|
566
|
+
__m256 term1_f32x8 = _mm256_fmadd_ps(two_f32x8, cos_2sm_sq_f32x8, _mm256_set1_ps(-1.0f));
|
|
567
|
+
term1_f32x8 = _mm256_mul_ps(cos_angular_distance_f32x8, term1_f32x8);
|
|
568
|
+
__m256 term2_f32x8 = _mm256_fmadd_ps(four_f32x8, sin_sq_f32x8, _mm256_set1_ps(-3.0f));
|
|
569
|
+
__m256 term3_f32x8 = _mm256_fmadd_ps(four_f32x8, cos_2sm_sq_f32x8, _mm256_set1_ps(-3.0f));
|
|
570
|
+
term2_f32x8 = _mm256_mul_ps(
|
|
571
|
+
_mm256_mul_ps(_mm256_div_ps(series_b_f32x8, six_f32x8), cos_double_angular_midpoint_f32x8),
|
|
572
|
+
_mm256_mul_ps(term2_f32x8, term3_f32x8));
|
|
573
|
+
__m256 delta_sigma_f32x8 = _mm256_mul_ps(
|
|
574
|
+
series_b_f32x8, _mm256_mul_ps(sin_angular_distance_f32x8,
|
|
575
|
+
_mm256_add_ps(cos_double_angular_midpoint_f32x8,
|
|
576
|
+
_mm256_mul_ps(_mm256_div_ps(series_b_f32x8, four_f32x8),
|
|
577
|
+
_mm256_sub_ps(term1_f32x8, term2_f32x8)))));
|
|
538
578
|
|
|
539
579
|
// s = b * A * (σ - Δσ)
|
|
540
|
-
__m256
|
|
541
|
-
|
|
580
|
+
__m256 distances_f32x8 = _mm256_mul_ps(_mm256_mul_ps(polar_radius_f32x8, series_a_f32x8),
|
|
581
|
+
_mm256_sub_ps(angular_distance_f32x8, delta_sigma_f32x8));
|
|
542
582
|
|
|
543
583
|
// Set coincident points to zero
|
|
544
|
-
|
|
584
|
+
distances_f32x8 = _mm256_blendv_ps(distances_f32x8, _mm256_setzero_ps(), coincident_mask_f32x8);
|
|
545
585
|
|
|
546
|
-
return
|
|
586
|
+
return distances_f32x8;
|
|
547
587
|
}
|
|
548
588
|
|
|
549
589
|
NK_PUBLIC void nk_vincenty_f32_haswell( //
|
|
@@ -552,14 +592,14 @@ NK_PUBLIC void nk_vincenty_f32_haswell( //
|
|
|
552
592
|
nk_size_t n, nk_f32_t *results) {
|
|
553
593
|
|
|
554
594
|
while (n >= 8) {
|
|
555
|
-
__m256
|
|
556
|
-
__m256
|
|
557
|
-
__m256
|
|
558
|
-
__m256
|
|
595
|
+
__m256 first_latitudes_f32x8 = _mm256_loadu_ps(a_lats);
|
|
596
|
+
__m256 first_longitudes_f32x8 = _mm256_loadu_ps(a_lons);
|
|
597
|
+
__m256 second_latitudes_f32x8 = _mm256_loadu_ps(b_lats);
|
|
598
|
+
__m256 second_longitudes_f32x8 = _mm256_loadu_ps(b_lons);
|
|
559
599
|
|
|
560
|
-
__m256
|
|
561
|
-
|
|
562
|
-
_mm256_storeu_ps(results,
|
|
600
|
+
__m256 distances_f32x8 = nk_vincenty_f32x8_haswell_(first_latitudes_f32x8, first_longitudes_f32x8,
|
|
601
|
+
second_latitudes_f32x8, second_longitudes_f32x8);
|
|
602
|
+
_mm256_storeu_ps(results, distances_f32x8);
|
|
563
603
|
|
|
564
604
|
a_lats += 8, a_lons += 8, b_lats += 8, b_lons += 8, results += 8, n -= 8;
|
|
565
605
|
}
|
|
@@ -571,9 +611,9 @@ NK_PUBLIC void nk_vincenty_f32_haswell( //
|
|
|
571
611
|
nk_partial_load_b32x8_serial_(a_lons, &a_lon_vec, n);
|
|
572
612
|
nk_partial_load_b32x8_serial_(b_lats, &b_lat_vec, n);
|
|
573
613
|
nk_partial_load_b32x8_serial_(b_lons, &b_lon_vec, n);
|
|
574
|
-
__m256
|
|
575
|
-
|
|
576
|
-
result_vec.ymm_ps =
|
|
614
|
+
__m256 distances_f32x8 = nk_vincenty_f32x8_haswell_(a_lat_vec.ymm_ps, a_lon_vec.ymm_ps, b_lat_vec.ymm_ps,
|
|
615
|
+
b_lon_vec.ymm_ps);
|
|
616
|
+
result_vec.ymm_ps = distances_f32x8;
|
|
577
617
|
nk_partial_store_b32x8_serial_(&result_vec, results, n);
|
|
578
618
|
}
|
|
579
619
|
}
|