numkong 7.0.0 → 7.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +197 -124
- package/binding.gyp +34 -484
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
|
@@ -8,13 +8,14 @@
|
|
|
8
8
|
*
|
|
9
9
|
* @section geospatial_skylake_instructions Key AVX-512 Geospatial Instructions
|
|
10
10
|
*
|
|
11
|
-
* Intrinsic
|
|
12
|
-
* _mm512_sqrt_ps
|
|
13
|
-
* _mm512_sqrt_pd
|
|
14
|
-
* _mm256_div_ps
|
|
15
|
-
* _mm256_div_pd
|
|
16
|
-
* _mm256_fmadd_ps
|
|
17
|
-
* _mm256_fmadd_pd
|
|
11
|
+
* Intrinsic Instruction Icelake Genoa
|
|
12
|
+
* _mm512_sqrt_ps VSQRTPS (ZMM, ZMM) 19cy @ p0+p0+p05 15cy @ p01
|
|
13
|
+
* _mm512_sqrt_pd VSQRTPD (ZMM, ZMM) 23cy @ p0+p0+p05 21cy @ p01
|
|
14
|
+
* _mm256_div_ps VDIVPS (YMM, YMM, YMM) 11cy @ p0 11cy @ p01
|
|
15
|
+
* _mm256_div_pd VDIVPD (YMM, YMM, YMM) 13cy @ p0 13cy @ p01
|
|
16
|
+
* _mm256_fmadd_ps VFMADD231PS (YMM, YMM, YMM) 4cy @ p01 4cy @ p01
|
|
17
|
+
* _mm256_fmadd_pd VFMADD231PD (YMM, YMM, YMM) 4cy @ p01 4cy @ p01
|
|
18
|
+
* _mm512_cmp_ps_mask VCMPPS (K, ZMM, ZMM, I8) 4cy @ p5 5cy @ p01
|
|
18
19
|
*/
|
|
19
20
|
#ifndef NK_GEOSPATIAL_SKYLAKE_H
|
|
20
21
|
#define NK_GEOSPATIAL_SKYLAKE_H
|
|
@@ -37,44 +38,48 @@ extern "C" {
|
|
|
37
38
|
#pragma GCC target("avx2", "avx512f", "avx512vl", "avx512bw", "avx512dq", "f16c", "fma", "bmi", "bmi2")
|
|
38
39
|
#endif
|
|
39
40
|
|
|
40
|
-
NK_INTERNAL __m512d nk_haversine_f64x8_skylake_(
|
|
41
|
-
__m512d
|
|
42
|
-
__m512d
|
|
41
|
+
NK_INTERNAL __m512d nk_haversine_f64x8_skylake_( //
|
|
42
|
+
__m512d first_latitudes_f64x8, __m512d first_longitudes_f64x8, //
|
|
43
|
+
__m512d second_latitudes_f64x8, __m512d second_longitudes_f64x8) {
|
|
43
44
|
|
|
44
|
-
__m512d const
|
|
45
|
-
__m512d const
|
|
46
|
-
__m512d const
|
|
47
|
-
__m512d const
|
|
45
|
+
__m512d const earth_radius_f64x8 = _mm512_set1_pd(NK_EARTH_MEDIATORIAL_RADIUS);
|
|
46
|
+
__m512d const half_f64x8 = _mm512_set1_pd(0.5);
|
|
47
|
+
__m512d const one_f64x8 = _mm512_set1_pd(1.0);
|
|
48
|
+
__m512d const two_f64x8 = _mm512_set1_pd(2.0);
|
|
48
49
|
|
|
49
|
-
__m512d
|
|
50
|
-
__m512d
|
|
50
|
+
__m512d latitude_delta_f64x8 = _mm512_sub_pd(second_latitudes_f64x8, first_latitudes_f64x8);
|
|
51
|
+
__m512d longitude_delta_f64x8 = _mm512_sub_pd(second_longitudes_f64x8, first_longitudes_f64x8);
|
|
51
52
|
|
|
52
53
|
// Haversine terms: sin²(Δ/2)
|
|
53
|
-
__m512d
|
|
54
|
-
__m512d
|
|
55
|
-
__m512d
|
|
56
|
-
__m512d
|
|
57
|
-
__m512d
|
|
58
|
-
|
|
54
|
+
__m512d latitude_delta_half_f64x8 = _mm512_mul_pd(latitude_delta_f64x8, half_f64x8);
|
|
55
|
+
__m512d longitude_delta_half_f64x8 = _mm512_mul_pd(longitude_delta_f64x8, half_f64x8);
|
|
56
|
+
__m512d sin_latitude_delta_half_f64x8 = nk_sin_f64x8_skylake_(latitude_delta_half_f64x8);
|
|
57
|
+
__m512d sin_longitude_delta_half_f64x8 = nk_sin_f64x8_skylake_(longitude_delta_half_f64x8);
|
|
58
|
+
__m512d sin_squared_latitude_delta_half_f64x8 = _mm512_mul_pd(sin_latitude_delta_half_f64x8,
|
|
59
|
+
sin_latitude_delta_half_f64x8);
|
|
60
|
+
__m512d sin_squared_longitude_delta_half_f64x8 = _mm512_mul_pd(sin_longitude_delta_half_f64x8,
|
|
61
|
+
sin_longitude_delta_half_f64x8);
|
|
59
62
|
|
|
60
63
|
// Latitude cosine product
|
|
61
|
-
__m512d
|
|
62
|
-
__m512d
|
|
63
|
-
__m512d
|
|
64
|
+
__m512d cos_first_latitude_f64x8 = nk_cos_f64x8_skylake_(first_latitudes_f64x8);
|
|
65
|
+
__m512d cos_second_latitude_f64x8 = nk_cos_f64x8_skylake_(second_latitudes_f64x8);
|
|
66
|
+
__m512d cos_latitude_product_f64x8 = _mm512_mul_pd(cos_first_latitude_f64x8, cos_second_latitude_f64x8);
|
|
64
67
|
|
|
65
68
|
// a = sin²(Δlat/2) + cos(lat1) × cos(lat2) × sin²(Δlon/2)
|
|
66
|
-
__m512d
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
69
|
+
__m512d haversine_term_f64x8 = _mm512_add_pd(
|
|
70
|
+
sin_squared_latitude_delta_half_f64x8,
|
|
71
|
+
_mm512_mul_pd(cos_latitude_product_f64x8, sin_squared_longitude_delta_half_f64x8));
|
|
72
|
+
// Clamp haversine_term_f64x8 to [0, 1] to prevent NaN from sqrt of negative values
|
|
73
|
+
__m512d zero_f64x8 = _mm512_setzero_pd();
|
|
74
|
+
haversine_term_f64x8 = _mm512_max_pd(zero_f64x8, _mm512_min_pd(one_f64x8, haversine_term_f64x8));
|
|
71
75
|
|
|
72
76
|
// Central angle: c = 2 × atan2(√a, √(1-a))
|
|
73
|
-
__m512d
|
|
74
|
-
__m512d
|
|
75
|
-
__m512d
|
|
77
|
+
__m512d sqrt_haversine_f64x8 = _mm512_sqrt_pd(haversine_term_f64x8);
|
|
78
|
+
__m512d sqrt_complement_f64x8 = _mm512_sqrt_pd(_mm512_sub_pd(one_f64x8, haversine_term_f64x8));
|
|
79
|
+
__m512d central_angle_f64x8 = _mm512_mul_pd(two_f64x8,
|
|
80
|
+
nk_atan2_f64x8_skylake_(sqrt_haversine_f64x8, sqrt_complement_f64x8));
|
|
76
81
|
|
|
77
|
-
return _mm512_mul_pd(
|
|
82
|
+
return _mm512_mul_pd(earth_radius_f64x8, central_angle_f64x8);
|
|
78
83
|
}
|
|
79
84
|
|
|
80
85
|
NK_PUBLIC void nk_haversine_f64_skylake( //
|
|
@@ -83,14 +88,14 @@ NK_PUBLIC void nk_haversine_f64_skylake( //
|
|
|
83
88
|
nk_size_t n, nk_f64_t *results) {
|
|
84
89
|
|
|
85
90
|
while (n >= 8) {
|
|
86
|
-
__m512d
|
|
87
|
-
__m512d
|
|
88
|
-
__m512d
|
|
89
|
-
__m512d
|
|
91
|
+
__m512d first_latitudes_f64x8 = _mm512_loadu_pd(a_lats);
|
|
92
|
+
__m512d first_longitudes_f64x8 = _mm512_loadu_pd(a_lons);
|
|
93
|
+
__m512d second_latitudes_f64x8 = _mm512_loadu_pd(b_lats);
|
|
94
|
+
__m512d second_longitudes_f64x8 = _mm512_loadu_pd(b_lons);
|
|
90
95
|
|
|
91
|
-
__m512d
|
|
92
|
-
|
|
93
|
-
_mm512_storeu_pd(results,
|
|
96
|
+
__m512d distances_f64x8 = nk_haversine_f64x8_skylake_(first_latitudes_f64x8, first_longitudes_f64x8,
|
|
97
|
+
second_latitudes_f64x8, second_longitudes_f64x8);
|
|
98
|
+
_mm512_storeu_pd(results, distances_f64x8);
|
|
94
99
|
|
|
95
100
|
a_lats += 8, a_lons += 8, b_lats += 8, b_lons += 8, results += 8, n -= 8;
|
|
96
101
|
}
|
|
@@ -98,14 +103,14 @@ NK_PUBLIC void nk_haversine_f64_skylake( //
|
|
|
98
103
|
// Handle remaining elements with masked operations
|
|
99
104
|
if (n > 0) {
|
|
100
105
|
__mmask8 mask = (__mmask8)_bzhi_u32(0xFF, n);
|
|
101
|
-
__m512d
|
|
102
|
-
__m512d
|
|
103
|
-
__m512d
|
|
104
|
-
__m512d
|
|
105
|
-
|
|
106
|
-
__m512d
|
|
107
|
-
|
|
108
|
-
_mm512_mask_storeu_pd(results, mask,
|
|
106
|
+
__m512d first_latitudes_f64x8 = _mm512_maskz_loadu_pd(mask, a_lats);
|
|
107
|
+
__m512d first_longitudes_f64x8 = _mm512_maskz_loadu_pd(mask, a_lons);
|
|
108
|
+
__m512d second_latitudes_f64x8 = _mm512_maskz_loadu_pd(mask, b_lats);
|
|
109
|
+
__m512d second_longitudes_f64x8 = _mm512_maskz_loadu_pd(mask, b_lons);
|
|
110
|
+
|
|
111
|
+
__m512d distances_f64x8 = nk_haversine_f64x8_skylake_(first_latitudes_f64x8, first_longitudes_f64x8,
|
|
112
|
+
second_latitudes_f64x8, second_longitudes_f64x8);
|
|
113
|
+
_mm512_mask_storeu_pd(results, mask, distances_f64x8);
|
|
109
114
|
}
|
|
110
115
|
}
|
|
111
116
|
|
|
@@ -113,158 +118,171 @@ NK_PUBLIC void nk_haversine_f64_skylake( //
|
|
|
113
118
|
* @brief AVX-512 helper for Vincenty's geodesic distance on 8 f64 point pairs.
|
|
114
119
|
* @note This is a true SIMD implementation using masked convergence tracking.
|
|
115
120
|
*/
|
|
116
|
-
NK_INTERNAL __m512d nk_vincenty_f64x8_skylake_(
|
|
117
|
-
__m512d
|
|
118
|
-
__m512d
|
|
119
|
-
|
|
120
|
-
__m512d const
|
|
121
|
-
__m512d const
|
|
122
|
-
__m512d const
|
|
123
|
-
__m512d const
|
|
124
|
-
__m512d const
|
|
125
|
-
__m512d const
|
|
126
|
-
__m512d const
|
|
127
|
-
__m512d const
|
|
128
|
-
__m512d const
|
|
129
|
-
__m512d const
|
|
121
|
+
NK_INTERNAL __m512d nk_vincenty_f64x8_skylake_( //
|
|
122
|
+
__m512d first_latitudes_f64x8, __m512d first_longitudes_f64x8, //
|
|
123
|
+
__m512d second_latitudes_f64x8, __m512d second_longitudes_f64x8) {
|
|
124
|
+
|
|
125
|
+
__m512d const equatorial_radius_f64x8 = _mm512_set1_pd(NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
|
|
126
|
+
__m512d const polar_radius_f64x8 = _mm512_set1_pd(NK_EARTH_ELLIPSOID_POLAR_RADIUS);
|
|
127
|
+
__m512d const flattening_f64x8 = _mm512_set1_pd(1.0 / NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
|
|
128
|
+
__m512d const convergence_threshold_f64x8 = _mm512_set1_pd(NK_VINCENTY_CONVERGENCE_THRESHOLD_F64);
|
|
129
|
+
__m512d const one_f64x8 = _mm512_set1_pd(1.0);
|
|
130
|
+
__m512d const two_f64x8 = _mm512_set1_pd(2.0);
|
|
131
|
+
__m512d const three_f64x8 = _mm512_set1_pd(3.0);
|
|
132
|
+
__m512d const four_f64x8 = _mm512_set1_pd(4.0);
|
|
133
|
+
__m512d const six_f64x8 = _mm512_set1_pd(6.0);
|
|
134
|
+
__m512d const sixteen_f64x8 = _mm512_set1_pd(16.0);
|
|
130
135
|
|
|
131
136
|
// Longitude difference
|
|
132
|
-
__m512d
|
|
137
|
+
__m512d longitude_difference_f64x8 = _mm512_sub_pd(second_longitudes_f64x8, first_longitudes_f64x8);
|
|
133
138
|
|
|
134
139
|
// Reduced latitudes: tan(U) = (1-f) * tan(lat)
|
|
135
|
-
__m512d
|
|
136
|
-
__m512d
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
__m512d
|
|
140
|
+
__m512d one_minus_f_f64x8 = _mm512_sub_pd(one_f64x8, flattening_f64x8);
|
|
141
|
+
__m512d tan_first_f64x8 = _mm512_div_pd(nk_sin_f64x8_skylake_(first_latitudes_f64x8),
|
|
142
|
+
nk_cos_f64x8_skylake_(first_latitudes_f64x8));
|
|
143
|
+
__m512d tan_second_f64x8 = _mm512_div_pd(nk_sin_f64x8_skylake_(second_latitudes_f64x8),
|
|
144
|
+
nk_cos_f64x8_skylake_(second_latitudes_f64x8));
|
|
145
|
+
__m512d tan_reduced_first_f64x8 = _mm512_mul_pd(one_minus_f_f64x8, tan_first_f64x8);
|
|
146
|
+
__m512d tan_reduced_second_f64x8 = _mm512_mul_pd(one_minus_f_f64x8, tan_second_f64x8);
|
|
141
147
|
|
|
142
148
|
// cos(U) = 1/√(1 + tan²(U)), sin(U) = tan(U) × cos(U)
|
|
143
|
-
__m512d
|
|
144
|
-
|
|
145
|
-
__m512d
|
|
146
|
-
__m512d
|
|
147
|
-
|
|
148
|
-
__m512d
|
|
149
|
-
|
|
150
|
-
// Initialize
|
|
151
|
-
__m512d
|
|
152
|
-
__m512d
|
|
153
|
-
__m512d
|
|
149
|
+
__m512d cos_reduced_first_f64x8 = _mm512_div_pd(
|
|
150
|
+
one_f64x8, _mm512_sqrt_pd(_mm512_fmadd_pd(tan_reduced_first_f64x8, tan_reduced_first_f64x8, one_f64x8)));
|
|
151
|
+
__m512d sin_reduced_first_f64x8 = _mm512_mul_pd(tan_reduced_first_f64x8, cos_reduced_first_f64x8);
|
|
152
|
+
__m512d cos_reduced_second_f64x8 = _mm512_div_pd(
|
|
153
|
+
one_f64x8, _mm512_sqrt_pd(_mm512_fmadd_pd(tan_reduced_second_f64x8, tan_reduced_second_f64x8, one_f64x8)));
|
|
154
|
+
__m512d sin_reduced_second_f64x8 = _mm512_mul_pd(tan_reduced_second_f64x8, cos_reduced_second_f64x8);
|
|
155
|
+
|
|
156
|
+
// Initialize lambda_f64x8 and tracking variables
|
|
157
|
+
__m512d lambda_f64x8 = longitude_difference_f64x8;
|
|
158
|
+
__m512d sin_angular_distance_f64x8, cos_angular_distance_f64x8, angular_distance_f64x8;
|
|
159
|
+
__m512d sin_azimuth_f64x8, cos_squared_azimuth_f64x8, cos_double_angular_midpoint_f64x8;
|
|
154
160
|
|
|
155
161
|
// Track convergence and coincident points
|
|
156
162
|
__mmask8 converged_mask = 0;
|
|
157
163
|
__mmask8 coincident_mask = 0;
|
|
158
164
|
|
|
159
165
|
for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS && converged_mask != 0xFF; ++iteration) {
|
|
160
|
-
__m512d
|
|
161
|
-
__m512d
|
|
162
|
-
|
|
163
|
-
// sin²(
|
|
164
|
-
__m512d
|
|
165
|
-
__m512d
|
|
166
|
-
_mm512_mul_pd(
|
|
167
|
-
_mm512_mul_pd(_mm512_mul_pd(
|
|
168
|
-
__m512d
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
166
|
+
__m512d sin_lambda_f64x8 = nk_sin_f64x8_skylake_(lambda_f64x8);
|
|
167
|
+
__m512d cos_lambda_f64x8 = nk_cos_f64x8_skylake_(lambda_f64x8);
|
|
168
|
+
|
|
169
|
+
// sin²(angular_distance_f64x8) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
|
|
170
|
+
__m512d cross_term_f64x8 = _mm512_mul_pd(cos_reduced_second_f64x8, sin_lambda_f64x8);
|
|
171
|
+
__m512d mixed_term_f64x8 = _mm512_sub_pd(
|
|
172
|
+
_mm512_mul_pd(cos_reduced_first_f64x8, sin_reduced_second_f64x8),
|
|
173
|
+
_mm512_mul_pd(_mm512_mul_pd(sin_reduced_first_f64x8, cos_reduced_second_f64x8), cos_lambda_f64x8));
|
|
174
|
+
__m512d sin_angular_dist_sq_f64x8 = _mm512_fmadd_pd(cross_term_f64x8, cross_term_f64x8,
|
|
175
|
+
_mm512_mul_pd(mixed_term_f64x8, mixed_term_f64x8));
|
|
176
|
+
sin_angular_distance_f64x8 = _mm512_sqrt_pd(sin_angular_dist_sq_f64x8);
|
|
177
|
+
|
|
178
|
+
// Check for coincident points (sin_angular_distance_f64x8 ≈ 0)
|
|
179
|
+
coincident_mask = _mm512_cmp_pd_mask(sin_angular_distance_f64x8, _mm512_set1_pd(1e-15), _CMP_LT_OS);
|
|
180
|
+
|
|
181
|
+
// cos(angular_distance_f64x8) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
|
|
182
|
+
cos_angular_distance_f64x8 = _mm512_fmadd_pd(_mm512_mul_pd(cos_reduced_first_f64x8, cos_reduced_second_f64x8),
|
|
183
|
+
cos_lambda_f64x8,
|
|
184
|
+
_mm512_mul_pd(sin_reduced_first_f64x8, sin_reduced_second_f64x8));
|
|
185
|
+
|
|
186
|
+
// angular_distance_f64x8 = atan2(sin, cos)
|
|
187
|
+
angular_distance_f64x8 = nk_atan2_f64x8_skylake_(sin_angular_distance_f64x8, cos_angular_distance_f64x8);
|
|
188
|
+
|
|
189
|
+
// sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance_f64x8)
|
|
182
190
|
// Use masked divide: zero result for coincident lanes, avoids division by zero
|
|
183
|
-
|
|
191
|
+
sin_azimuth_f64x8 = _mm512_maskz_div_pd(
|
|
184
192
|
_knot_mask8(coincident_mask),
|
|
185
|
-
_mm512_mul_pd(_mm512_mul_pd(
|
|
186
|
-
|
|
193
|
+
_mm512_mul_pd(_mm512_mul_pd(cos_reduced_first_f64x8, cos_reduced_second_f64x8), sin_lambda_f64x8),
|
|
194
|
+
sin_angular_distance_f64x8);
|
|
195
|
+
cos_squared_azimuth_f64x8 = _mm512_sub_pd(one_f64x8, _mm512_mul_pd(sin_azimuth_f64x8, sin_azimuth_f64x8));
|
|
187
196
|
|
|
188
197
|
// Handle equatorial case: cos²α = 0
|
|
189
|
-
__mmask8 equatorial_mask = _mm512_cmp_pd_mask(
|
|
198
|
+
__mmask8 equatorial_mask = _mm512_cmp_pd_mask(cos_squared_azimuth_f64x8, _mm512_set1_pd(1e-15), _CMP_LT_OS);
|
|
190
199
|
|
|
191
200
|
// cos(2σₘ) = cos(σ) - 2 × sin(U₁) × sin(U₂) / cos²(α)
|
|
192
|
-
// Use masked divide: for equatorial lanes,
|
|
201
|
+
// Use masked divide: for equatorial lanes, quotient_f64x8 = cos_angular_distance_f64x8 (passthrough),
|
|
193
202
|
// so subtraction yields zero. Avoids division by zero.
|
|
194
|
-
__m512d
|
|
195
|
-
__m512d
|
|
196
|
-
|
|
197
|
-
|
|
203
|
+
__m512d sin_product_f64x8 = _mm512_mul_pd(sin_reduced_first_f64x8, sin_reduced_second_f64x8);
|
|
204
|
+
__m512d quotient_f64x8 = _mm512_mask_div_pd(cos_angular_distance_f64x8, _knot_mask8(equatorial_mask),
|
|
205
|
+
_mm512_mul_pd(two_f64x8, sin_product_f64x8),
|
|
206
|
+
cos_squared_azimuth_f64x8);
|
|
207
|
+
cos_double_angular_midpoint_f64x8 = _mm512_sub_pd(cos_angular_distance_f64x8, quotient_f64x8);
|
|
198
208
|
|
|
199
209
|
// C = f/16 * cos²α * (4 + f*(4 - 3*cos²α))
|
|
200
|
-
__m512d
|
|
201
|
-
_mm512_div_pd(
|
|
202
|
-
_mm512_mul_pd(
|
|
203
|
-
|
|
210
|
+
__m512d correction_factor_f64x8 = _mm512_mul_pd(
|
|
211
|
+
_mm512_div_pd(flattening_f64x8, sixteen_f64x8),
|
|
212
|
+
_mm512_mul_pd(
|
|
213
|
+
cos_squared_azimuth_f64x8,
|
|
214
|
+
_mm512_fmadd_pd(flattening_f64x8, _mm512_fnmadd_pd(three_f64x8, cos_squared_azimuth_f64x8, four_f64x8),
|
|
215
|
+
four_f64x8)));
|
|
204
216
|
|
|
205
217
|
// λ' = L + (1-C) × f × sin(α) × (σ + C × sin(σ) × (cos(2σₘ) + C × cos(σ) × (-1 + 2 × cos²(2σₘ))))
|
|
206
|
-
__m512d
|
|
207
|
-
//
|
|
208
|
-
__m512d
|
|
209
|
-
//
|
|
210
|
-
__m512d
|
|
211
|
-
|
|
212
|
-
//
|
|
213
|
-
__m512d
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
218
|
+
__m512d cos_2sm_sq_f64x8 = _mm512_mul_pd(cos_double_angular_midpoint_f64x8, cos_double_angular_midpoint_f64x8);
|
|
219
|
+
// innermost_f64x8 = -1 + 2 × cos²(2σₘ)
|
|
220
|
+
__m512d innermost_f64x8 = _mm512_fmadd_pd(two_f64x8, cos_2sm_sq_f64x8, _mm512_set1_pd(-1.0));
|
|
221
|
+
// middle_f64x8 = cos(2σₘ) + C × cos(σ) × innermost_f64x8
|
|
222
|
+
__m512d middle_f64x8 = _mm512_fmadd_pd(_mm512_mul_pd(correction_factor_f64x8, cos_angular_distance_f64x8),
|
|
223
|
+
innermost_f64x8, cos_double_angular_midpoint_f64x8);
|
|
224
|
+
// inner_f64x8 = C × sin(σ) × middle_f64x8
|
|
225
|
+
__m512d inner_f64x8 = _mm512_mul_pd(_mm512_mul_pd(correction_factor_f64x8, sin_angular_distance_f64x8),
|
|
226
|
+
middle_f64x8);
|
|
227
|
+
|
|
228
|
+
// λ' = L + (1-C) * f * sin_α * (σ + inner_f64x8)
|
|
229
|
+
__m512d lambda_new_f64x8 = _mm512_fmadd_pd(
|
|
230
|
+
_mm512_mul_pd(_mm512_mul_pd(_mm512_sub_pd(one_f64x8, correction_factor_f64x8), flattening_f64x8),
|
|
231
|
+
sin_azimuth_f64x8),
|
|
232
|
+
_mm512_add_pd(angular_distance_f64x8, inner_f64x8), longitude_difference_f64x8);
|
|
219
233
|
|
|
220
234
|
// Check convergence: |λ - λ'| < threshold
|
|
221
|
-
__m512d
|
|
222
|
-
converged_mask = _mm512_cmp_pd_mask(
|
|
235
|
+
__m512d lambda_diff_f64x8 = _mm512_abs_pd(_mm512_sub_pd(lambda_new_f64x8, lambda_f64x8));
|
|
236
|
+
converged_mask = _mm512_cmp_pd_mask(lambda_diff_f64x8, convergence_threshold_f64x8, _CMP_LT_OS);
|
|
223
237
|
|
|
224
|
-
|
|
238
|
+
lambda_f64x8 = lambda_new_f64x8;
|
|
225
239
|
}
|
|
226
240
|
|
|
227
241
|
// Final distance calculation
|
|
228
242
|
// u² = cos²α * (a² - b²) / b²
|
|
229
|
-
__m512d
|
|
230
|
-
__m512d
|
|
231
|
-
__m512d
|
|
243
|
+
__m512d a_sq_f64x8 = _mm512_mul_pd(equatorial_radius_f64x8, equatorial_radius_f64x8);
|
|
244
|
+
__m512d b_sq_f64x8 = _mm512_mul_pd(polar_radius_f64x8, polar_radius_f64x8);
|
|
245
|
+
__m512d u_squared_f64x8 = _mm512_div_pd(
|
|
246
|
+
_mm512_mul_pd(cos_squared_azimuth_f64x8, _mm512_sub_pd(a_sq_f64x8, b_sq_f64x8)), b_sq_f64x8);
|
|
232
247
|
|
|
233
248
|
// A = 1 + u²/16384 * (4096 + u²*(-768 + u²*(320 - 175*u²)))
|
|
234
|
-
__m512d
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
249
|
+
__m512d series_a_f64x8 = _mm512_fmadd_pd(u_squared_f64x8, _mm512_set1_pd(-175.0), _mm512_set1_pd(320.0));
|
|
250
|
+
series_a_f64x8 = _mm512_fmadd_pd(u_squared_f64x8, series_a_f64x8, _mm512_set1_pd(-768.0));
|
|
251
|
+
series_a_f64x8 = _mm512_fmadd_pd(u_squared_f64x8, series_a_f64x8, _mm512_set1_pd(4096.0));
|
|
252
|
+
series_a_f64x8 = _mm512_fmadd_pd(_mm512_div_pd(u_squared_f64x8, _mm512_set1_pd(16384.0)), series_a_f64x8,
|
|
253
|
+
one_f64x8);
|
|
238
254
|
|
|
239
255
|
// B = u²/1024 * (256 + u²*(-128 + u²*(74 - 47*u²)))
|
|
240
|
-
__m512d
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
256
|
+
__m512d series_b_f64x8 = _mm512_fmadd_pd(u_squared_f64x8, _mm512_set1_pd(-47.0), _mm512_set1_pd(74.0));
|
|
257
|
+
series_b_f64x8 = _mm512_fmadd_pd(u_squared_f64x8, series_b_f64x8, _mm512_set1_pd(-128.0));
|
|
258
|
+
series_b_f64x8 = _mm512_fmadd_pd(u_squared_f64x8, series_b_f64x8, _mm512_set1_pd(256.0));
|
|
259
|
+
series_b_f64x8 = _mm512_mul_pd(_mm512_div_pd(u_squared_f64x8, _mm512_set1_pd(1024.0)), series_b_f64x8);
|
|
244
260
|
|
|
245
261
|
// Δσ = B × sin(σ) × (cos(2σₘ) +
|
|
246
262
|
// B/4 × (cos(σ) × (-1 + 2 × cos²(2σₘ)) - B/6 × cos(2σₘ) × (-3 + 4 × sin²(σ)) × (-3 + 4 × cos²(2σₘ))))
|
|
247
|
-
__m512d
|
|
248
|
-
__m512d
|
|
249
|
-
__m512d
|
|
250
|
-
|
|
251
|
-
__m512d
|
|
252
|
-
__m512d
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
263
|
+
__m512d cos_2sm_sq_f64x8 = _mm512_mul_pd(cos_double_angular_midpoint_f64x8, cos_double_angular_midpoint_f64x8);
|
|
264
|
+
__m512d sin_sq_f64x8 = _mm512_mul_pd(sin_angular_distance_f64x8, sin_angular_distance_f64x8);
|
|
265
|
+
__m512d term1_f64x8 = _mm512_fmadd_pd(two_f64x8, cos_2sm_sq_f64x8, _mm512_set1_pd(-1.0));
|
|
266
|
+
term1_f64x8 = _mm512_mul_pd(cos_angular_distance_f64x8, term1_f64x8);
|
|
267
|
+
__m512d term2_f64x8 = _mm512_fmadd_pd(four_f64x8, sin_sq_f64x8, _mm512_set1_pd(-3.0));
|
|
268
|
+
__m512d term3_f64x8 = _mm512_fmadd_pd(four_f64x8, cos_2sm_sq_f64x8, _mm512_set1_pd(-3.0));
|
|
269
|
+
term2_f64x8 = _mm512_mul_pd(
|
|
270
|
+
_mm512_mul_pd(_mm512_div_pd(series_b_f64x8, six_f64x8), cos_double_angular_midpoint_f64x8),
|
|
271
|
+
_mm512_mul_pd(term2_f64x8, term3_f64x8));
|
|
272
|
+
__m512d delta_sigma_f64x8 = _mm512_mul_pd(
|
|
273
|
+
series_b_f64x8, _mm512_mul_pd(sin_angular_distance_f64x8,
|
|
274
|
+
_mm512_add_pd(cos_double_angular_midpoint_f64x8,
|
|
275
|
+
_mm512_mul_pd(_mm512_div_pd(series_b_f64x8, four_f64x8),
|
|
276
|
+
_mm512_sub_pd(term1_f64x8, term2_f64x8)))));
|
|
259
277
|
|
|
260
278
|
// s = b * A * (σ - Δσ)
|
|
261
|
-
__m512d
|
|
262
|
-
|
|
279
|
+
__m512d distances_f64x8 = _mm512_mul_pd(_mm512_mul_pd(polar_radius_f64x8, series_a_f64x8),
|
|
280
|
+
_mm512_sub_pd(angular_distance_f64x8, delta_sigma_f64x8));
|
|
263
281
|
|
|
264
282
|
// Set coincident points to zero
|
|
265
|
-
|
|
283
|
+
distances_f64x8 = _mm512_mask_blend_pd(coincident_mask, distances_f64x8, _mm512_setzero_pd());
|
|
266
284
|
|
|
267
|
-
return
|
|
285
|
+
return distances_f64x8;
|
|
268
286
|
}
|
|
269
287
|
|
|
270
288
|
NK_PUBLIC void nk_vincenty_f64_skylake( //
|
|
@@ -273,14 +291,14 @@ NK_PUBLIC void nk_vincenty_f64_skylake( //
|
|
|
273
291
|
nk_size_t n, nk_f64_t *results) {
|
|
274
292
|
|
|
275
293
|
while (n >= 8) {
|
|
276
|
-
__m512d
|
|
277
|
-
__m512d
|
|
278
|
-
__m512d
|
|
279
|
-
__m512d
|
|
294
|
+
__m512d first_latitudes_f64x8 = _mm512_loadu_pd(a_lats);
|
|
295
|
+
__m512d first_longitudes_f64x8 = _mm512_loadu_pd(a_lons);
|
|
296
|
+
__m512d second_latitudes_f64x8 = _mm512_loadu_pd(b_lats);
|
|
297
|
+
__m512d second_longitudes_f64x8 = _mm512_loadu_pd(b_lons);
|
|
280
298
|
|
|
281
|
-
__m512d
|
|
282
|
-
|
|
283
|
-
_mm512_storeu_pd(results,
|
|
299
|
+
__m512d distances_f64x8 = nk_vincenty_f64x8_skylake_(first_latitudes_f64x8, first_longitudes_f64x8,
|
|
300
|
+
second_latitudes_f64x8, second_longitudes_f64x8);
|
|
301
|
+
_mm512_storeu_pd(results, distances_f64x8);
|
|
284
302
|
|
|
285
303
|
a_lats += 8, a_lons += 8, b_lats += 8, b_lons += 8, results += 8, n -= 8;
|
|
286
304
|
}
|
|
@@ -288,56 +306,60 @@ NK_PUBLIC void nk_vincenty_f64_skylake( //
|
|
|
288
306
|
// Handle remaining elements with masked operations
|
|
289
307
|
if (n > 0) {
|
|
290
308
|
__mmask8 mask = (__mmask8)_bzhi_u32(0xFF, n);
|
|
291
|
-
__m512d
|
|
292
|
-
__m512d
|
|
293
|
-
__m512d
|
|
294
|
-
__m512d
|
|
295
|
-
|
|
296
|
-
__m512d
|
|
297
|
-
|
|
298
|
-
_mm512_mask_storeu_pd(results, mask,
|
|
309
|
+
__m512d first_latitudes_f64x8 = _mm512_maskz_loadu_pd(mask, a_lats);
|
|
310
|
+
__m512d first_longitudes_f64x8 = _mm512_maskz_loadu_pd(mask, a_lons);
|
|
311
|
+
__m512d second_latitudes_f64x8 = _mm512_maskz_loadu_pd(mask, b_lats);
|
|
312
|
+
__m512d second_longitudes_f64x8 = _mm512_maskz_loadu_pd(mask, b_lons);
|
|
313
|
+
|
|
314
|
+
__m512d distances_f64x8 = nk_vincenty_f64x8_skylake_(first_latitudes_f64x8, first_longitudes_f64x8,
|
|
315
|
+
second_latitudes_f64x8, second_longitudes_f64x8);
|
|
316
|
+
_mm512_mask_storeu_pd(results, mask, distances_f64x8);
|
|
299
317
|
}
|
|
300
318
|
}
|
|
301
319
|
|
|
302
|
-
/**
 *  @brief AVX-512 helper evaluating the haversine great-circle distance for 16 f32 point pairs.
 *
 *  Computes a = sin²(Δlat/2) + cos(lat1) × cos(lat2) × sin²(Δlon/2), clamps it into [0, 1],
 *  and returns R × 2 × atan2(√a, √(1 - a)) with R taken from `NK_EARTH_MEDIATORIAL_RADIUS`.
 *
 *  @param first_latitudes_f32x16   Latitudes of the first points.
 *  @param first_longitudes_f32x16  Longitudes of the first points.
 *  @param second_latitudes_f32x16  Latitudes of the second points.
 *  @param second_longitudes_f32x16 Longitudes of the second points.
 *  @return Sixteen distances, expressed in the unit of `NK_EARTH_MEDIATORIAL_RADIUS`.
 *
 *  @note Inputs are fed directly to the project's sine/cosine kernels, so they are presumably
 *        expected in radians - TODO confirm against the callers' documentation.
 */
NK_INTERNAL __m512 nk_haversine_f32x16_skylake_(                      //
    __m512 first_latitudes_f32x16, __m512 first_longitudes_f32x16, //
    __m512 second_latitudes_f32x16, __m512 second_longitudes_f32x16) {

    __m512 const zero_f32x16 = _mm512_setzero_ps();
    __m512 const half_f32x16 = _mm512_set1_ps(0.5f);
    __m512 const one_f32x16 = _mm512_set1_ps(1.0f);
    __m512 const two_f32x16 = _mm512_set1_ps(2.0f);
    __m512 const radius_f32x16 = _mm512_set1_ps((float)NK_EARTH_MEDIATORIAL_RADIUS);

    // Half of the latitude / longitude separation between the two points of every pair
    __m512 half_dlat_f32x16 = _mm512_mul_ps( //
        _mm512_sub_ps(second_latitudes_f32x16, first_latitudes_f32x16), half_f32x16);
    __m512 half_dlon_f32x16 = _mm512_mul_ps( //
        _mm512_sub_ps(second_longitudes_f32x16, first_longitudes_f32x16), half_f32x16);

    // sin²(Δlat/2) and sin²(Δlon/2)
    __m512 sin_half_dlat_f32x16 = nk_sin_f32x16_skylake_(half_dlat_f32x16);
    __m512 sin_half_dlon_f32x16 = nk_sin_f32x16_skylake_(half_dlon_f32x16);
    __m512 hav_lat_f32x16 = _mm512_mul_ps(sin_half_dlat_f32x16, sin_half_dlat_f32x16);
    __m512 hav_lon_f32x16 = _mm512_mul_ps(sin_half_dlon_f32x16, sin_half_dlon_f32x16);

    // cos(lat1) × cos(lat2)
    __m512 cos_lat1_f32x16 = nk_cos_f32x16_skylake_(first_latitudes_f32x16);
    __m512 cos_lat2_f32x16 = nk_cos_f32x16_skylake_(second_latitudes_f32x16);
    __m512 cos_lat_product_f32x16 = _mm512_mul_ps(cos_lat1_f32x16, cos_lat2_f32x16);

    // a = sin²(Δlat/2) + cos(lat1) × cos(lat2) × sin²(Δlon/2)
    __m512 a_f32x16 = _mm512_add_ps(hav_lat_f32x16, _mm512_mul_ps(cos_lat_product_f32x16, hav_lon_f32x16));

    // Rounding may push `a` marginally outside [0, 1]; clamp so neither square root sees a negative value.
    // Operand order is kept as (constant, value) for both min and max.
    __m512 a_capped_f32x16 = _mm512_min_ps(one_f32x16, a_f32x16);
    __m512 a_clamped_f32x16 = _mm512_max_ps(zero_f32x16, a_capped_f32x16);

    // Central angle: c = 2 × atan2(√a, √(1-a))
    __m512 root_a_f32x16 = _mm512_sqrt_ps(a_clamped_f32x16);
    __m512 root_one_minus_a_f32x16 = _mm512_sqrt_ps(_mm512_sub_ps(one_f32x16, a_clamped_f32x16));
    __m512 half_angle_f32x16 = nk_atan2_f32x16_skylake_(root_a_f32x16, root_one_minus_a_f32x16);
    __m512 central_angle_f32x16 = _mm512_mul_ps(two_f32x16, half_angle_f32x16);

    // Distance = R × c
    return _mm512_mul_ps(radius_f32x16, central_angle_f32x16);
}
|
|
342
364
|
|
|
343
365
|
NK_PUBLIC void nk_haversine_f32_skylake( //
|
|
@@ -346,14 +368,14 @@ NK_PUBLIC void nk_haversine_f32_skylake( //
|
|
|
346
368
|
nk_size_t n, nk_f32_t *results) {
|
|
347
369
|
|
|
348
370
|
while (n >= 16) {
|
|
349
|
-
__m512
|
|
350
|
-
__m512
|
|
351
|
-
__m512
|
|
352
|
-
__m512
|
|
371
|
+
__m512 first_latitudes_f32x16 = _mm512_loadu_ps(a_lats);
|
|
372
|
+
__m512 first_longitudes_f32x16 = _mm512_loadu_ps(a_lons);
|
|
373
|
+
__m512 second_latitudes_f32x16 = _mm512_loadu_ps(b_lats);
|
|
374
|
+
__m512 second_longitudes_f32x16 = _mm512_loadu_ps(b_lons);
|
|
353
375
|
|
|
354
|
-
__m512
|
|
355
|
-
|
|
356
|
-
_mm512_storeu_ps(results,
|
|
376
|
+
__m512 distances_f32x16 = nk_haversine_f32x16_skylake_(first_latitudes_f32x16, first_longitudes_f32x16,
|
|
377
|
+
second_latitudes_f32x16, second_longitudes_f32x16);
|
|
378
|
+
_mm512_storeu_ps(results, distances_f32x16);
|
|
357
379
|
|
|
358
380
|
a_lats += 16, a_lons += 16, b_lats += 16, b_lons += 16, results += 16, n -= 16;
|
|
359
381
|
}
|
|
@@ -361,14 +383,14 @@ NK_PUBLIC void nk_haversine_f32_skylake( //
|
|
|
361
383
|
// Handle remaining elements with masked operations
|
|
362
384
|
if (n > 0) {
|
|
363
385
|
__mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, n);
|
|
364
|
-
__m512
|
|
365
|
-
__m512
|
|
366
|
-
__m512
|
|
367
|
-
__m512
|
|
368
|
-
|
|
369
|
-
__m512
|
|
370
|
-
|
|
371
|
-
_mm512_mask_storeu_ps(results, mask,
|
|
386
|
+
__m512 first_latitudes_f32x16 = _mm512_maskz_loadu_ps(mask, a_lats);
|
|
387
|
+
__m512 first_longitudes_f32x16 = _mm512_maskz_loadu_ps(mask, a_lons);
|
|
388
|
+
__m512 second_latitudes_f32x16 = _mm512_maskz_loadu_ps(mask, b_lats);
|
|
389
|
+
__m512 second_longitudes_f32x16 = _mm512_maskz_loadu_ps(mask, b_lons);
|
|
390
|
+
|
|
391
|
+
__m512 distances_f32x16 = nk_haversine_f32x16_skylake_(first_latitudes_f32x16, first_longitudes_f32x16,
|
|
392
|
+
second_latitudes_f32x16, second_longitudes_f32x16);
|
|
393
|
+
_mm512_mask_storeu_ps(results, mask, distances_f32x16);
|
|
372
394
|
}
|
|
373
395
|
}
|
|
374
396
|
|
|
@@ -376,158 +398,172 @@ NK_PUBLIC void nk_haversine_f32_skylake( //
|
|
|
376
398
|
* @brief AVX-512 helper for Vincenty's geodesic distance on 16 f32 point pairs.
|
|
377
399
|
* @note This is a true SIMD implementation using masked convergence tracking.
|
|
378
400
|
*/
|
|
379
|
-
NK_INTERNAL __m512 nk_vincenty_f32x16_skylake_(                      //
    __m512 first_latitudes_f32x16, __m512 first_longitudes_f32x16, //
    __m512 second_latitudes_f32x16, __m512 second_longitudes_f32x16) {

    // Takes the latitudes/longitudes of 16 point pairs and returns 16 geodesic distances in the unit
    // of the `NK_EARTH_ELLIPSOID_*_RADIUS` macros. Inputs go straight into the project's sine/cosine
    // kernels, so they are presumably radians - TODO confirm against the public wrapper's documentation.

    // Ellipsoid parameters and the convergence tolerance are supplied by project-level macros
    __m512 const equatorial_radius_f32x16 = _mm512_set1_ps((float)NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
    __m512 const polar_radius_f32x16 = _mm512_set1_ps((float)NK_EARTH_ELLIPSOID_POLAR_RADIUS);
    __m512 const flattening_f32x16 = _mm512_set1_ps(1.0f / (float)NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
    __m512 const convergence_threshold_f32x16 = _mm512_set1_ps(NK_VINCENTY_CONVERGENCE_THRESHOLD_F32);
    __m512 const minus_three_f32x16 = _mm512_set1_ps(-3.0f);
    __m512 const minus_one_f32x16 = _mm512_set1_ps(-1.0f);
    __m512 const one_f32x16 = _mm512_set1_ps(1.0f);
    __m512 const two_f32x16 = _mm512_set1_ps(2.0f);
    __m512 const three_f32x16 = _mm512_set1_ps(3.0f);
    __m512 const four_f32x16 = _mm512_set1_ps(4.0f);
    __m512 const six_f32x16 = _mm512_set1_ps(6.0f);
    __m512 const sixteen_f32x16 = _mm512_set1_ps(16.0f);
    // Shared cut-off for "sin(σ) is zero" (coincident points) and "cos²(α) is zero" (equatorial line)
    __m512 const tiny_f32x16 = _mm512_set1_ps(1e-7f);
    // f/16 is the leading factor of the correction term C and never changes between iterations
    __m512 const flattening_over_sixteen_f32x16 = _mm512_div_ps(flattening_f32x16, sixteen_f32x16);

    // Longitude difference
    __m512 longitude_difference_f32x16 = _mm512_sub_ps(second_longitudes_f32x16, first_longitudes_f32x16);

    // Reduced latitudes: tan(U) = (1-f) * tan(lat)
    __m512 one_minus_f_f32x16 = _mm512_sub_ps(one_f32x16, flattening_f32x16);
    __m512 tan_first_f32x16 = _mm512_div_ps(nk_sin_f32x16_skylake_(first_latitudes_f32x16),
                                            nk_cos_f32x16_skylake_(first_latitudes_f32x16));
    __m512 tan_second_f32x16 = _mm512_div_ps(nk_sin_f32x16_skylake_(second_latitudes_f32x16),
                                             nk_cos_f32x16_skylake_(second_latitudes_f32x16));
    __m512 tan_reduced_first_f32x16 = _mm512_mul_ps(one_minus_f_f32x16, tan_first_f32x16);
    __m512 tan_reduced_second_f32x16 = _mm512_mul_ps(one_minus_f_f32x16, tan_second_f32x16);

    // cos(U) = 1/√(1 + tan²(U)), sin(U) = tan(U) × cos(U)
    __m512 cos_reduced_first_f32x16 = _mm512_div_ps(
        one_f32x16, _mm512_sqrt_ps(_mm512_fmadd_ps(tan_reduced_first_f32x16, tan_reduced_first_f32x16, one_f32x16)));
    __m512 sin_reduced_first_f32x16 = _mm512_mul_ps(tan_reduced_first_f32x16, cos_reduced_first_f32x16);
    __m512 cos_reduced_second_f32x16 = _mm512_div_ps(
        one_f32x16, _mm512_sqrt_ps(_mm512_fmadd_ps(tan_reduced_second_f32x16, tan_reduced_second_f32x16, one_f32x16)));
    __m512 sin_reduced_second_f32x16 = _mm512_mul_ps(tan_reduced_second_f32x16, cos_reduced_second_f32x16);

    // Products of the reduced-latitude terms do not depend on λ, so they are formed once up front
    // instead of being rebuilt (some of them twice) on every iteration.
    __m512 const cos_first_cos_second_f32x16 = _mm512_mul_ps(cos_reduced_first_f32x16, cos_reduced_second_f32x16);
    __m512 const sin_first_sin_second_f32x16 = _mm512_mul_ps(sin_reduced_first_f32x16, sin_reduced_second_f32x16);
    __m512 const cos_first_sin_second_f32x16 = _mm512_mul_ps(cos_reduced_first_f32x16, sin_reduced_second_f32x16);
    __m512 const sin_first_cos_second_f32x16 = _mm512_mul_ps(sin_reduced_first_f32x16, cos_reduced_second_f32x16);
    __m512 const twice_sin_product_f32x16 = _mm512_mul_ps(two_f32x16, sin_first_sin_second_f32x16);

    // Iteration state. The vectors below are read after the loop, so they start from a defined value
    // rather than relying on the loop body having executed at least once.
    __m512 lambda_f32x16 = longitude_difference_f32x16;
    __m512 sin_angular_distance_f32x16 = _mm512_setzero_ps();
    __m512 cos_angular_distance_f32x16 = _mm512_setzero_ps();
    __m512 angular_distance_f32x16 = _mm512_setzero_ps();
    __m512 sin_azimuth_f32x16 = _mm512_setzero_ps();
    __m512 cos_squared_azimuth_f32x16 = _mm512_setzero_ps();
    __m512 cos_double_angular_midpoint_f32x16 = _mm512_setzero_ps();

    // Track convergence and coincident points
    __mmask16 converged_mask = 0;
    __mmask16 coincident_mask = 0;

    // All 16 lanes are updated on every pass; the loop stops once every lane's last step was below
    // the threshold, or after NK_VINCENTY_MAX_ITERATIONS passes. The comparison is ordered, so a
    // NaN lane never reports convergence.
    for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS && converged_mask != 0xFFFF; ++iteration) {
        __m512 sin_lambda_f32x16 = nk_sin_f32x16_skylake_(lambda_f32x16);
        __m512 cos_lambda_f32x16 = nk_cos_f32x16_skylake_(lambda_f32x16);

        // sin²(σ) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
        __m512 cross_term_f32x16 = _mm512_mul_ps(cos_reduced_second_f32x16, sin_lambda_f32x16);
        __m512 mixed_term_f32x16 = _mm512_sub_ps(cos_first_sin_second_f32x16,
                                                 _mm512_mul_ps(sin_first_cos_second_f32x16, cos_lambda_f32x16));
        __m512 sin_angular_dist_sq_f32x16 = _mm512_fmadd_ps(cross_term_f32x16, cross_term_f32x16,
                                                            _mm512_mul_ps(mixed_term_f32x16, mixed_term_f32x16));
        sin_angular_distance_f32x16 = _mm512_sqrt_ps(sin_angular_dist_sq_f32x16);

        // Check for coincident points (sin(σ) ≈ 0)
        coincident_mask = _mm512_cmp_ps_mask(sin_angular_distance_f32x16, tiny_f32x16, _CMP_LT_OS);

        // cos(σ) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
        cos_angular_distance_f32x16 = _mm512_fmadd_ps(cos_first_cos_second_f32x16, cos_lambda_f32x16,
                                                      sin_first_sin_second_f32x16);

        // σ = atan2(sin(σ), cos(σ))
        angular_distance_f32x16 = nk_atan2_f32x16_skylake_(sin_angular_distance_f32x16, cos_angular_distance_f32x16);

        // sin(α) = cos(U₁) × cos(U₂) × sin(λ) / sin(σ)
        // Use masked divide: zero result for coincident lanes, avoids division by zero
        sin_azimuth_f32x16 = _mm512_maskz_div_ps(_knot_mask16(coincident_mask),
                                                 _mm512_mul_ps(cos_first_cos_second_f32x16, sin_lambda_f32x16),
                                                 sin_angular_distance_f32x16);
        cos_squared_azimuth_f32x16 = _mm512_sub_ps(one_f32x16, _mm512_mul_ps(sin_azimuth_f32x16, sin_azimuth_f32x16));

        // Handle equatorial case: cos²α = 0
        __mmask16 equatorial_mask = _mm512_cmp_ps_mask(cos_squared_azimuth_f32x16, tiny_f32x16, _CMP_LT_OS);

        // cos(2σₘ) = cos(σ) - 2 × sin(U₁) × sin(U₂) / cos²(α)
        // Use masked divide: for equatorial lanes the quotient is cos(σ) itself (passthrough),
        // so the subtraction yields zero. Avoids division by zero.
        __m512 quotient_f32x16 = _mm512_mask_div_ps(cos_angular_distance_f32x16, _knot_mask16(equatorial_mask),
                                                    twice_sin_product_f32x16, cos_squared_azimuth_f32x16);
        cos_double_angular_midpoint_f32x16 = _mm512_sub_ps(cos_angular_distance_f32x16, quotient_f32x16);

        // C = f/16 * cos²α * (4 + f*(4 - 3*cos²α))
        __m512 correction_factor_f32x16 = _mm512_mul_ps(
            flattening_over_sixteen_f32x16,
            _mm512_mul_ps(
                cos_squared_azimuth_f32x16,
                _mm512_fmadd_ps(flattening_f32x16,
                                _mm512_fnmadd_ps(three_f32x16, cos_squared_azimuth_f32x16, four_f32x16), four_f32x16)));

        // λ' = L + (1-C) × f × sin(α) × (σ + C × sin(σ) × (cos(2σₘ) + C × cos(σ) × (-1 + 2 × cos²(2σₘ))))
        __m512 cos_2sm_sq_f32x16 = _mm512_mul_ps(cos_double_angular_midpoint_f32x16,
                                                 cos_double_angular_midpoint_f32x16);
        // innermost = -1 + 2 × cos²(2σₘ)
        __m512 innermost_f32x16 = _mm512_fmadd_ps(two_f32x16, cos_2sm_sq_f32x16, minus_one_f32x16);
        // middle = cos(2σₘ) + C × cos(σ) × innermost
        __m512 middle_f32x16 = _mm512_fmadd_ps(_mm512_mul_ps(correction_factor_f32x16, cos_angular_distance_f32x16),
                                               innermost_f32x16, cos_double_angular_midpoint_f32x16);
        // inner = C × sin(σ) × middle
        __m512 inner_f32x16 = _mm512_mul_ps(_mm512_mul_ps(correction_factor_f32x16, sin_angular_distance_f32x16),
                                            middle_f32x16);

        // λ' = L + (1-C) × f × sin(α) × (σ + inner)
        __m512 lambda_new_f32x16 = _mm512_fmadd_ps(
            _mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(one_f32x16, correction_factor_f32x16), flattening_f32x16),
                          sin_azimuth_f32x16),
            _mm512_add_ps(angular_distance_f32x16, inner_f32x16), longitude_difference_f32x16);

        // Check convergence: |λ - λ'| < threshold
        __m512 lambda_diff_f32x16 = _mm512_abs_ps(_mm512_sub_ps(lambda_new_f32x16, lambda_f32x16));
        converged_mask = _mm512_cmp_ps_mask(lambda_diff_f32x16, convergence_threshold_f32x16, _CMP_LT_OS);

        lambda_f32x16 = lambda_new_f32x16;
    }

    // Final distance calculation
    // u² = cos²α * (a² - b²) / b²
    __m512 a_sq_f32x16 = _mm512_mul_ps(equatorial_radius_f32x16, equatorial_radius_f32x16);
    __m512 b_sq_f32x16 = _mm512_mul_ps(polar_radius_f32x16, polar_radius_f32x16);
    __m512 u_squared_f32x16 = _mm512_div_ps(
        _mm512_mul_ps(cos_squared_azimuth_f32x16, _mm512_sub_ps(a_sq_f32x16, b_sq_f32x16)), b_sq_f32x16);

    // A = 1 + u²/16384 * (4096 + u²*(-768 + u²*(320 - 175*u²))), evaluated innermost-first
    __m512 series_a_f32x16 = _mm512_fmadd_ps(u_squared_f32x16, _mm512_set1_ps(-175.0f), _mm512_set1_ps(320.0f));
    series_a_f32x16 = _mm512_fmadd_ps(u_squared_f32x16, series_a_f32x16, _mm512_set1_ps(-768.0f));
    series_a_f32x16 = _mm512_fmadd_ps(u_squared_f32x16, series_a_f32x16, _mm512_set1_ps(4096.0f));
    series_a_f32x16 = _mm512_fmadd_ps(_mm512_div_ps(u_squared_f32x16, _mm512_set1_ps(16384.0f)), series_a_f32x16,
                                      one_f32x16);

    // B = u²/1024 * (256 + u²*(-128 + u²*(74 - 47*u²))), evaluated innermost-first
    __m512 series_b_f32x16 = _mm512_fmadd_ps(u_squared_f32x16, _mm512_set1_ps(-47.0f), _mm512_set1_ps(74.0f));
    series_b_f32x16 = _mm512_fmadd_ps(u_squared_f32x16, series_b_f32x16, _mm512_set1_ps(-128.0f));
    series_b_f32x16 = _mm512_fmadd_ps(u_squared_f32x16, series_b_f32x16, _mm512_set1_ps(256.0f));
    series_b_f32x16 = _mm512_mul_ps(_mm512_div_ps(u_squared_f32x16, _mm512_set1_ps(1024.0f)), series_b_f32x16);

    // Δσ = B × sin(σ) × (cos(2σₘ) +
    //      B/4 × (cos(σ) × (-1 + 2 × cos²(2σₘ)) - B/6 × cos(2σₘ) × (-3 + 4 × sin²(σ)) × (-3 + 4 × cos²(2σₘ))))
    __m512 cos_2sm_sq_f32x16 = _mm512_mul_ps(cos_double_angular_midpoint_f32x16, cos_double_angular_midpoint_f32x16);
    __m512 sin_sq_f32x16 = _mm512_mul_ps(sin_angular_distance_f32x16, sin_angular_distance_f32x16);
    // term1 = cos(σ) × (-1 + 2 × cos²(2σₘ))
    __m512 term1_f32x16 = _mm512_fmadd_ps(two_f32x16, cos_2sm_sq_f32x16, minus_one_f32x16);
    term1_f32x16 = _mm512_mul_ps(cos_angular_distance_f32x16, term1_f32x16);
    // term2 = B/6 × cos(2σₘ) × (-3 + 4 × sin²(σ)) × (-3 + 4 × cos²(2σₘ))
    __m512 term2_f32x16 = _mm512_fmadd_ps(four_f32x16, sin_sq_f32x16, minus_three_f32x16);
    __m512 term3_f32x16 = _mm512_fmadd_ps(four_f32x16, cos_2sm_sq_f32x16, minus_three_f32x16);
    term2_f32x16 = _mm512_mul_ps(
        _mm512_mul_ps(_mm512_div_ps(series_b_f32x16, six_f32x16), cos_double_angular_midpoint_f32x16),
        _mm512_mul_ps(term2_f32x16, term3_f32x16));
    __m512 delta_sigma_f32x16 = _mm512_mul_ps(
        series_b_f32x16, _mm512_mul_ps(sin_angular_distance_f32x16,
                                       _mm512_add_ps(cos_double_angular_midpoint_f32x16,
                                                     _mm512_mul_ps(_mm512_div_ps(series_b_f32x16, four_f32x16),
                                                                   _mm512_sub_ps(term1_f32x16, term2_f32x16)))));

    // s = b * A * (σ - Δσ)
    __m512 distances_f32x16 = _mm512_mul_ps(_mm512_mul_ps(polar_radius_f32x16, series_a_f32x16),
                                            _mm512_sub_ps(angular_distance_f32x16, delta_sigma_f32x16));

    // Set coincident points to zero (mask taken from the last executed iteration)
    distances_f32x16 = _mm512_mask_blend_ps(coincident_mask, distances_f32x16, _mm512_setzero_ps());

    return distances_f32x16;
}
|
|
532
568
|
|
|
533
569
|
NK_PUBLIC void nk_vincenty_f32_skylake( //
|
|
@@ -536,14 +572,14 @@ NK_PUBLIC void nk_vincenty_f32_skylake( //
|
|
|
536
572
|
nk_size_t n, nk_f32_t *results) {
|
|
537
573
|
|
|
538
574
|
while (n >= 16) {
|
|
539
|
-
__m512
|
|
540
|
-
__m512
|
|
541
|
-
__m512
|
|
542
|
-
__m512
|
|
575
|
+
__m512 first_latitudes_f32x16 = _mm512_loadu_ps(a_lats);
|
|
576
|
+
__m512 first_longitudes_f32x16 = _mm512_loadu_ps(a_lons);
|
|
577
|
+
__m512 second_latitudes_f32x16 = _mm512_loadu_ps(b_lats);
|
|
578
|
+
__m512 second_longitudes_f32x16 = _mm512_loadu_ps(b_lons);
|
|
543
579
|
|
|
544
|
-
__m512
|
|
545
|
-
|
|
546
|
-
_mm512_storeu_ps(results,
|
|
580
|
+
__m512 distances_f32x16 = nk_vincenty_f32x16_skylake_(first_latitudes_f32x16, first_longitudes_f32x16,
|
|
581
|
+
second_latitudes_f32x16, second_longitudes_f32x16);
|
|
582
|
+
_mm512_storeu_ps(results, distances_f32x16);
|
|
547
583
|
|
|
548
584
|
a_lats += 16, a_lons += 16, b_lats += 16, b_lons += 16, results += 16, n -= 16;
|
|
549
585
|
}
|
|
@@ -551,14 +587,14 @@ NK_PUBLIC void nk_vincenty_f32_skylake( //
|
|
|
551
587
|
// Handle remaining elements with masked operations
|
|
552
588
|
if (n > 0) {
|
|
553
589
|
__mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, n);
|
|
554
|
-
__m512
|
|
555
|
-
__m512
|
|
556
|
-
__m512
|
|
557
|
-
__m512
|
|
558
|
-
|
|
559
|
-
__m512
|
|
560
|
-
|
|
561
|
-
_mm512_mask_storeu_ps(results, mask,
|
|
590
|
+
__m512 first_latitudes_f32x16 = _mm512_maskz_loadu_ps(mask, a_lats);
|
|
591
|
+
__m512 first_longitudes_f32x16 = _mm512_maskz_loadu_ps(mask, a_lons);
|
|
592
|
+
__m512 second_latitudes_f32x16 = _mm512_maskz_loadu_ps(mask, b_lats);
|
|
593
|
+
__m512 second_longitudes_f32x16 = _mm512_maskz_loadu_ps(mask, b_lons);
|
|
594
|
+
|
|
595
|
+
__m512 distances_f32x16 = nk_vincenty_f32x16_skylake_(first_latitudes_f32x16, first_longitudes_f32x16,
|
|
596
|
+
second_latitudes_f32x16, second_longitudes_f32x16);
|
|
597
|
+
_mm512_mask_storeu_ps(results, mask, distances_f32x16);
|
|
562
598
|
}
|
|
563
599
|
}
|
|
564
600
|
|