numkong 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +495 -0
- package/binding.gyp +540 -0
- package/c/dispatch.h +512 -0
- package/c/dispatch_bf16.c +389 -0
- package/c/dispatch_bf16c.c +52 -0
- package/c/dispatch_e2m3.c +263 -0
- package/c/dispatch_e3m2.c +243 -0
- package/c/dispatch_e4m3.c +276 -0
- package/c/dispatch_e5m2.c +272 -0
- package/c/dispatch_f16.c +376 -0
- package/c/dispatch_f16c.c +58 -0
- package/c/dispatch_f32.c +378 -0
- package/c/dispatch_f32c.c +99 -0
- package/c/dispatch_f64.c +296 -0
- package/c/dispatch_f64c.c +98 -0
- package/c/dispatch_i16.c +96 -0
- package/c/dispatch_i32.c +89 -0
- package/c/dispatch_i4.c +150 -0
- package/c/dispatch_i64.c +86 -0
- package/c/dispatch_i8.c +289 -0
- package/c/dispatch_other.c +330 -0
- package/c/dispatch_u1.c +148 -0
- package/c/dispatch_u16.c +124 -0
- package/c/dispatch_u32.c +118 -0
- package/c/dispatch_u4.c +150 -0
- package/c/dispatch_u64.c +102 -0
- package/c/dispatch_u8.c +303 -0
- package/c/numkong.c +950 -0
- package/include/README.md +573 -0
- package/include/module.modulemap +129 -0
- package/include/numkong/attention/sapphireamx.h +1361 -0
- package/include/numkong/attention/sme.h +2066 -0
- package/include/numkong/attention.h +49 -0
- package/include/numkong/capabilities.h +748 -0
- package/include/numkong/cast/README.md +262 -0
- package/include/numkong/cast/haswell.h +975 -0
- package/include/numkong/cast/icelake.h +470 -0
- package/include/numkong/cast/neon.h +1192 -0
- package/include/numkong/cast/rvv.h +1021 -0
- package/include/numkong/cast/sapphire.h +262 -0
- package/include/numkong/cast/serial.h +2262 -0
- package/include/numkong/cast/skylake.h +856 -0
- package/include/numkong/cast/v128relaxed.h +180 -0
- package/include/numkong/cast.h +230 -0
- package/include/numkong/curved/README.md +223 -0
- package/include/numkong/curved/genoa.h +182 -0
- package/include/numkong/curved/haswell.h +276 -0
- package/include/numkong/curved/neon.h +205 -0
- package/include/numkong/curved/neonbfdot.h +212 -0
- package/include/numkong/curved/neonhalf.h +212 -0
- package/include/numkong/curved/rvv.h +305 -0
- package/include/numkong/curved/serial.h +207 -0
- package/include/numkong/curved/skylake.h +457 -0
- package/include/numkong/curved/smef64.h +506 -0
- package/include/numkong/curved.h +517 -0
- package/include/numkong/curved.hpp +144 -0
- package/include/numkong/dot/README.md +425 -0
- package/include/numkong/dot/alder.h +563 -0
- package/include/numkong/dot/genoa.h +315 -0
- package/include/numkong/dot/haswell.h +1688 -0
- package/include/numkong/dot/icelake.h +883 -0
- package/include/numkong/dot/neon.h +818 -0
- package/include/numkong/dot/neonbfdot.h +244 -0
- package/include/numkong/dot/neonfhm.h +360 -0
- package/include/numkong/dot/neonhalf.h +198 -0
- package/include/numkong/dot/neonsdot.h +508 -0
- package/include/numkong/dot/rvv.h +714 -0
- package/include/numkong/dot/rvvbb.h +72 -0
- package/include/numkong/dot/rvvbf16.h +123 -0
- package/include/numkong/dot/rvvhalf.h +129 -0
- package/include/numkong/dot/sapphire.h +141 -0
- package/include/numkong/dot/serial.h +838 -0
- package/include/numkong/dot/sierra.h +405 -0
- package/include/numkong/dot/skylake.h +1084 -0
- package/include/numkong/dot/sve.h +379 -0
- package/include/numkong/dot/svebfdot.h +74 -0
- package/include/numkong/dot/svehalf.h +123 -0
- package/include/numkong/dot/v128relaxed.h +1258 -0
- package/include/numkong/dot.h +1070 -0
- package/include/numkong/dot.hpp +94 -0
- package/include/numkong/dots/README.md +496 -0
- package/include/numkong/dots/alder.h +114 -0
- package/include/numkong/dots/genoa.h +94 -0
- package/include/numkong/dots/haswell.h +295 -0
- package/include/numkong/dots/icelake.h +171 -0
- package/include/numkong/dots/neon.h +120 -0
- package/include/numkong/dots/neonbfdot.h +58 -0
- package/include/numkong/dots/neonfhm.h +94 -0
- package/include/numkong/dots/neonhalf.h +57 -0
- package/include/numkong/dots/neonsdot.h +108 -0
- package/include/numkong/dots/rvv.h +2486 -0
- package/include/numkong/dots/sapphireamx.h +3973 -0
- package/include/numkong/dots/serial.h +2844 -0
- package/include/numkong/dots/sierra.h +97 -0
- package/include/numkong/dots/skylake.h +196 -0
- package/include/numkong/dots/sme.h +5372 -0
- package/include/numkong/dots/smebi32.h +461 -0
- package/include/numkong/dots/smef64.h +1318 -0
- package/include/numkong/dots/smehalf.h +47 -0
- package/include/numkong/dots/v128relaxed.h +294 -0
- package/include/numkong/dots.h +2804 -0
- package/include/numkong/dots.hpp +639 -0
- package/include/numkong/each/README.md +469 -0
- package/include/numkong/each/haswell.h +1658 -0
- package/include/numkong/each/icelake.h +272 -0
- package/include/numkong/each/neon.h +1104 -0
- package/include/numkong/each/neonbfdot.h +212 -0
- package/include/numkong/each/neonhalf.h +410 -0
- package/include/numkong/each/rvv.h +1121 -0
- package/include/numkong/each/sapphire.h +477 -0
- package/include/numkong/each/serial.h +260 -0
- package/include/numkong/each/skylake.h +1562 -0
- package/include/numkong/each.h +2146 -0
- package/include/numkong/each.hpp +434 -0
- package/include/numkong/geospatial/README.md +147 -0
- package/include/numkong/geospatial/haswell.h +593 -0
- package/include/numkong/geospatial/neon.h +571 -0
- package/include/numkong/geospatial/rvv.h +701 -0
- package/include/numkong/geospatial/serial.h +309 -0
- package/include/numkong/geospatial/skylake.h +577 -0
- package/include/numkong/geospatial/v128relaxed.h +613 -0
- package/include/numkong/geospatial.h +453 -0
- package/include/numkong/geospatial.hpp +235 -0
- package/include/numkong/matrix.hpp +336 -0
- package/include/numkong/maxsim/README.md +187 -0
- package/include/numkong/maxsim/alder.h +511 -0
- package/include/numkong/maxsim/genoa.h +115 -0
- package/include/numkong/maxsim/haswell.h +553 -0
- package/include/numkong/maxsim/icelake.h +480 -0
- package/include/numkong/maxsim/neonsdot.h +394 -0
- package/include/numkong/maxsim/sapphireamx.h +877 -0
- package/include/numkong/maxsim/serial.h +490 -0
- package/include/numkong/maxsim/sme.h +929 -0
- package/include/numkong/maxsim/v128relaxed.h +280 -0
- package/include/numkong/maxsim.h +571 -0
- package/include/numkong/maxsim.hpp +133 -0
- package/include/numkong/mesh/README.md +227 -0
- package/include/numkong/mesh/haswell.h +2235 -0
- package/include/numkong/mesh/neon.h +1329 -0
- package/include/numkong/mesh/neonbfdot.h +842 -0
- package/include/numkong/mesh/neonhalf.h +616 -0
- package/include/numkong/mesh/rvv.h +916 -0
- package/include/numkong/mesh/serial.h +742 -0
- package/include/numkong/mesh/skylake.h +1135 -0
- package/include/numkong/mesh/v128relaxed.h +1052 -0
- package/include/numkong/mesh.h +652 -0
- package/include/numkong/mesh.hpp +762 -0
- package/include/numkong/numkong.h +78 -0
- package/include/numkong/numkong.hpp +57 -0
- package/include/numkong/probability/README.md +173 -0
- package/include/numkong/probability/haswell.h +267 -0
- package/include/numkong/probability/neon.h +225 -0
- package/include/numkong/probability/rvv.h +409 -0
- package/include/numkong/probability/serial.h +169 -0
- package/include/numkong/probability/skylake.h +324 -0
- package/include/numkong/probability.h +383 -0
- package/include/numkong/probability.hpp +120 -0
- package/include/numkong/random.h +50 -0
- package/include/numkong/random.hpp +285 -0
- package/include/numkong/reduce/README.md +547 -0
- package/include/numkong/reduce/alder.h +632 -0
- package/include/numkong/reduce/genoa.h +201 -0
- package/include/numkong/reduce/haswell.h +3783 -0
- package/include/numkong/reduce/icelake.h +549 -0
- package/include/numkong/reduce/neon.h +3841 -0
- package/include/numkong/reduce/neonbfdot.h +353 -0
- package/include/numkong/reduce/neonfhm.h +665 -0
- package/include/numkong/reduce/neonhalf.h +157 -0
- package/include/numkong/reduce/neonsdot.h +357 -0
- package/include/numkong/reduce/rvv.h +3407 -0
- package/include/numkong/reduce/serial.h +757 -0
- package/include/numkong/reduce/sierra.h +338 -0
- package/include/numkong/reduce/skylake.h +3792 -0
- package/include/numkong/reduce/v128relaxed.h +2302 -0
- package/include/numkong/reduce.h +1597 -0
- package/include/numkong/reduce.hpp +633 -0
- package/include/numkong/scalar/README.md +89 -0
- package/include/numkong/scalar/haswell.h +113 -0
- package/include/numkong/scalar/neon.h +122 -0
- package/include/numkong/scalar/neonhalf.h +70 -0
- package/include/numkong/scalar/rvv.h +211 -0
- package/include/numkong/scalar/sapphire.h +63 -0
- package/include/numkong/scalar/serial.h +332 -0
- package/include/numkong/scalar/v128relaxed.h +56 -0
- package/include/numkong/scalar.h +683 -0
- package/include/numkong/set/README.md +179 -0
- package/include/numkong/set/haswell.h +334 -0
- package/include/numkong/set/icelake.h +485 -0
- package/include/numkong/set/neon.h +364 -0
- package/include/numkong/set/rvv.h +226 -0
- package/include/numkong/set/rvvbb.h +117 -0
- package/include/numkong/set/serial.h +174 -0
- package/include/numkong/set/sve.h +185 -0
- package/include/numkong/set/v128relaxed.h +240 -0
- package/include/numkong/set.h +457 -0
- package/include/numkong/set.hpp +114 -0
- package/include/numkong/sets/README.md +149 -0
- package/include/numkong/sets/haswell.h +63 -0
- package/include/numkong/sets/icelake.h +66 -0
- package/include/numkong/sets/neon.h +61 -0
- package/include/numkong/sets/serial.h +43 -0
- package/include/numkong/sets/smebi32.h +1099 -0
- package/include/numkong/sets/v128relaxed.h +58 -0
- package/include/numkong/sets.h +339 -0
- package/include/numkong/sparse/README.md +156 -0
- package/include/numkong/sparse/icelake.h +463 -0
- package/include/numkong/sparse/neon.h +288 -0
- package/include/numkong/sparse/serial.h +117 -0
- package/include/numkong/sparse/sve2.h +507 -0
- package/include/numkong/sparse/turin.h +322 -0
- package/include/numkong/sparse.h +363 -0
- package/include/numkong/sparse.hpp +113 -0
- package/include/numkong/spatial/README.md +435 -0
- package/include/numkong/spatial/alder.h +607 -0
- package/include/numkong/spatial/genoa.h +290 -0
- package/include/numkong/spatial/haswell.h +960 -0
- package/include/numkong/spatial/icelake.h +586 -0
- package/include/numkong/spatial/neon.h +773 -0
- package/include/numkong/spatial/neonbfdot.h +165 -0
- package/include/numkong/spatial/neonhalf.h +118 -0
- package/include/numkong/spatial/neonsdot.h +261 -0
- package/include/numkong/spatial/rvv.h +984 -0
- package/include/numkong/spatial/rvvbf16.h +123 -0
- package/include/numkong/spatial/rvvhalf.h +117 -0
- package/include/numkong/spatial/sapphire.h +343 -0
- package/include/numkong/spatial/serial.h +346 -0
- package/include/numkong/spatial/sierra.h +323 -0
- package/include/numkong/spatial/skylake.h +606 -0
- package/include/numkong/spatial/sve.h +224 -0
- package/include/numkong/spatial/svebfdot.h +122 -0
- package/include/numkong/spatial/svehalf.h +109 -0
- package/include/numkong/spatial/v128relaxed.h +717 -0
- package/include/numkong/spatial.h +1425 -0
- package/include/numkong/spatial.hpp +183 -0
- package/include/numkong/spatials/README.md +580 -0
- package/include/numkong/spatials/alder.h +94 -0
- package/include/numkong/spatials/genoa.h +94 -0
- package/include/numkong/spatials/haswell.h +219 -0
- package/include/numkong/spatials/icelake.h +113 -0
- package/include/numkong/spatials/neon.h +109 -0
- package/include/numkong/spatials/neonbfdot.h +60 -0
- package/include/numkong/spatials/neonfhm.h +92 -0
- package/include/numkong/spatials/neonhalf.h +58 -0
- package/include/numkong/spatials/neonsdot.h +109 -0
- package/include/numkong/spatials/rvv.h +1960 -0
- package/include/numkong/spatials/sapphireamx.h +1149 -0
- package/include/numkong/spatials/serial.h +226 -0
- package/include/numkong/spatials/sierra.h +96 -0
- package/include/numkong/spatials/skylake.h +184 -0
- package/include/numkong/spatials/sme.h +1901 -0
- package/include/numkong/spatials/smef64.h +465 -0
- package/include/numkong/spatials/v128relaxed.h +240 -0
- package/include/numkong/spatials.h +3021 -0
- package/include/numkong/spatials.hpp +508 -0
- package/include/numkong/tensor.hpp +1592 -0
- package/include/numkong/trigonometry/README.md +184 -0
- package/include/numkong/trigonometry/haswell.h +652 -0
- package/include/numkong/trigonometry/neon.h +639 -0
- package/include/numkong/trigonometry/rvv.h +699 -0
- package/include/numkong/trigonometry/serial.h +703 -0
- package/include/numkong/trigonometry/skylake.h +721 -0
- package/include/numkong/trigonometry/v128relaxed.h +666 -0
- package/include/numkong/trigonometry.h +467 -0
- package/include/numkong/trigonometry.hpp +166 -0
- package/include/numkong/types.h +1384 -0
- package/include/numkong/types.hpp +5603 -0
- package/include/numkong/vector.hpp +698 -0
- package/javascript/README.md +246 -0
- package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
- package/javascript/dist/cjs/numkong-wasm.js +617 -0
- package/javascript/dist/cjs/numkong.d.ts +343 -0
- package/javascript/dist/cjs/numkong.js +523 -0
- package/javascript/dist/cjs/package.json +3 -0
- package/javascript/dist/cjs/types.d.ts +284 -0
- package/javascript/dist/cjs/types.js +653 -0
- package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
- package/javascript/dist/esm/numkong-wasm.js +595 -0
- package/javascript/dist/esm/numkong.d.ts +343 -0
- package/javascript/dist/esm/numkong.js +452 -0
- package/javascript/dist/esm/package.json +3 -0
- package/javascript/dist/esm/types.d.ts +284 -0
- package/javascript/dist/esm/types.js +630 -0
- package/javascript/dist-package-cjs.json +3 -0
- package/javascript/dist-package-esm.json +3 -0
- package/javascript/node-gyp-build.d.ts +1 -0
- package/javascript/numkong-wasm.ts +756 -0
- package/javascript/numkong.c +689 -0
- package/javascript/numkong.ts +575 -0
- package/javascript/tsconfig-base.json +39 -0
- package/javascript/tsconfig-cjs.json +8 -0
- package/javascript/tsconfig-esm.json +8 -0
- package/javascript/types.ts +674 -0
- package/package.json +87 -0
|
@@ -0,0 +1,577 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief SIMD-accelerated Geospatial Distances for Skylake.
|
|
3
|
+
* @file include/numkong/geospatial/skylake.h
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date February 6, 2026
|
|
6
|
+
*
|
|
7
|
+
* @sa include/numkong/geospatial.h
|
|
8
|
+
*
|
|
9
|
+
* @section geospatial_skylake_instructions Key AVX-512 Geospatial Instructions
|
|
10
|
+
*
|
|
11
|
+
* Intrinsic Instruction Ice Genoa
|
|
12
|
+
* _mm512_sqrt_ps VSQRTPS (ZMM, ZMM) 19c @ p05 15c @ p01
|
|
13
|
+
* _mm512_sqrt_pd VSQRTPD (ZMM, ZMM) 23c @ p05 21c @ p01
|
|
14
|
+
* _mm256_div_ps VDIVPS (YMM, YMM, YMM) 11c @ p0 11c @ p01
|
|
15
|
+
* _mm256_div_pd VDIVPD (YMM, YMM, YMM) 13c @ p0 13c @ p01
|
|
16
|
+
* _mm256_fmadd_ps VFMADD231PS (YMM, YMM, YMM) 4c @ p01 4c @ p01
|
|
17
|
+
* _mm256_fmadd_pd VFMADD231PD (YMM, YMM, YMM) 4c @ p01 4c @ p01
|
|
18
|
+
*/
|
|
19
|
+
#ifndef NK_GEOSPATIAL_SKYLAKE_H
|
|
20
|
+
#define NK_GEOSPATIAL_SKYLAKE_H
|
|
21
|
+
|
|
22
|
+
#if NK_TARGET_X86_
|
|
23
|
+
#if NK_TARGET_SKYLAKE
|
|
24
|
+
|
|
25
|
+
#include "numkong/types.h"
|
|
26
|
+
#include "numkong/trigonometry/skylake.h" // `nk_sin_f64x8_skylake_`, `nk_cos_f64x8_skylake_`, `nk_atan2_f64x8_skylake_`, etc.
|
|
27
|
+
|
|
28
|
+
#if defined(__cplusplus)
|
|
29
|
+
extern "C" {
|
|
30
|
+
#endif
|
|
31
|
+
|
|
32
|
+
#if defined(__clang__)
|
|
33
|
+
#pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,avx512bw,avx512dq,f16c,fma,bmi,bmi2"))), \
|
|
34
|
+
apply_to = function)
|
|
35
|
+
#elif defined(__GNUC__)
|
|
36
|
+
#pragma GCC push_options
|
|
37
|
+
#pragma GCC target("avx2", "avx512f", "avx512vl", "avx512bw", "avx512dq", "f16c", "fma", "bmi", "bmi2")
|
|
38
|
+
#endif
|
|
39
|
+
|
|
40
|
+
/**
 * @brief Haversine great-circle distance for 8 f64 point pairs packed in ZMM registers.
 *
 * Evaluates a = sin²(Δlat/2) + cos(lat₁) × cos(lat₂) × sin²(Δlon/2) per lane, clamps it
 * to [0, 1], and returns `NK_EARTH_MEDIATORIAL_RADIUS` × 2 × atan2(√a, √(1 - a)).
 *
 * @param first_latitudes Latitudes of the first points, one per lane.
 * @param first_longitudes Longitudes of the first points, one per lane.
 * @param second_latitudes Latitudes of the second points, one per lane.
 * @param second_longitudes Longitudes of the second points, one per lane.
 * @return Per-lane distances, in the units of `NK_EARTH_MEDIATORIAL_RADIUS`.
 * @note Angles go straight into `nk_sin_f64x8_skylake_` / `nk_cos_f64x8_skylake_`,
 *       so they are presumably radians - confirm against those helpers.
 */
NK_INTERNAL __m512d nk_haversine_f64x8_skylake_( //
    __m512d first_latitudes, __m512d first_longitudes, //
    __m512d second_latitudes, __m512d second_longitudes) {

    __m512d const zero_vec = _mm512_setzero_pd();
    __m512d const half_vec = _mm512_set1_pd(0.5);
    __m512d const one_vec = _mm512_set1_pd(1.0);
    __m512d const two_vec = _mm512_set1_pd(2.0);
    __m512d const radius_vec = _mm512_set1_pd(NK_EARTH_MEDIATORIAL_RADIUS);

    // Half-differences of the coordinates feed the two sin² terms
    __m512d half_dlat = _mm512_mul_pd(_mm512_sub_pd(second_latitudes, first_latitudes), half_vec);
    __m512d half_dlon = _mm512_mul_pd(_mm512_sub_pd(second_longitudes, first_longitudes), half_vec);
    __m512d sin_half_dlat = nk_sin_f64x8_skylake_(half_dlat);
    __m512d sin_half_dlon = nk_sin_f64x8_skylake_(half_dlon);
    __m512d lat_part = _mm512_mul_pd(sin_half_dlat, sin_half_dlat);
    __m512d lon_part = _mm512_mul_pd(sin_half_dlon, sin_half_dlon);

    // The longitudinal part is weighted by the product of both latitude cosines
    __m512d cos_lat_a = nk_cos_f64x8_skylake_(first_latitudes);
    __m512d cos_lat_b = nk_cos_f64x8_skylake_(second_latitudes);
    __m512d cos_weight = _mm512_mul_pd(cos_lat_a, cos_lat_b);

    // a = sin²(Δlat/2) + cos(lat1) × cos(lat2) × sin²(Δlon/2)
    __m512d hav = _mm512_add_pd(lat_part, _mm512_mul_pd(cos_weight, lon_part));

    // Rounding can push `a` slightly outside [0, 1]; clamping keeps both square roots real
    hav = _mm512_max_pd(zero_vec, _mm512_min_pd(one_vec, hav));

    // c = 2 × atan2(√a, √(1-a))
    __m512d root_hav = _mm512_sqrt_pd(hav);
    __m512d root_rest = _mm512_sqrt_pd(_mm512_sub_pd(one_vec, hav));
    __m512d arc = _mm512_mul_pd(two_vec, nk_atan2_f64x8_skylake_(root_hav, root_rest));

    return _mm512_mul_pd(radius_vec, arc);
}
|
|
79
|
+
|
|
80
|
+
NK_PUBLIC void nk_haversine_f64_skylake( //
|
|
81
|
+
nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
|
|
82
|
+
nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
|
|
83
|
+
nk_size_t n, nk_f64_t *results) {
|
|
84
|
+
|
|
85
|
+
while (n >= 8) {
|
|
86
|
+
__m512d first_latitudes = _mm512_loadu_pd(a_lats);
|
|
87
|
+
__m512d first_longitudes = _mm512_loadu_pd(a_lons);
|
|
88
|
+
__m512d second_latitudes = _mm512_loadu_pd(b_lats);
|
|
89
|
+
__m512d second_longitudes = _mm512_loadu_pd(b_lons);
|
|
90
|
+
|
|
91
|
+
__m512d distances = nk_haversine_f64x8_skylake_(first_latitudes, first_longitudes, second_latitudes,
|
|
92
|
+
second_longitudes);
|
|
93
|
+
_mm512_storeu_pd(results, distances);
|
|
94
|
+
|
|
95
|
+
a_lats += 8, a_lons += 8, b_lats += 8, b_lons += 8, results += 8, n -= 8;
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
// Handle remaining elements with masked operations
|
|
99
|
+
if (n > 0) {
|
|
100
|
+
__mmask8 mask = (__mmask8)_bzhi_u32(0xFF, n);
|
|
101
|
+
__m512d first_latitudes = _mm512_maskz_loadu_pd(mask, a_lats);
|
|
102
|
+
__m512d first_longitudes = _mm512_maskz_loadu_pd(mask, a_lons);
|
|
103
|
+
__m512d second_latitudes = _mm512_maskz_loadu_pd(mask, b_lats);
|
|
104
|
+
__m512d second_longitudes = _mm512_maskz_loadu_pd(mask, b_lons);
|
|
105
|
+
|
|
106
|
+
__m512d distances = nk_haversine_f64x8_skylake_(first_latitudes, first_longitudes, second_latitudes,
|
|
107
|
+
second_longitudes);
|
|
108
|
+
_mm512_mask_storeu_pd(results, mask, distances);
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
/**
|
|
113
|
+
* @brief AVX-512 helper for Vincenty's geodesic distance on 8 f64 point pairs.
|
|
114
|
+
* @note This is a true SIMD implementation using masked convergence tracking.
|
|
115
|
+
*/
|
|
116
|
+
NK_INTERNAL __m512d nk_vincenty_f64x8_skylake_( //
|
|
117
|
+
__m512d first_latitudes, __m512d first_longitudes, //
|
|
118
|
+
__m512d second_latitudes, __m512d second_longitudes) {
|
|
119
|
+
|
|
120
|
+
__m512d const equatorial_radius = _mm512_set1_pd(NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
|
|
121
|
+
__m512d const polar_radius = _mm512_set1_pd(NK_EARTH_ELLIPSOID_POLAR_RADIUS);
|
|
122
|
+
__m512d const flattening = _mm512_set1_pd(1.0 / NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
|
|
123
|
+
__m512d const convergence_threshold = _mm512_set1_pd(NK_VINCENTY_CONVERGENCE_THRESHOLD_F64);
|
|
124
|
+
__m512d const one = _mm512_set1_pd(1.0);
|
|
125
|
+
__m512d const two = _mm512_set1_pd(2.0);
|
|
126
|
+
__m512d const three = _mm512_set1_pd(3.0);
|
|
127
|
+
__m512d const four = _mm512_set1_pd(4.0);
|
|
128
|
+
__m512d const six = _mm512_set1_pd(6.0);
|
|
129
|
+
__m512d const sixteen = _mm512_set1_pd(16.0);
|
|
130
|
+
|
|
131
|
+
// Longitude difference
|
|
132
|
+
__m512d longitude_difference = _mm512_sub_pd(second_longitudes, first_longitudes);
|
|
133
|
+
|
|
134
|
+
// Reduced latitudes: tan(U) = (1-f) * tan(lat)
|
|
135
|
+
__m512d one_minus_f = _mm512_sub_pd(one, flattening);
|
|
136
|
+
__m512d tan_first = _mm512_div_pd(nk_sin_f64x8_skylake_(first_latitudes), nk_cos_f64x8_skylake_(first_latitudes));
|
|
137
|
+
__m512d tan_second = _mm512_div_pd(nk_sin_f64x8_skylake_(second_latitudes),
|
|
138
|
+
nk_cos_f64x8_skylake_(second_latitudes));
|
|
139
|
+
__m512d tan_reduced_first = _mm512_mul_pd(one_minus_f, tan_first);
|
|
140
|
+
__m512d tan_reduced_second = _mm512_mul_pd(one_minus_f, tan_second);
|
|
141
|
+
|
|
142
|
+
// cos(U) = 1/√(1 + tan²(U)), sin(U) = tan(U) × cos(U)
|
|
143
|
+
__m512d cos_reduced_first = _mm512_div_pd(
|
|
144
|
+
one, _mm512_sqrt_pd(_mm512_fmadd_pd(tan_reduced_first, tan_reduced_first, one)));
|
|
145
|
+
__m512d sin_reduced_first = _mm512_mul_pd(tan_reduced_first, cos_reduced_first);
|
|
146
|
+
__m512d cos_reduced_second = _mm512_div_pd(
|
|
147
|
+
one, _mm512_sqrt_pd(_mm512_fmadd_pd(tan_reduced_second, tan_reduced_second, one)));
|
|
148
|
+
__m512d sin_reduced_second = _mm512_mul_pd(tan_reduced_second, cos_reduced_second);
|
|
149
|
+
|
|
150
|
+
// Initialize lambda and tracking variables
|
|
151
|
+
__m512d lambda = longitude_difference;
|
|
152
|
+
__m512d sin_angular_distance, cos_angular_distance, angular_distance;
|
|
153
|
+
__m512d sin_azimuth, cos_squared_azimuth, cos_double_angular_midpoint;
|
|
154
|
+
|
|
155
|
+
// Track convergence and coincident points
|
|
156
|
+
__mmask8 converged_mask = 0;
|
|
157
|
+
__mmask8 coincident_mask = 0;
|
|
158
|
+
|
|
159
|
+
for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS && converged_mask != 0xFF; ++iteration) {
|
|
160
|
+
__m512d sin_lambda = nk_sin_f64x8_skylake_(lambda);
|
|
161
|
+
__m512d cos_lambda = nk_cos_f64x8_skylake_(lambda);
|
|
162
|
+
|
|
163
|
+
// sin²(angular_distance) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
|
|
164
|
+
__m512d cross_term = _mm512_mul_pd(cos_reduced_second, sin_lambda);
|
|
165
|
+
__m512d mixed_term = _mm512_sub_pd(
|
|
166
|
+
_mm512_mul_pd(cos_reduced_first, sin_reduced_second),
|
|
167
|
+
_mm512_mul_pd(_mm512_mul_pd(sin_reduced_first, cos_reduced_second), cos_lambda));
|
|
168
|
+
__m512d sin_angular_dist_sq = _mm512_fmadd_pd(cross_term, cross_term, _mm512_mul_pd(mixed_term, mixed_term));
|
|
169
|
+
sin_angular_distance = _mm512_sqrt_pd(sin_angular_dist_sq);
|
|
170
|
+
|
|
171
|
+
// Check for coincident points (sin_angular_distance ≈ 0)
|
|
172
|
+
coincident_mask = _mm512_cmp_pd_mask(sin_angular_distance, _mm512_set1_pd(1e-15), _CMP_LT_OS);
|
|
173
|
+
|
|
174
|
+
// cos(angular_distance) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
|
|
175
|
+
cos_angular_distance = _mm512_fmadd_pd(_mm512_mul_pd(cos_reduced_first, cos_reduced_second), cos_lambda,
|
|
176
|
+
_mm512_mul_pd(sin_reduced_first, sin_reduced_second));
|
|
177
|
+
|
|
178
|
+
// angular_distance = atan2(sin, cos)
|
|
179
|
+
angular_distance = nk_atan2_f64x8_skylake_(sin_angular_distance, cos_angular_distance);
|
|
180
|
+
|
|
181
|
+
// sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance)
|
|
182
|
+
// Use masked divide: zero result for coincident lanes, avoids division by zero
|
|
183
|
+
sin_azimuth = _mm512_maskz_div_pd(
|
|
184
|
+
_knot_mask8(coincident_mask),
|
|
185
|
+
_mm512_mul_pd(_mm512_mul_pd(cos_reduced_first, cos_reduced_second), sin_lambda), sin_angular_distance);
|
|
186
|
+
cos_squared_azimuth = _mm512_sub_pd(one, _mm512_mul_pd(sin_azimuth, sin_azimuth));
|
|
187
|
+
|
|
188
|
+
// Handle equatorial case: cos²α = 0
|
|
189
|
+
__mmask8 equatorial_mask = _mm512_cmp_pd_mask(cos_squared_azimuth, _mm512_set1_pd(1e-15), _CMP_LT_OS);
|
|
190
|
+
|
|
191
|
+
// cos(2σₘ) = cos(σ) - 2 × sin(U₁) × sin(U₂) / cos²(α)
|
|
192
|
+
// Use masked divide: for equatorial lanes, quotient = cos_angular_distance (passthrough),
|
|
193
|
+
// so subtraction yields zero. Avoids division by zero.
|
|
194
|
+
__m512d sin_product = _mm512_mul_pd(sin_reduced_first, sin_reduced_second);
|
|
195
|
+
__m512d quotient = _mm512_mask_div_pd(cos_angular_distance, _knot_mask8(equatorial_mask),
|
|
196
|
+
_mm512_mul_pd(two, sin_product), cos_squared_azimuth);
|
|
197
|
+
cos_double_angular_midpoint = _mm512_sub_pd(cos_angular_distance, quotient);
|
|
198
|
+
|
|
199
|
+
// C = f/16 * cos²α * (4 + f*(4 - 3*cos²α))
|
|
200
|
+
__m512d correction_factor = _mm512_mul_pd(
|
|
201
|
+
_mm512_div_pd(flattening, sixteen),
|
|
202
|
+
_mm512_mul_pd(cos_squared_azimuth,
|
|
203
|
+
_mm512_fmadd_pd(flattening, _mm512_fnmadd_pd(three, cos_squared_azimuth, four), four)));
|
|
204
|
+
|
|
205
|
+
// λ' = L + (1-C) × f × sin(α) × (σ + C × sin(σ) × (cos(2σₘ) + C × cos(σ) × (-1 + 2 × cos²(2σₘ))))
|
|
206
|
+
__m512d cos_2sm_sq = _mm512_mul_pd(cos_double_angular_midpoint, cos_double_angular_midpoint);
|
|
207
|
+
// innermost = -1 + 2 × cos²(2σₘ)
|
|
208
|
+
__m512d innermost = _mm512_fmadd_pd(two, cos_2sm_sq, _mm512_set1_pd(-1.0));
|
|
209
|
+
// middle = cos(2σₘ) + C × cos(σ) × innermost
|
|
210
|
+
__m512d middle = _mm512_fmadd_pd(_mm512_mul_pd(correction_factor, cos_angular_distance), innermost,
|
|
211
|
+
cos_double_angular_midpoint);
|
|
212
|
+
// inner = C × sin(σ) × middle
|
|
213
|
+
__m512d inner = _mm512_mul_pd(_mm512_mul_pd(correction_factor, sin_angular_distance), middle);
|
|
214
|
+
|
|
215
|
+
// λ' = L + (1-C) * f * sin_α * (σ + inner)
|
|
216
|
+
__m512d lambda_new = _mm512_fmadd_pd(
|
|
217
|
+
_mm512_mul_pd(_mm512_mul_pd(_mm512_sub_pd(one, correction_factor), flattening), sin_azimuth),
|
|
218
|
+
_mm512_add_pd(angular_distance, inner), longitude_difference);
|
|
219
|
+
|
|
220
|
+
// Check convergence: |λ - λ'| < threshold
|
|
221
|
+
__m512d lambda_diff = _mm512_abs_pd(_mm512_sub_pd(lambda_new, lambda));
|
|
222
|
+
converged_mask = _mm512_cmp_pd_mask(lambda_diff, convergence_threshold, _CMP_LT_OS);
|
|
223
|
+
|
|
224
|
+
lambda = lambda_new;
|
|
225
|
+
}
|
|
226
|
+
|
|
227
|
+
// Final distance calculation
|
|
228
|
+
// u² = cos²α * (a² - b²) / b²
|
|
229
|
+
__m512d a_sq = _mm512_mul_pd(equatorial_radius, equatorial_radius);
|
|
230
|
+
__m512d b_sq = _mm512_mul_pd(polar_radius, polar_radius);
|
|
231
|
+
__m512d u_squared = _mm512_div_pd(_mm512_mul_pd(cos_squared_azimuth, _mm512_sub_pd(a_sq, b_sq)), b_sq);
|
|
232
|
+
|
|
233
|
+
// A = 1 + u²/16384 * (4096 + u²*(-768 + u²*(320 - 175*u²)))
|
|
234
|
+
__m512d series_a = _mm512_fmadd_pd(u_squared, _mm512_set1_pd(-175.0), _mm512_set1_pd(320.0));
|
|
235
|
+
series_a = _mm512_fmadd_pd(u_squared, series_a, _mm512_set1_pd(-768.0));
|
|
236
|
+
series_a = _mm512_fmadd_pd(u_squared, series_a, _mm512_set1_pd(4096.0));
|
|
237
|
+
series_a = _mm512_fmadd_pd(_mm512_div_pd(u_squared, _mm512_set1_pd(16384.0)), series_a, one);
|
|
238
|
+
|
|
239
|
+
// B = u²/1024 * (256 + u²*(-128 + u²*(74 - 47*u²)))
|
|
240
|
+
__m512d series_b = _mm512_fmadd_pd(u_squared, _mm512_set1_pd(-47.0), _mm512_set1_pd(74.0));
|
|
241
|
+
series_b = _mm512_fmadd_pd(u_squared, series_b, _mm512_set1_pd(-128.0));
|
|
242
|
+
series_b = _mm512_fmadd_pd(u_squared, series_b, _mm512_set1_pd(256.0));
|
|
243
|
+
series_b = _mm512_mul_pd(_mm512_div_pd(u_squared, _mm512_set1_pd(1024.0)), series_b);
|
|
244
|
+
|
|
245
|
+
// Δσ = B × sin(σ) × (cos(2σₘ) +
|
|
246
|
+
// B/4 × (cos(σ) × (-1 + 2 × cos²(2σₘ)) - B/6 × cos(2σₘ) × (-3 + 4 × sin²(σ)) × (-3 + 4 × cos²(2σₘ))))
|
|
247
|
+
__m512d cos_2sm_sq = _mm512_mul_pd(cos_double_angular_midpoint, cos_double_angular_midpoint);
|
|
248
|
+
__m512d sin_sq = _mm512_mul_pd(sin_angular_distance, sin_angular_distance);
|
|
249
|
+
__m512d term1 = _mm512_fmadd_pd(two, cos_2sm_sq, _mm512_set1_pd(-1.0));
|
|
250
|
+
term1 = _mm512_mul_pd(cos_angular_distance, term1);
|
|
251
|
+
__m512d term2 = _mm512_fmadd_pd(four, sin_sq, _mm512_set1_pd(-3.0));
|
|
252
|
+
__m512d term3 = _mm512_fmadd_pd(four, cos_2sm_sq, _mm512_set1_pd(-3.0));
|
|
253
|
+
term2 = _mm512_mul_pd(_mm512_mul_pd(_mm512_div_pd(series_b, six), cos_double_angular_midpoint),
|
|
254
|
+
_mm512_mul_pd(term2, term3));
|
|
255
|
+
__m512d delta_sigma = _mm512_mul_pd(
|
|
256
|
+
series_b, _mm512_mul_pd(sin_angular_distance, _mm512_add_pd(cos_double_angular_midpoint,
|
|
257
|
+
_mm512_mul_pd(_mm512_div_pd(series_b, four),
|
|
258
|
+
_mm512_sub_pd(term1, term2)))));
|
|
259
|
+
|
|
260
|
+
// s = b * A * (σ - Δσ)
|
|
261
|
+
__m512d distances = _mm512_mul_pd(_mm512_mul_pd(polar_radius, series_a),
|
|
262
|
+
_mm512_sub_pd(angular_distance, delta_sigma));
|
|
263
|
+
|
|
264
|
+
// Set coincident points to zero
|
|
265
|
+
distances = _mm512_mask_blend_pd(coincident_mask, distances, _mm512_setzero_pd());
|
|
266
|
+
|
|
267
|
+
return distances;
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
NK_PUBLIC void nk_vincenty_f64_skylake( //
|
|
271
|
+
nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
|
|
272
|
+
nk_f64_t const *b_lats, nk_f64_t const *b_lons, //
|
|
273
|
+
nk_size_t n, nk_f64_t *results) {
|
|
274
|
+
|
|
275
|
+
while (n >= 8) {
|
|
276
|
+
__m512d first_latitudes = _mm512_loadu_pd(a_lats);
|
|
277
|
+
__m512d first_longitudes = _mm512_loadu_pd(a_lons);
|
|
278
|
+
__m512d second_latitudes = _mm512_loadu_pd(b_lats);
|
|
279
|
+
__m512d second_longitudes = _mm512_loadu_pd(b_lons);
|
|
280
|
+
|
|
281
|
+
__m512d distances = nk_vincenty_f64x8_skylake_(first_latitudes, first_longitudes, second_latitudes,
|
|
282
|
+
second_longitudes);
|
|
283
|
+
_mm512_storeu_pd(results, distances);
|
|
284
|
+
|
|
285
|
+
a_lats += 8, a_lons += 8, b_lats += 8, b_lons += 8, results += 8, n -= 8;
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
// Handle remaining elements with masked operations
|
|
289
|
+
if (n > 0) {
|
|
290
|
+
__mmask8 mask = (__mmask8)_bzhi_u32(0xFF, n);
|
|
291
|
+
__m512d first_latitudes = _mm512_maskz_loadu_pd(mask, a_lats);
|
|
292
|
+
__m512d first_longitudes = _mm512_maskz_loadu_pd(mask, a_lons);
|
|
293
|
+
__m512d second_latitudes = _mm512_maskz_loadu_pd(mask, b_lats);
|
|
294
|
+
__m512d second_longitudes = _mm512_maskz_loadu_pd(mask, b_lons);
|
|
295
|
+
|
|
296
|
+
__m512d distances = nk_vincenty_f64x8_skylake_(first_latitudes, first_longitudes, second_latitudes,
|
|
297
|
+
second_longitudes);
|
|
298
|
+
_mm512_mask_storeu_pd(results, mask, distances);
|
|
299
|
+
}
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
/**
 * @brief Haversine great-circle distance for 16 f32 point pairs packed in ZMM registers.
 *
 * Evaluates a = sin²(Δlat/2) + cos(lat₁) × cos(lat₂) × sin²(Δlon/2) per lane, clamps it
 * to [0, 1], and returns `(float)NK_EARTH_MEDIATORIAL_RADIUS` × 2 × atan2(√a, √(1 - a)).
 *
 * @param first_latitudes Latitudes of the first points, one per lane.
 * @param first_longitudes Longitudes of the first points, one per lane.
 * @param second_latitudes Latitudes of the second points, one per lane.
 * @param second_longitudes Longitudes of the second points, one per lane.
 * @return Per-lane distances, in the units of `NK_EARTH_MEDIATORIAL_RADIUS`.
 * @note Angles go straight into `nk_sin_f32x16_skylake_` / `nk_cos_f32x16_skylake_`,
 *       so they are presumably radians - confirm against those helpers.
 */
NK_INTERNAL __m512 nk_haversine_f32x16_skylake_( //
    __m512 first_latitudes, __m512 first_longitudes, //
    __m512 second_latitudes, __m512 second_longitudes) {

    __m512 const zero_vec = _mm512_setzero_ps();
    __m512 const half_vec = _mm512_set1_ps(0.5f);
    __m512 const one_vec = _mm512_set1_ps(1.0f);
    __m512 const two_vec = _mm512_set1_ps(2.0f);
    __m512 const radius_vec = _mm512_set1_ps((float)NK_EARTH_MEDIATORIAL_RADIUS);

    // Half-differences of the coordinates feed the two sin² terms
    __m512 half_dlat = _mm512_mul_ps(_mm512_sub_ps(second_latitudes, first_latitudes), half_vec);
    __m512 half_dlon = _mm512_mul_ps(_mm512_sub_ps(second_longitudes, first_longitudes), half_vec);
    __m512 sin_half_dlat = nk_sin_f32x16_skylake_(half_dlat);
    __m512 sin_half_dlon = nk_sin_f32x16_skylake_(half_dlon);
    __m512 lat_part = _mm512_mul_ps(sin_half_dlat, sin_half_dlat);
    __m512 lon_part = _mm512_mul_ps(sin_half_dlon, sin_half_dlon);

    // The longitudinal part is weighted by the product of both latitude cosines
    __m512 cos_lat_a = nk_cos_f32x16_skylake_(first_latitudes);
    __m512 cos_lat_b = nk_cos_f32x16_skylake_(second_latitudes);
    __m512 cos_weight = _mm512_mul_ps(cos_lat_a, cos_lat_b);

    // a = sin²(Δlat/2) + cos(lat1) × cos(lat2) × sin²(Δlon/2)
    __m512 hav = _mm512_add_ps(lat_part, _mm512_mul_ps(cos_weight, lon_part));

    // Rounding can push `a` slightly outside [0, 1]; clamping keeps both square roots real
    hav = _mm512_max_ps(zero_vec, _mm512_min_ps(one_vec, hav));

    // c = 2 × atan2(√a, √(1-a))
    __m512 root_hav = _mm512_sqrt_ps(hav);
    __m512 root_rest = _mm512_sqrt_ps(_mm512_sub_ps(one_vec, hav));
    __m512 arc = _mm512_mul_ps(two_vec, nk_atan2_f32x16_skylake_(root_hav, root_rest));

    return _mm512_mul_ps(radius_vec, arc);
}
|
|
342
|
+
|
|
343
|
+
NK_PUBLIC void nk_haversine_f32_skylake( //
|
|
344
|
+
nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
|
|
345
|
+
nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
|
|
346
|
+
nk_size_t n, nk_f32_t *results) {
|
|
347
|
+
|
|
348
|
+
while (n >= 16) {
|
|
349
|
+
__m512 first_latitudes = _mm512_loadu_ps(a_lats);
|
|
350
|
+
__m512 first_longitudes = _mm512_loadu_ps(a_lons);
|
|
351
|
+
__m512 second_latitudes = _mm512_loadu_ps(b_lats);
|
|
352
|
+
__m512 second_longitudes = _mm512_loadu_ps(b_lons);
|
|
353
|
+
|
|
354
|
+
__m512 distances = nk_haversine_f32x16_skylake_(first_latitudes, first_longitudes, second_latitudes,
|
|
355
|
+
second_longitudes);
|
|
356
|
+
_mm512_storeu_ps(results, distances);
|
|
357
|
+
|
|
358
|
+
a_lats += 16, a_lons += 16, b_lats += 16, b_lons += 16, results += 16, n -= 16;
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
// Handle remaining elements with masked operations
|
|
362
|
+
if (n > 0) {
|
|
363
|
+
__mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, n);
|
|
364
|
+
__m512 first_latitudes = _mm512_maskz_loadu_ps(mask, a_lats);
|
|
365
|
+
__m512 first_longitudes = _mm512_maskz_loadu_ps(mask, a_lons);
|
|
366
|
+
__m512 second_latitudes = _mm512_maskz_loadu_ps(mask, b_lats);
|
|
367
|
+
__m512 second_longitudes = _mm512_maskz_loadu_ps(mask, b_lons);
|
|
368
|
+
|
|
369
|
+
__m512 distances = nk_haversine_f32x16_skylake_(first_latitudes, first_longitudes, second_latitudes,
|
|
370
|
+
second_longitudes);
|
|
371
|
+
_mm512_mask_storeu_ps(results, mask, distances);
|
|
372
|
+
}
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
/**
|
|
376
|
+
* @brief AVX-512 helper for Vincenty's geodesic distance on 16 f32 point pairs.
|
|
377
|
+
* @note This is a true SIMD implementation using masked convergence tracking.
|
|
378
|
+
*/
|
|
379
|
+
NK_INTERNAL __m512 nk_vincenty_f32x16_skylake_( //
|
|
380
|
+
__m512 first_latitudes, __m512 first_longitudes, //
|
|
381
|
+
__m512 second_latitudes, __m512 second_longitudes) {
|
|
382
|
+
|
|
383
|
+
__m512 const equatorial_radius = _mm512_set1_ps((float)NK_EARTH_ELLIPSOID_EQUATORIAL_RADIUS);
|
|
384
|
+
__m512 const polar_radius = _mm512_set1_ps((float)NK_EARTH_ELLIPSOID_POLAR_RADIUS);
|
|
385
|
+
__m512 const flattening = _mm512_set1_ps(1.0f / (float)NK_EARTH_ELLIPSOID_INVERSE_FLATTENING);
|
|
386
|
+
__m512 const convergence_threshold = _mm512_set1_ps(NK_VINCENTY_CONVERGENCE_THRESHOLD_F32);
|
|
387
|
+
__m512 const one = _mm512_set1_ps(1.0f);
|
|
388
|
+
__m512 const two = _mm512_set1_ps(2.0f);
|
|
389
|
+
__m512 const three = _mm512_set1_ps(3.0f);
|
|
390
|
+
__m512 const four = _mm512_set1_ps(4.0f);
|
|
391
|
+
__m512 const six = _mm512_set1_ps(6.0f);
|
|
392
|
+
__m512 const sixteen = _mm512_set1_ps(16.0f);
|
|
393
|
+
|
|
394
|
+
// Longitude difference
|
|
395
|
+
__m512 longitude_difference = _mm512_sub_ps(second_longitudes, first_longitudes);
|
|
396
|
+
|
|
397
|
+
// Reduced latitudes: tan(U) = (1-f) * tan(lat)
|
|
398
|
+
__m512 one_minus_f = _mm512_sub_ps(one, flattening);
|
|
399
|
+
__m512 tan_first = _mm512_div_ps(nk_sin_f32x16_skylake_(first_latitudes), nk_cos_f32x16_skylake_(first_latitudes));
|
|
400
|
+
__m512 tan_second = _mm512_div_ps(nk_sin_f32x16_skylake_(second_latitudes),
|
|
401
|
+
nk_cos_f32x16_skylake_(second_latitudes));
|
|
402
|
+
__m512 tan_reduced_first = _mm512_mul_ps(one_minus_f, tan_first);
|
|
403
|
+
__m512 tan_reduced_second = _mm512_mul_ps(one_minus_f, tan_second);
|
|
404
|
+
|
|
405
|
+
// cos(U) = 1/√(1 + tan²(U)), sin(U) = tan(U) × cos(U)
|
|
406
|
+
__m512 cos_reduced_first = _mm512_div_ps(
|
|
407
|
+
one, _mm512_sqrt_ps(_mm512_fmadd_ps(tan_reduced_first, tan_reduced_first, one)));
|
|
408
|
+
__m512 sin_reduced_first = _mm512_mul_ps(tan_reduced_first, cos_reduced_first);
|
|
409
|
+
__m512 cos_reduced_second = _mm512_div_ps(
|
|
410
|
+
one, _mm512_sqrt_ps(_mm512_fmadd_ps(tan_reduced_second, tan_reduced_second, one)));
|
|
411
|
+
__m512 sin_reduced_second = _mm512_mul_ps(tan_reduced_second, cos_reduced_second);
|
|
412
|
+
|
|
413
|
+
// Initialize lambda and tracking variables
|
|
414
|
+
__m512 lambda = longitude_difference;
|
|
415
|
+
__m512 sin_angular_distance, cos_angular_distance, angular_distance;
|
|
416
|
+
__m512 sin_azimuth, cos_squared_azimuth, cos_double_angular_midpoint;
|
|
417
|
+
|
|
418
|
+
// Track convergence and coincident points
|
|
419
|
+
__mmask16 converged_mask = 0;
|
|
420
|
+
__mmask16 coincident_mask = 0;
|
|
421
|
+
|
|
422
|
+
for (nk_u32_t iteration = 0; iteration < NK_VINCENTY_MAX_ITERATIONS && converged_mask != 0xFFFF; ++iteration) {
|
|
423
|
+
__m512 sin_lambda = nk_sin_f32x16_skylake_(lambda);
|
|
424
|
+
__m512 cos_lambda = nk_cos_f32x16_skylake_(lambda);
|
|
425
|
+
|
|
426
|
+
// sin²(angular_distance) = (cos(U₂) × sin(λ))² + (cos(U₁) × sin(U₂) - sin(U₁) × cos(U₂) × cos(λ))²
|
|
427
|
+
__m512 cross_term = _mm512_mul_ps(cos_reduced_second, sin_lambda);
|
|
428
|
+
__m512 mixed_term = _mm512_sub_ps(
|
|
429
|
+
_mm512_mul_ps(cos_reduced_first, sin_reduced_second),
|
|
430
|
+
_mm512_mul_ps(_mm512_mul_ps(sin_reduced_first, cos_reduced_second), cos_lambda));
|
|
431
|
+
__m512 sin_angular_dist_sq = _mm512_fmadd_ps(cross_term, cross_term, _mm512_mul_ps(mixed_term, mixed_term));
|
|
432
|
+
sin_angular_distance = _mm512_sqrt_ps(sin_angular_dist_sq);
|
|
433
|
+
|
|
434
|
+
// Check for coincident points (sin_angular_distance ≈ 0)
|
|
435
|
+
coincident_mask = _mm512_cmp_ps_mask(sin_angular_distance, _mm512_set1_ps(1e-7f), _CMP_LT_OS);
|
|
436
|
+
|
|
437
|
+
// cos(angular_distance) = sin(U₁) × sin(U₂) + cos(U₁) × cos(U₂) × cos(λ)
|
|
438
|
+
cos_angular_distance = _mm512_fmadd_ps(_mm512_mul_ps(cos_reduced_first, cos_reduced_second), cos_lambda,
|
|
439
|
+
_mm512_mul_ps(sin_reduced_first, sin_reduced_second));
|
|
440
|
+
|
|
441
|
+
// angular_distance = atan2(sin, cos)
|
|
442
|
+
angular_distance = nk_atan2_f32x16_skylake_(sin_angular_distance, cos_angular_distance);
|
|
443
|
+
|
|
444
|
+
// sin(azimuth) = cos(U₁) × cos(U₂) × sin(λ) / sin(angular_distance)
|
|
445
|
+
// Use masked divide: zero result for coincident lanes, avoids division by zero
|
|
446
|
+
sin_azimuth = _mm512_maskz_div_ps(
|
|
447
|
+
_knot_mask16(coincident_mask),
|
|
448
|
+
_mm512_mul_ps(_mm512_mul_ps(cos_reduced_first, cos_reduced_second), sin_lambda), sin_angular_distance);
|
|
449
|
+
cos_squared_azimuth = _mm512_sub_ps(one, _mm512_mul_ps(sin_azimuth, sin_azimuth));
|
|
450
|
+
|
|
451
|
+
// Handle equatorial case: cos²α = 0
|
|
452
|
+
__mmask16 equatorial_mask = _mm512_cmp_ps_mask(cos_squared_azimuth, _mm512_set1_ps(1e-7f), _CMP_LT_OS);
|
|
453
|
+
|
|
454
|
+
// cos(2σₘ) = cos(σ) - 2 × sin(U₁) × sin(U₂) / cos²(α)
|
|
455
|
+
// Use masked divide: for equatorial lanes, quotient = cos_angular_distance (passthrough),
|
|
456
|
+
// so subtraction yields zero. Avoids division by zero.
|
|
457
|
+
__m512 sin_product = _mm512_mul_ps(sin_reduced_first, sin_reduced_second);
|
|
458
|
+
__m512 quotient = _mm512_mask_div_ps(cos_angular_distance, _knot_mask16(equatorial_mask),
|
|
459
|
+
_mm512_mul_ps(two, sin_product), cos_squared_azimuth);
|
|
460
|
+
cos_double_angular_midpoint = _mm512_sub_ps(cos_angular_distance, quotient);
|
|
461
|
+
|
|
462
|
+
// C = f/16 * cos²α * (4 + f*(4 - 3*cos²α))
|
|
463
|
+
__m512 correction_factor = _mm512_mul_ps(
|
|
464
|
+
_mm512_div_ps(flattening, sixteen),
|
|
465
|
+
_mm512_mul_ps(cos_squared_azimuth,
|
|
466
|
+
_mm512_fmadd_ps(flattening, _mm512_fnmadd_ps(three, cos_squared_azimuth, four), four)));
|
|
467
|
+
|
|
468
|
+
// λ' = L + (1-C) × f × sin(α) × (σ + C × sin(σ) × (cos(2σₘ) + C × cos(σ) × (-1 + 2 × cos²(2σₘ))))
|
|
469
|
+
__m512 cos_2sm_sq = _mm512_mul_ps(cos_double_angular_midpoint, cos_double_angular_midpoint);
|
|
470
|
+
// innermost = -1 + 2 × cos²(2σₘ)
|
|
471
|
+
__m512 innermost = _mm512_fmadd_ps(two, cos_2sm_sq, _mm512_set1_ps(-1.0f));
|
|
472
|
+
// middle = cos(2σₘ) + C × cos(σ) × innermost
|
|
473
|
+
__m512 middle = _mm512_fmadd_ps(_mm512_mul_ps(correction_factor, cos_angular_distance), innermost,
|
|
474
|
+
cos_double_angular_midpoint);
|
|
475
|
+
// inner = C × sin(σ) × middle
|
|
476
|
+
__m512 inner = _mm512_mul_ps(_mm512_mul_ps(correction_factor, sin_angular_distance), middle);
|
|
477
|
+
|
|
478
|
+
// λ' = L + (1-C) * f * sin_α * (σ + inner)
|
|
479
|
+
__m512 lambda_new = _mm512_fmadd_ps(
|
|
480
|
+
_mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(one, correction_factor), flattening), sin_azimuth),
|
|
481
|
+
_mm512_add_ps(angular_distance, inner), longitude_difference);
|
|
482
|
+
|
|
483
|
+
// Check convergence: |λ - λ'| < threshold
|
|
484
|
+
__m512 lambda_diff = _mm512_abs_ps(_mm512_sub_ps(lambda_new, lambda));
|
|
485
|
+
converged_mask = _mm512_cmp_ps_mask(lambda_diff, convergence_threshold, _CMP_LT_OS);
|
|
486
|
+
|
|
487
|
+
lambda = lambda_new;
|
|
488
|
+
}
|
|
489
|
+
|
|
490
|
+
// Final distance calculation
|
|
491
|
+
// u² = cos²α * (a² - b²) / b²
|
|
492
|
+
__m512 a_sq = _mm512_mul_ps(equatorial_radius, equatorial_radius);
|
|
493
|
+
__m512 b_sq = _mm512_mul_ps(polar_radius, polar_radius);
|
|
494
|
+
__m512 u_squared = _mm512_div_ps(_mm512_mul_ps(cos_squared_azimuth, _mm512_sub_ps(a_sq, b_sq)), b_sq);
|
|
495
|
+
|
|
496
|
+
// A = 1 + u²/16384 * (4096 + u²*(-768 + u²*(320 - 175*u²)))
|
|
497
|
+
__m512 series_a = _mm512_fmadd_ps(u_squared, _mm512_set1_ps(-175.0f), _mm512_set1_ps(320.0f));
|
|
498
|
+
series_a = _mm512_fmadd_ps(u_squared, series_a, _mm512_set1_ps(-768.0f));
|
|
499
|
+
series_a = _mm512_fmadd_ps(u_squared, series_a, _mm512_set1_ps(4096.0f));
|
|
500
|
+
series_a = _mm512_fmadd_ps(_mm512_div_ps(u_squared, _mm512_set1_ps(16384.0f)), series_a, one);
|
|
501
|
+
|
|
502
|
+
// B = u²/1024 * (256 + u²*(-128 + u²*(74 - 47*u²)))
|
|
503
|
+
__m512 series_b = _mm512_fmadd_ps(u_squared, _mm512_set1_ps(-47.0f), _mm512_set1_ps(74.0f));
|
|
504
|
+
series_b = _mm512_fmadd_ps(u_squared, series_b, _mm512_set1_ps(-128.0f));
|
|
505
|
+
series_b = _mm512_fmadd_ps(u_squared, series_b, _mm512_set1_ps(256.0f));
|
|
506
|
+
series_b = _mm512_mul_ps(_mm512_div_ps(u_squared, _mm512_set1_ps(1024.0f)), series_b);
|
|
507
|
+
|
|
508
|
+
// Δσ = B × sin(σ) × (cos(2σₘ) +
|
|
509
|
+
// B/4 × (cos(σ) × (-1 + 2 × cos²(2σₘ)) - B/6 × cos(2σₘ) × (-3 + 4 × sin²(σ)) × (-3 + 4 × cos²(2σₘ))))
|
|
510
|
+
__m512 cos_2sm_sq = _mm512_mul_ps(cos_double_angular_midpoint, cos_double_angular_midpoint);
|
|
511
|
+
__m512 sin_sq = _mm512_mul_ps(sin_angular_distance, sin_angular_distance);
|
|
512
|
+
__m512 term1 = _mm512_fmadd_ps(two, cos_2sm_sq, _mm512_set1_ps(-1.0f));
|
|
513
|
+
term1 = _mm512_mul_ps(cos_angular_distance, term1);
|
|
514
|
+
__m512 term2 = _mm512_fmadd_ps(four, sin_sq, _mm512_set1_ps(-3.0f));
|
|
515
|
+
__m512 term3 = _mm512_fmadd_ps(four, cos_2sm_sq, _mm512_set1_ps(-3.0f));
|
|
516
|
+
term2 = _mm512_mul_ps(_mm512_mul_ps(_mm512_div_ps(series_b, six), cos_double_angular_midpoint),
|
|
517
|
+
_mm512_mul_ps(term2, term3));
|
|
518
|
+
__m512 delta_sigma = _mm512_mul_ps(
|
|
519
|
+
series_b, _mm512_mul_ps(sin_angular_distance, _mm512_add_ps(cos_double_angular_midpoint,
|
|
520
|
+
_mm512_mul_ps(_mm512_div_ps(series_b, four),
|
|
521
|
+
_mm512_sub_ps(term1, term2)))));
|
|
522
|
+
|
|
523
|
+
// s = b * A * (σ - Δσ)
|
|
524
|
+
__m512 distances = _mm512_mul_ps(_mm512_mul_ps(polar_radius, series_a),
|
|
525
|
+
_mm512_sub_ps(angular_distance, delta_sigma));
|
|
526
|
+
|
|
527
|
+
// Set coincident points to zero
|
|
528
|
+
distances = _mm512_mask_blend_ps(coincident_mask, distances, _mm512_setzero_ps());
|
|
529
|
+
|
|
530
|
+
return distances;
|
|
531
|
+
}
|
|
532
|
+
|
|
533
|
+
NK_PUBLIC void nk_vincenty_f32_skylake( //
|
|
534
|
+
nk_f32_t const *a_lats, nk_f32_t const *a_lons, //
|
|
535
|
+
nk_f32_t const *b_lats, nk_f32_t const *b_lons, //
|
|
536
|
+
nk_size_t n, nk_f32_t *results) {
|
|
537
|
+
|
|
538
|
+
while (n >= 16) {
|
|
539
|
+
__m512 first_latitudes = _mm512_loadu_ps(a_lats);
|
|
540
|
+
__m512 first_longitudes = _mm512_loadu_ps(a_lons);
|
|
541
|
+
__m512 second_latitudes = _mm512_loadu_ps(b_lats);
|
|
542
|
+
__m512 second_longitudes = _mm512_loadu_ps(b_lons);
|
|
543
|
+
|
|
544
|
+
__m512 distances = nk_vincenty_f32x16_skylake_(first_latitudes, first_longitudes, second_latitudes,
|
|
545
|
+
second_longitudes);
|
|
546
|
+
_mm512_storeu_ps(results, distances);
|
|
547
|
+
|
|
548
|
+
a_lats += 16, a_lons += 16, b_lats += 16, b_lons += 16, results += 16, n -= 16;
|
|
549
|
+
}
|
|
550
|
+
|
|
551
|
+
// Handle remaining elements with masked operations
|
|
552
|
+
if (n > 0) {
|
|
553
|
+
__mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, n);
|
|
554
|
+
__m512 first_latitudes = _mm512_maskz_loadu_ps(mask, a_lats);
|
|
555
|
+
__m512 first_longitudes = _mm512_maskz_loadu_ps(mask, a_lons);
|
|
556
|
+
__m512 second_latitudes = _mm512_maskz_loadu_ps(mask, b_lats);
|
|
557
|
+
__m512 second_longitudes = _mm512_maskz_loadu_ps(mask, b_lons);
|
|
558
|
+
|
|
559
|
+
__m512 distances = nk_vincenty_f32x16_skylake_(first_latitudes, first_longitudes, second_latitudes,
|
|
560
|
+
second_longitudes);
|
|
561
|
+
_mm512_mask_storeu_ps(results, mask, distances);
|
|
562
|
+
}
|
|
563
|
+
}
|
|
564
|
+
|
|
565
|
+
#if defined(__clang__)
|
|
566
|
+
#pragma clang attribute pop
|
|
567
|
+
#elif defined(__GNUC__)
|
|
568
|
+
#pragma GCC pop_options
|
|
569
|
+
#endif
|
|
570
|
+
|
|
571
|
+
#if defined(__cplusplus)
|
|
572
|
+
} // extern "C"
|
|
573
|
+
#endif
|
|
574
|
+
|
|
575
|
+
#endif // NK_TARGET_SKYLAKE
|
|
576
|
+
#endif // NK_TARGET_X86_
|
|
577
|
+
#endif // NK_GEOSPATIAL_SKYLAKE_H
|