numkong 7.0.0 → 7.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +239 -122
- package/binding.gyp +25 -491
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
|
@@ -9,12 +9,12 @@
|
|
|
9
9
|
*
|
|
10
10
|
* @section skylake_trig_instructions Key AVX-512 Trigonometry Instructions
|
|
11
11
|
*
|
|
12
|
-
* Intrinsic
|
|
13
|
-
* _mm512_fmadd_ps
|
|
14
|
-
* _mm512_mul_ps
|
|
15
|
-
* _mm512_and_ps
|
|
16
|
-
* _mm512_cmp_ps_mask
|
|
17
|
-
* _mm512_roundscale_ps
|
|
12
|
+
* Intrinsic Instruction Skylake-X Genoa
|
|
13
|
+
* _mm512_fmadd_ps VFMADD132PS (ZMM, ZMM, ZMM) 4cy @ p05 4cy @ p01
|
|
14
|
+
* _mm512_mul_ps VMULPS (ZMM, ZMM, ZMM) 4cy @ p05 3cy @ p01
|
|
15
|
+
* _mm512_and_ps VANDPS (ZMM, ZMM, ZMM) 1cy @ p05 1cy @ p0123
|
|
16
|
+
* _mm512_cmp_ps_mask VCMPPS (K, ZMM, ZMM, I8) 4cy @ p5 5cy @ p01
|
|
17
|
+
* _mm512_roundscale_ps VRNDSCALEPS (ZMM, ZMM, I8) 8cy @ p05+p05 3cy @ p23
|
|
18
18
|
*
|
|
19
19
|
* Trigonometric functions use polynomial approximations evaluated via Horner's method with FMA chains.
|
|
20
20
|
* AVX-512 mask registers enable branchless range reduction and sign handling without blend overhead.
|
|
@@ -42,394 +42,398 @@ extern "C" {
|
|
|
42
42
|
|
|
43
43
|
NK_INTERNAL __m512 nk_sin_f32x16_skylake_(__m512 const angles_radians) {
|
|
44
44
|
// Cody-Waite constants for argument reduction
|
|
45
|
-
__m512 const
|
|
46
|
-
__m512 const
|
|
47
|
-
__m512 const
|
|
45
|
+
__m512 const pi_high_f32x16 = _mm512_set1_ps(3.1415927f);
|
|
46
|
+
__m512 const pi_low_f32x16 = _mm512_set1_ps(-8.742278e-8f);
|
|
47
|
+
__m512 const pi_reciprocal_f32x16 = _mm512_set1_ps(0.31830988618379067154f); // 1/π
|
|
48
48
|
// Degree-9 minimax coefficients
|
|
49
|
-
__m512 const
|
|
50
|
-
__m512 const
|
|
51
|
-
__m512 const
|
|
52
|
-
__m512 const
|
|
53
|
-
|
|
54
|
-
// Compute (
|
|
55
|
-
__m512
|
|
56
|
-
__m512
|
|
49
|
+
__m512 const coeff_9_f32x16 = _mm512_set1_ps(+2.7557319224e-6f);
|
|
50
|
+
__m512 const coeff_7_f32x16 = _mm512_set1_ps(-1.9841269841e-4f);
|
|
51
|
+
__m512 const coeff_5_f32x16 = _mm512_set1_ps(+8.3333293855e-3f);
|
|
52
|
+
__m512 const coeff_3_f32x16 = _mm512_set1_ps(-1.6666666641e-1f);
|
|
53
|
+
|
|
54
|
+
// Compute (multiples_of_pi_i32x16) = round(angle / π)
|
|
55
|
+
__m512 quotients_f32x16 = _mm512_mul_ps(angles_radians, pi_reciprocal_f32x16);
|
|
56
|
+
__m512 rounded_quotients_f32x16 = _mm512_roundscale_ps(quotients_f32x16,
|
|
57
|
+
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
|
|
57
58
|
// Use explicit rounding to match roundscale (MXCSR-independent)
|
|
58
|
-
__m512i
|
|
59
|
-
|
|
59
|
+
__m512i multiples_of_pi_i32x16 = _mm512_cvt_roundps_epi32(rounded_quotients_f32x16,
|
|
60
|
+
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
|
|
60
61
|
|
|
61
62
|
// Cody-Waite range reduction
|
|
62
|
-
__m512
|
|
63
|
-
|
|
64
|
-
__m512 const
|
|
65
|
-
__m512 const
|
|
63
|
+
__m512 angles_f32x16 = _mm512_fnmadd_ps(rounded_quotients_f32x16, pi_high_f32x16, angles_radians);
|
|
64
|
+
angles_f32x16 = _mm512_fnmadd_ps(rounded_quotients_f32x16, pi_low_f32x16, angles_f32x16);
|
|
65
|
+
__m512 const angles_squared_f32x16 = _mm512_mul_ps(angles_f32x16, angles_f32x16);
|
|
66
|
+
__m512 const angles_cubed_f32x16 = _mm512_mul_ps(angles_f32x16, angles_squared_f32x16);
|
|
66
67
|
|
|
67
68
|
// Degree-9 polynomial via Horner's method
|
|
68
|
-
__m512
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
// If
|
|
74
|
-
__mmask16 odd_mask = _mm512_test_epi32_mask(
|
|
75
|
-
__m512
|
|
76
|
-
|
|
77
|
-
return
|
|
69
|
+
__m512 polynomials_f32x16 = coeff_9_f32x16;
|
|
70
|
+
polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, angles_squared_f32x16, coeff_7_f32x16);
|
|
71
|
+
polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, angles_squared_f32x16, coeff_5_f32x16);
|
|
72
|
+
polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, angles_squared_f32x16, coeff_3_f32x16);
|
|
73
|
+
|
|
74
|
+
// If multiples_of_pi_i32x16 is odd, flip the sign of the results_f32x16
|
|
75
|
+
__mmask16 odd_mask = _mm512_test_epi32_mask(multiples_of_pi_i32x16, _mm512_set1_epi32(1));
|
|
76
|
+
__m512 results_f32x16 = _mm512_fmadd_ps(angles_cubed_f32x16, polynomials_f32x16, angles_f32x16);
|
|
77
|
+
results_f32x16 = _mm512_mask_sub_ps(results_f32x16, odd_mask, _mm512_setzero_ps(), results_f32x16);
|
|
78
|
+
return results_f32x16;
|
|
78
79
|
}
|
|
79
80
|
|
|
80
81
|
NK_INTERNAL __m512 nk_cos_f32x16_skylake_(__m512 const angles_radians) {
|
|
81
82
|
// Cody-Waite constants for argument reduction
|
|
82
|
-
__m512 const
|
|
83
|
-
__m512 const
|
|
84
|
-
__m512 const
|
|
85
|
-
__m512 const
|
|
83
|
+
__m512 const pi_high_f32x16 = _mm512_set1_ps(3.1415927f);
|
|
84
|
+
__m512 const pi_low_f32x16 = _mm512_set1_ps(-8.742278e-8f);
|
|
85
|
+
__m512 const pi_half_f32x16 = _mm512_set1_ps(1.57079632679489661923f); // π/2
|
|
86
|
+
__m512 const pi_reciprocal_f32x16 = _mm512_set1_ps(0.31830988618379067154f); // 1/π
|
|
86
87
|
// Degree-9 minimax coefficients
|
|
87
|
-
__m512 const
|
|
88
|
-
__m512 const
|
|
89
|
-
__m512 const
|
|
90
|
-
__m512 const
|
|
91
|
-
|
|
92
|
-
// Compute (
|
|
93
|
-
__m512
|
|
94
|
-
__m512
|
|
88
|
+
__m512 const coeff_9_f32x16 = _mm512_set1_ps(+2.7557319224e-6f);
|
|
89
|
+
__m512 const coeff_7_f32x16 = _mm512_set1_ps(-1.9841269841e-4f);
|
|
90
|
+
__m512 const coeff_5_f32x16 = _mm512_set1_ps(+8.3333293855e-3f);
|
|
91
|
+
__m512 const coeff_3_f32x16 = _mm512_set1_ps(-1.6666666641e-1f);
|
|
92
|
+
|
|
93
|
+
// Compute (multiples_of_pi_i32x16) = round((angle / π) - 0.5)
|
|
94
|
+
__m512 quotients_f32x16 = _mm512_fmsub_ps(angles_radians, pi_reciprocal_f32x16, _mm512_set1_ps(0.5f));
|
|
95
|
+
__m512 rounded_quotients_f32x16 = _mm512_roundscale_ps(quotients_f32x16,
|
|
96
|
+
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
|
|
95
97
|
// Use explicit rounding to match roundscale (MXCSR-independent)
|
|
96
|
-
__m512i
|
|
97
|
-
|
|
98
|
+
__m512i multiples_of_pi_i32x16 = _mm512_cvt_roundps_epi32(rounded_quotients_f32x16,
|
|
99
|
+
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
|
|
98
100
|
|
|
99
101
|
// Cody-Waite range reduction: angle = angle_radians - (multiples * pi + pi/2)
|
|
100
|
-
__m512 const
|
|
101
|
-
__m512
|
|
102
|
-
|
|
103
|
-
__m512 const
|
|
104
|
-
__m512 const
|
|
102
|
+
__m512 const offset_f32x16 = _mm512_fmadd_ps(rounded_quotients_f32x16, pi_high_f32x16, pi_half_f32x16);
|
|
103
|
+
__m512 angles_f32x16 = _mm512_sub_ps(angles_radians, offset_f32x16);
|
|
104
|
+
angles_f32x16 = _mm512_fnmadd_ps(rounded_quotients_f32x16, pi_low_f32x16, angles_f32x16);
|
|
105
|
+
__m512 const angles_squared_f32x16 = _mm512_mul_ps(angles_f32x16, angles_f32x16);
|
|
106
|
+
__m512 const angles_cubed_f32x16 = _mm512_mul_ps(angles_f32x16, angles_squared_f32x16);
|
|
105
107
|
|
|
106
108
|
// Degree-9 polynomial via Horner's method
|
|
107
|
-
__m512
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
__m512
|
|
112
|
-
|
|
113
|
-
// If
|
|
114
|
-
__mmask16 even_mask = _mm512_testn_epi32_mask(
|
|
115
|
-
|
|
116
|
-
return
|
|
109
|
+
__m512 polynomials_f32x16 = coeff_9_f32x16;
|
|
110
|
+
polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, angles_squared_f32x16, coeff_7_f32x16);
|
|
111
|
+
polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, angles_squared_f32x16, coeff_5_f32x16);
|
|
112
|
+
polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, angles_squared_f32x16, coeff_3_f32x16);
|
|
113
|
+
__m512 results_f32x16 = _mm512_fmadd_ps(angles_cubed_f32x16, polynomials_f32x16, angles_f32x16);
|
|
114
|
+
|
|
115
|
+
// If multiples_of_pi_i32x16 is even, flip the sign of the results_f32x16
|
|
116
|
+
__mmask16 even_mask = _mm512_testn_epi32_mask(multiples_of_pi_i32x16, _mm512_set1_epi32(1));
|
|
117
|
+
results_f32x16 = _mm512_mask_sub_ps(results_f32x16, even_mask, _mm512_setzero_ps(), results_f32x16);
|
|
118
|
+
return results_f32x16;
|
|
117
119
|
}
|
|
118
120
|
|
|
119
121
|
NK_INTERNAL __m512 nk_atan_f32x16_skylake_(__m512 const inputs) {
|
|
120
122
|
// Polynomial coefficients
|
|
121
|
-
__m512 const
|
|
122
|
-
__m512 const
|
|
123
|
-
__m512 const
|
|
124
|
-
__m512 const
|
|
125
|
-
__m512 const
|
|
126
|
-
__m512 const
|
|
127
|
-
__m512 const
|
|
128
|
-
__m512 const
|
|
123
|
+
__m512 const coeff_8_f32x16 = _mm512_set1_ps(-0.333331018686294555664062f);
|
|
124
|
+
__m512 const coeff_7_f32x16 = _mm512_set1_ps(+0.199926957488059997558594f);
|
|
125
|
+
__m512 const coeff_6_f32x16 = _mm512_set1_ps(-0.142027363181114196777344f);
|
|
126
|
+
__m512 const coeff_5_f32x16 = _mm512_set1_ps(+0.106347933411598205566406f);
|
|
127
|
+
__m512 const coeff_4_f32x16 = _mm512_set1_ps(-0.0748900920152664184570312f);
|
|
128
|
+
__m512 const coeff_3_f32x16 = _mm512_set1_ps(+0.0425049886107444763183594f);
|
|
129
|
+
__m512 const coeff_2_f32x16 = _mm512_set1_ps(-0.0159569028764963150024414f);
|
|
130
|
+
__m512 const coeff_1_f32x16 = _mm512_set1_ps(+0.00282363896258175373077393f);
|
|
129
131
|
|
|
130
132
|
// Adjust for quadrant
|
|
131
|
-
__m512
|
|
132
|
-
__mmask16 const negative_mask = _mm512_fpclass_ps_mask(
|
|
133
|
-
|
|
134
|
-
__mmask16 const reciprocal_mask = _mm512_cmp_ps_mask(
|
|
135
|
-
|
|
133
|
+
__m512 values_f32x16 = inputs;
|
|
134
|
+
__mmask16 const negative_mask = _mm512_fpclass_ps_mask(values_f32x16, 0x40);
|
|
135
|
+
values_f32x16 = _mm512_abs_ps(values_f32x16);
|
|
136
|
+
__mmask16 const reciprocal_mask = _mm512_cmp_ps_mask(values_f32x16, _mm512_set1_ps(1.0f), _CMP_GT_OS);
|
|
137
|
+
values_f32x16 = _mm512_mask_div_ps(values_f32x16, reciprocal_mask, _mm512_set1_ps(1.0f), values_f32x16);
|
|
136
138
|
|
|
137
139
|
// Argument reduction
|
|
138
|
-
__m512 const
|
|
139
|
-
__m512 const
|
|
140
|
+
__m512 const values_squared_f32x16 = _mm512_mul_ps(values_f32x16, values_f32x16);
|
|
141
|
+
__m512 const values_cubed_f32x16 = _mm512_mul_ps(values_f32x16, values_squared_f32x16);
|
|
140
142
|
|
|
141
143
|
// Polynomial evaluation
|
|
142
|
-
__m512
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
// Adjust
|
|
152
|
-
__m512
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
144
|
+
__m512 polynomials_f32x16 = coeff_1_f32x16;
|
|
145
|
+
polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, values_squared_f32x16, coeff_2_f32x16);
|
|
146
|
+
polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, values_squared_f32x16, coeff_3_f32x16);
|
|
147
|
+
polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, values_squared_f32x16, coeff_4_f32x16);
|
|
148
|
+
polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, values_squared_f32x16, coeff_5_f32x16);
|
|
149
|
+
polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, values_squared_f32x16, coeff_6_f32x16);
|
|
150
|
+
polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, values_squared_f32x16, coeff_7_f32x16);
|
|
151
|
+
polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, values_squared_f32x16, coeff_8_f32x16);
|
|
152
|
+
|
|
153
|
+
// Adjust result_f32x16 for quadrants
|
|
154
|
+
__m512 result_f32x16 = _mm512_fmadd_ps(values_cubed_f32x16, polynomials_f32x16, values_f32x16);
|
|
155
|
+
result_f32x16 = _mm512_mask_sub_ps(result_f32x16, reciprocal_mask, _mm512_set1_ps(1.5707963267948966f),
|
|
156
|
+
result_f32x16);
|
|
157
|
+
result_f32x16 = _mm512_mask_sub_ps(result_f32x16, negative_mask, _mm512_setzero_ps(), result_f32x16);
|
|
158
|
+
return result_f32x16;
|
|
156
159
|
}
|
|
157
160
|
|
|
158
161
|
NK_INTERNAL __m512 nk_atan2_f32x16_skylake_(__m512 const ys_inputs, __m512 const xs_inputs) {
|
|
159
162
|
// Polynomial coefficients
|
|
160
|
-
__m512 const
|
|
161
|
-
__m512 const
|
|
162
|
-
__m512 const
|
|
163
|
-
__m512 const
|
|
164
|
-
__m512 const
|
|
165
|
-
__m512 const
|
|
166
|
-
__m512 const
|
|
167
|
-
__m512 const
|
|
163
|
+
__m512 const coeff_8_f32x16 = _mm512_set1_ps(-0.333331018686294555664062f);
|
|
164
|
+
__m512 const coeff_7_f32x16 = _mm512_set1_ps(+0.199926957488059997558594f);
|
|
165
|
+
__m512 const coeff_6_f32x16 = _mm512_set1_ps(-0.142027363181114196777344f);
|
|
166
|
+
__m512 const coeff_5_f32x16 = _mm512_set1_ps(+0.106347933411598205566406f);
|
|
167
|
+
__m512 const coeff_4_f32x16 = _mm512_set1_ps(-0.0748900920152664184570312f);
|
|
168
|
+
__m512 const coeff_3_f32x16 = _mm512_set1_ps(+0.0425049886107444763183594f);
|
|
169
|
+
__m512 const coeff_2_f32x16 = _mm512_set1_ps(-0.0159569028764963150024414f);
|
|
170
|
+
__m512 const coeff_1_f32x16 = _mm512_set1_ps(+0.00282363896258175373077393f);
|
|
168
171
|
|
|
169
172
|
// Quadrant adjustments normalizing to absolute values of x and y
|
|
170
173
|
__mmask16 const xs_negative_mask = _mm512_fpclass_ps_mask(xs_inputs, 0x40);
|
|
171
|
-
__m512
|
|
172
|
-
__m512
|
|
174
|
+
__m512 xs_f32x16 = _mm512_abs_ps(xs_inputs);
|
|
175
|
+
__m512 ys_f32x16 = _mm512_abs_ps(ys_inputs);
|
|
173
176
|
// Ensure proper fraction where the numerator is smaller than the denominator
|
|
174
|
-
__mmask16 const swap_mask = _mm512_cmp_ps_mask(
|
|
175
|
-
__m512
|
|
176
|
-
|
|
177
|
-
|
|
177
|
+
__mmask16 const swap_mask = _mm512_cmp_ps_mask(ys_f32x16, xs_f32x16, _CMP_GT_OS);
|
|
178
|
+
__m512 temps_f32x16 = xs_f32x16;
|
|
179
|
+
xs_f32x16 = _mm512_mask_blend_ps(swap_mask, xs_f32x16, ys_f32x16);
|
|
180
|
+
ys_f32x16 = _mm512_mask_sub_ps(ys_f32x16, swap_mask, _mm512_setzero_ps(), temps_f32x16);
|
|
178
181
|
|
|
179
|
-
// Compute
|
|
180
|
-
__m512 const
|
|
181
|
-
__m512 const
|
|
182
|
-
__m512 const
|
|
182
|
+
// Compute ratio_f32x16 and ratio²
|
|
183
|
+
__m512 const ratio_f32x16 = _mm512_div_ps(ys_f32x16, xs_f32x16);
|
|
184
|
+
__m512 const ratio_squared_f32x16 = _mm512_mul_ps(ratio_f32x16, ratio_f32x16);
|
|
185
|
+
__m512 const ratio_cubed_f32x16 = _mm512_mul_ps(ratio_f32x16, ratio_squared_f32x16);
|
|
183
186
|
|
|
184
187
|
// Polynomial evaluation
|
|
185
|
-
__m512
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
// Compute
|
|
188
|
+
__m512 polynomials_f32x16 = coeff_1_f32x16;
|
|
189
|
+
polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, ratio_squared_f32x16, coeff_2_f32x16);
|
|
190
|
+
polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, ratio_squared_f32x16, coeff_3_f32x16);
|
|
191
|
+
polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, ratio_squared_f32x16, coeff_4_f32x16);
|
|
192
|
+
polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, ratio_squared_f32x16, coeff_5_f32x16);
|
|
193
|
+
polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, ratio_squared_f32x16, coeff_6_f32x16);
|
|
194
|
+
polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, ratio_squared_f32x16, coeff_7_f32x16);
|
|
195
|
+
polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, ratio_squared_f32x16, coeff_8_f32x16);
|
|
196
|
+
|
|
197
|
+
// Compute quadrant_f32x16 value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
|
|
195
198
|
// -2 for x<0 && !swap, -1 for x<0 && swap
|
|
196
|
-
__m512
|
|
197
|
-
__m512
|
|
198
|
-
__m512
|
|
199
|
-
|
|
200
|
-
__m512
|
|
201
|
-
__m512
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
// Adjust for
|
|
205
|
-
__m512
|
|
206
|
-
|
|
199
|
+
__m512 results_f32x16 = _mm512_fmadd_ps(ratio_cubed_f32x16, polynomials_f32x16, ratio_f32x16);
|
|
200
|
+
__m512 quadrant_f32x16 = _mm512_setzero_ps();
|
|
201
|
+
__m512 neg_two_f32x16 = _mm512_set1_ps(-2.0f);
|
|
202
|
+
quadrant_f32x16 = _mm512_mask_blend_ps(xs_negative_mask, quadrant_f32x16, neg_two_f32x16);
|
|
203
|
+
__m512 one_f32x16 = _mm512_set1_ps(1.0f);
|
|
204
|
+
__m512 quadrant_incremented_f32x16 = _mm512_add_ps(quadrant_f32x16, one_f32x16);
|
|
205
|
+
quadrant_f32x16 = _mm512_mask_blend_ps(swap_mask, quadrant_f32x16, quadrant_incremented_f32x16);
|
|
206
|
+
|
|
207
|
+
// Adjust for quadrant_f32x16: result += quadrant_f32x16 * π/2
|
|
208
|
+
__m512 pi_half_f32x16 = _mm512_set1_ps(1.5707963267948966f);
|
|
209
|
+
results_f32x16 = _mm512_fmadd_ps(quadrant_f32x16, pi_half_f32x16, results_f32x16);
|
|
207
210
|
|
|
208
211
|
// Transfer sign from x (XOR with sign bit of x_input)
|
|
209
|
-
__m512
|
|
210
|
-
|
|
212
|
+
__m512 xs_sign_bits_f32x16 = _mm512_and_ps(xs_inputs, _mm512_set1_ps(-0.0f));
|
|
213
|
+
results_f32x16 = _mm512_xor_ps(results_f32x16, xs_sign_bits_f32x16);
|
|
211
214
|
|
|
212
215
|
// Transfer sign from y (XOR with sign bit of y_input)
|
|
213
|
-
__m512
|
|
214
|
-
|
|
216
|
+
__m512 ys_sign_bits_f32x16 = _mm512_and_ps(ys_inputs, _mm512_set1_ps(-0.0f));
|
|
217
|
+
results_f32x16 = _mm512_xor_ps(results_f32x16, ys_sign_bits_f32x16);
|
|
215
218
|
|
|
216
|
-
return
|
|
219
|
+
return results_f32x16;
|
|
217
220
|
}
|
|
218
221
|
|
|
219
222
|
/**
 *  @brief Element-wise sine over a contiguous `f32` array, 16 lanes per step.
 *
 *  @param[in] ins Input values, passed unchanged to `nk_sin_f32x16_skylake_`.
 *  @param[in] n Number of elements read from `ins` and written to `outs`.
 *  @param[out] outs Receives one result per input element.
 */
NK_PUBLIC void nk_each_sin_f32_skylake(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
    nk_size_t offset = 0;

    // Whole 16-element vectors.
    while (offset + 16 <= n) {
        __m512 const inputs_f32x16 = _mm512_loadu_ps(ins + offset);
        _mm512_storeu_ps(outs + offset, nk_sin_f32x16_skylake_(inputs_f32x16));
        offset += 16;
    }

    // Nothing left over.
    if (offset >= n) return;

    // Fewer than 16 elements remain: keep only the low `n - offset` mask bits, so the
    // masked load zero-fills the inactive lanes and the masked store skips them.
    __mmask16 const tail_mask = (__mmask16)_bzhi_u32(0xFFFF, n - offset);
    __m512 const tail_inputs_f32x16 = _mm512_maskz_loadu_ps(tail_mask, ins + offset);
    __m512 const tail_outputs_f32x16 = nk_sin_f32x16_skylake_(tail_inputs_f32x16);
    _mm512_mask_storeu_ps(outs + offset, tail_mask, tail_outputs_f32x16);
}
|
|
233
236
|
/**
 *  @brief Element-wise cosine over a contiguous `f32` array, 16 lanes per step.
 *
 *  @param[in] ins Input values, passed unchanged to `nk_cos_f32x16_skylake_`.
 *  @param[in] n Number of elements read from `ins` and written to `outs`.
 *  @param[out] outs Receives one result per input element.
 */
NK_PUBLIC void nk_each_cos_f32_skylake(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
    nk_size_t offset = 0;

    // Whole 16-element vectors.
    while (offset + 16 <= n) {
        __m512 const inputs_f32x16 = _mm512_loadu_ps(ins + offset);
        _mm512_storeu_ps(outs + offset, nk_cos_f32x16_skylake_(inputs_f32x16));
        offset += 16;
    }

    // Nothing left over.
    if (offset >= n) return;

    // Fewer than 16 elements remain: keep only the low `n - offset` mask bits, so the
    // masked load zero-fills the inactive lanes and the masked store skips them.
    __mmask16 const tail_mask = (__mmask16)_bzhi_u32(0xFFFF, n - offset);
    __m512 const tail_inputs_f32x16 = _mm512_maskz_loadu_ps(tail_mask, ins + offset);
    __m512 const tail_outputs_f32x16 = nk_cos_f32x16_skylake_(tail_inputs_f32x16);
    _mm512_mask_storeu_ps(outs + offset, tail_mask, tail_outputs_f32x16);
}
|
|
247
250
|
/**
 *  @brief Element-wise arctangent over a contiguous `f32` array, 16 lanes per step.
 *
 *  @param[in] ins Input values, passed unchanged to `nk_atan_f32x16_skylake_`.
 *  @param[in] n Number of elements read from `ins` and written to `outs`.
 *  @param[out] outs Receives one result per input element.
 */
NK_PUBLIC void nk_each_atan_f32_skylake(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
    nk_size_t offset = 0;

    // Whole 16-element vectors.
    while (offset + 16 <= n) {
        __m512 const inputs_f32x16 = _mm512_loadu_ps(ins + offset);
        _mm512_storeu_ps(outs + offset, nk_atan_f32x16_skylake_(inputs_f32x16));
        offset += 16;
    }

    // Nothing left over.
    if (offset >= n) return;

    // Fewer than 16 elements remain: keep only the low `n - offset` mask bits, so the
    // masked load zero-fills the inactive lanes and the masked store skips them.
    __mmask16 const tail_mask = (__mmask16)_bzhi_u32(0xFFFF, n - offset);
    __m512 const tail_inputs_f32x16 = _mm512_maskz_loadu_ps(tail_mask, ins + offset);
    __m512 const tail_outputs_f32x16 = nk_atan_f32x16_skylake_(tail_inputs_f32x16);
    _mm512_mask_storeu_ps(outs + offset, tail_mask, tail_outputs_f32x16);
}
|
|
261
264
|
|
|
262
265
|
/**
 *  @brief Sine of 8 packed `f64` angles: reduce each angle by the nearest multiple of π,
 *         then evaluate an odd polynomial in the reduced angle.
 *
 *  With r the reduced angle, the value returned is r + r³ · (c8 + r² · P(r²)),
 *  where P has the eight coefficients c0..c7 below.
 *
 *  @param[in] angles_radians Eight input angles.
 *  @return Eight sine approximations; lanes whose input compares equal to zero yield +0.0.
 */
NK_INTERNAL __m512d nk_sin_f64x8_skylake_(__m512d const angles_radians) {
    // k = round(angle · 1/π); the rounding mode is given explicitly, so MXCSR is not consulted
    __m512d const scaled_f64x8 = _mm512_mul_pd(angles_radians, _mm512_set1_pd(0.31830988618379067154));
    __m512d const multiples_f64x8 = _mm512_roundscale_pd(scaled_f64x8, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);

    // r = angle − k·π, with π split into a high and a low part and removed in two fused steps
    __m512d reduced_f64x8 = _mm512_fnmadd_pd(multiples_f64x8, _mm512_set1_pd(3.141592653589793116), angles_radians);
    reduced_f64x8 = _mm512_fnmadd_pd(multiples_f64x8, _mm512_set1_pd(1.2246467991473532072e-16), reduced_f64x8);

    // For odd k (bit 0 of the integer k set) replace r with 0 − r
    __m256i const multiples_i32x8 = _mm512_cvt_roundpd_epi32(multiples_f64x8,
                                                             _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    __mmask8 const odd_multiple_mask = _mm256_test_epi32_mask(multiples_i32x8, _mm256_set1_epi32(1));
    reduced_f64x8 = _mm512_mask_sub_pd(reduced_f64x8, odd_multiple_mask, _mm512_setzero_pd(), reduced_f64x8);

    // Powers of the reduced angle shared by the polynomial pieces
    __m512d const r2_f64x8 = _mm512_mul_pd(reduced_f64x8, reduced_f64x8);
    __m512d const r3_f64x8 = _mm512_mul_pd(reduced_f64x8, r2_f64x8);
    __m512d const r4_f64x8 = _mm512_mul_pd(r2_f64x8, r2_f64x8);
    __m512d const r8_f64x8 = _mm512_mul_pd(r4_f64x8, r4_f64x8);

    // Polynomial coefficients
    __m512d const c0_f64x8 = _mm512_set1_pd(+0.00833333333333332974823815);
    __m512d const c1_f64x8 = _mm512_set1_pd(-0.000198412698412696162806809);
    __m512d const c2_f64x8 = _mm512_set1_pd(+2.75573192239198747630416e-06);
    __m512d const c3_f64x8 = _mm512_set1_pd(-2.50521083763502045810755e-08);
    __m512d const c4_f64x8 = _mm512_set1_pd(+1.60590430605664501629054e-10);
    __m512d const c5_f64x8 = _mm512_set1_pd(-7.64712219118158833288484e-13);
    __m512d const c6_f64x8 = _mm512_set1_pd(+2.81009972710863200091251e-15);
    __m512d const c7_f64x8 = _mm512_set1_pd(-7.97255955009037868891952e-18);
    __m512d const c8_f64x8 = _mm512_set1_pd(-0.166666666666666657414808);

    // Lower half: (c0 + c1·r²) + r⁴ · (c2 + c3·r²)
    __m512d const lower_f64x8 = _mm512_fmadd_pd(r4_f64x8,                                   //
                                                _mm512_fmadd_pd(r2_f64x8, c3_f64x8, c2_f64x8), //
                                                _mm512_fmadd_pd(r2_f64x8, c1_f64x8, c0_f64x8));
    // Upper half: (c4 + c5·r²) + r⁴ · (c6 + c7·r²)
    __m512d const upper_f64x8 = _mm512_fmadd_pd(r4_f64x8,                                   //
                                                _mm512_fmadd_pd(r2_f64x8, c7_f64x8, c6_f64x8), //
                                                _mm512_fmadd_pd(r2_f64x8, c5_f64x8, c4_f64x8));

    // Join the halves, then fold in c8 and the leading r term
    __m512d sines_f64x8 = _mm512_fmadd_pd(r8_f64x8, upper_f64x8, lower_f64x8);
    sines_f64x8 = _mm512_fmadd_pd(sines_f64x8, r2_f64x8, c8_f64x8);
    sines_f64x8 = _mm512_fmadd_pd(sines_f64x8, r3_f64x8, reduced_f64x8);

    // Lanes whose original input compares equal to zero (either sign) are forced to +0.0
    __mmask8 const nonzero_input_mask = _mm512_cmpneq_pd_mask(angles_radians, _mm512_setzero_pd());
    return _mm512_maskz_mov_pd(nonzero_input_mask, sines_f64x8);
}
|
|
320
324
|
|
|
321
325
|
/**
 *  @brief Cosine of 8 packed `f64` angles: subtract an odd multiple of π/2 from each angle,
 *         then evaluate the same odd polynomial that the sine kernel uses.
 *
 *  With r the reduced angle, the value returned is r + r³ · (c8 + r² · P(r²)),
 *  where P has the eight coefficients c0..c7 below.
 *
 *  @param[in] angles_radians Eight input angles.
 *  @return Eight cosine approximations.
 */
NK_INTERNAL __m512d nk_cos_f64x8_skylake_(__m512d const angles_radians) {
    // q = 2 · round(angle · 1/π − 0.5) + 1, always odd; the rounding mode is given
    // explicitly, so MXCSR is not consulted
    __m512d const shifted_f64x8 = _mm512_fmsub_pd(angles_radians, _mm512_set1_pd(0.31830988618379067154),
                                                  _mm512_set1_pd(0.5));
    __m512d const nearest_f64x8 = _mm512_roundscale_pd(shifted_f64x8, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    __m512d const odd_multiples_f64x8 = _mm512_fmadd_pd(_mm512_set1_pd(2), nearest_f64x8, _mm512_set1_pd(1));

    // r = angle − q·(π/2), with π/2 split into a high and a low part and removed in two fused steps
    __m512d reduced_f64x8 = _mm512_fnmadd_pd(odd_multiples_f64x8, _mm512_set1_pd(3.141592653589793116 * 0.5),
                                             angles_radians);
    reduced_f64x8 = _mm512_fnmadd_pd(odd_multiples_f64x8, _mm512_set1_pd(1.2246467991473532072e-16 * 0.5),
                                     reduced_f64x8);

    // Where bit 1 of the integer q is clear, replace r with 0 − r
    __m256i const odd_multiples_i32x8 = _mm512_cvt_roundpd_epi32(odd_multiples_f64x8,
                                                                 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    __mmask8 const negate_mask = _mm256_testn_epi32_mask(odd_multiples_i32x8, _mm256_set1_epi32(2));
    reduced_f64x8 = _mm512_mask_sub_pd(reduced_f64x8, negate_mask, _mm512_setzero_pd(), reduced_f64x8);

    // Powers of the reduced angle shared by the polynomial pieces
    __m512d const r2_f64x8 = _mm512_mul_pd(reduced_f64x8, reduced_f64x8);
    __m512d const r3_f64x8 = _mm512_mul_pd(reduced_f64x8, r2_f64x8);
    __m512d const r4_f64x8 = _mm512_mul_pd(r2_f64x8, r2_f64x8);
    __m512d const r8_f64x8 = _mm512_mul_pd(r4_f64x8, r4_f64x8);

    // Polynomial coefficients
    __m512d const c0_f64x8 = _mm512_set1_pd(+0.00833333333333332974823815);
    __m512d const c1_f64x8 = _mm512_set1_pd(-0.000198412698412696162806809);
    __m512d const c2_f64x8 = _mm512_set1_pd(+2.75573192239198747630416e-06);
    __m512d const c3_f64x8 = _mm512_set1_pd(-2.50521083763502045810755e-08);
    __m512d const c4_f64x8 = _mm512_set1_pd(+1.60590430605664501629054e-10);
    __m512d const c5_f64x8 = _mm512_set1_pd(-7.64712219118158833288484e-13);
    __m512d const c6_f64x8 = _mm512_set1_pd(+2.81009972710863200091251e-15);
    __m512d const c7_f64x8 = _mm512_set1_pd(-7.97255955009037868891952e-18);
    __m512d const c8_f64x8 = _mm512_set1_pd(-0.166666666666666657414808);

    // Lower half: (c0 + c1·r²) + r⁴ · (c2 + c3·r²)
    __m512d const lower_f64x8 = _mm512_fmadd_pd(r4_f64x8,                                   //
                                                _mm512_fmadd_pd(r2_f64x8, c3_f64x8, c2_f64x8), //
                                                _mm512_fmadd_pd(r2_f64x8, c1_f64x8, c0_f64x8));
    // Upper half: (c4 + c5·r²) + r⁴ · (c6 + c7·r²)
    __m512d const upper_f64x8 = _mm512_fmadd_pd(r4_f64x8,                                   //
                                                _mm512_fmadd_pd(r2_f64x8, c7_f64x8, c6_f64x8), //
                                                _mm512_fmadd_pd(r2_f64x8, c5_f64x8, c4_f64x8));

    // Join the halves, then fold in c8 and the leading r term
    __m512d cosines_f64x8 = _mm512_fmadd_pd(r8_f64x8, upper_f64x8, lower_f64x8);
    cosines_f64x8 = _mm512_fmadd_pd(cosines_f64x8, r2_f64x8, c8_f64x8);
    cosines_f64x8 = _mm512_fmadd_pd(cosines_f64x8, r3_f64x8, reduced_f64x8);
    return cosines_f64x8;
}
|
|
376
380
|
|
|
377
381
|
/**
 *  @brief Arctangent of 8 packed `f64` values.
 *
 *  Works on |x|. Lanes with |x| > 1 are replaced by 1/|x| and later mapped back with
 *  π/2 − atan(1/|x|). The reduced value v is evaluated as v + v³ · P(v²) with a
 *  19-coefficient Horner polynomial, and the sign of the input is copied onto the result.
 *
 *  The sign is restored from the input's sign bit rather than from an `x < 0` compare,
 *  so a −0.0 input yields −0.0, matching how `nk_atan2_f64x8_skylake_` transfers the
 *  sign of `y`.
 *
 *  @param[in] inputs Eight input values.
 *  @return Eight arctangent approximations.
 */
NK_INTERNAL __m512d nk_atan_f64x8_skylake_(__m512d const inputs) {
    // Polynomial coefficients for atan approximation
    __m512d const coeff_19_f64x8 = _mm512_set1_pd(-1.88796008463073496563746e-05);
    __m512d const coeff_18_f64x8 = _mm512_set1_pd(+0.000209850076645816976906797);
    __m512d const coeff_17_f64x8 = _mm512_set1_pd(-0.00110611831486672482563471);
    __m512d const coeff_16_f64x8 = _mm512_set1_pd(+0.00370026744188713119232403);
    __m512d const coeff_15_f64x8 = _mm512_set1_pd(-0.00889896195887655491740809);
    __m512d const coeff_14_f64x8 = _mm512_set1_pd(+0.016599329773529201970117);
    __m512d const coeff_13_f64x8 = _mm512_set1_pd(-0.0254517624932312641616861);
    __m512d const coeff_12_f64x8 = _mm512_set1_pd(+0.0337852580001353069993897);
    __m512d const coeff_11_f64x8 = _mm512_set1_pd(-0.0407629191276836500001934);
    __m512d const coeff_10_f64x8 = _mm512_set1_pd(+0.0466667150077840625632675);
    __m512d const coeff_9_f64x8 = _mm512_set1_pd(-0.0523674852303482457616113);
    __m512d const coeff_8_f64x8 = _mm512_set1_pd(+0.0587666392926673580854313);
    __m512d const coeff_7_f64x8 = _mm512_set1_pd(-0.0666573579361080525984562);
    __m512d const coeff_6_f64x8 = _mm512_set1_pd(+0.0769219538311769618355029);
    __m512d const coeff_5_f64x8 = _mm512_set1_pd(-0.090908995008245008229153);
    __m512d const coeff_4_f64x8 = _mm512_set1_pd(+0.111111105648261418443745);
    __m512d const coeff_3_f64x8 = _mm512_set1_pd(-0.14285714266771329383765);
    __m512d const coeff_2_f64x8 = _mm512_set1_pd(+0.199999999996591265594148);
    __m512d const coeff_1_f64x8 = _mm512_set1_pd(-0.333333333333311110369124);

    // Work on |x|; where |x| > 1 evaluate at 1/|x| instead
    __m512d values_f64x8 = _mm512_abs_pd(inputs);
    __mmask8 const reciprocal_mask = _mm512_cmp_pd_mask(values_f64x8, _mm512_set1_pd(1.0), _CMP_GT_OS);
    values_f64x8 = _mm512_mask_div_pd(values_f64x8, reciprocal_mask, _mm512_set1_pd(1.0), values_f64x8);
    __m512d const values_squared_f64x8 = _mm512_mul_pd(values_f64x8, values_f64x8);
    __m512d const values_cubed_f64x8 = _mm512_mul_pd(values_f64x8, values_squared_f64x8);

    // Horner evaluation of P(v²), highest-order coefficient first
    __m512d polynomials_f64x8 = coeff_19_f64x8;
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_18_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_17_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_16_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_15_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_14_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_13_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_12_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_11_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_10_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_9_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_8_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_7_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_6_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_5_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_4_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_3_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_2_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_1_f64x8);

    // atan(v) ≈ v + v³ · P(v²)
    __m512d result_f64x8 = _mm512_fmadd_pd(values_cubed_f64x8, polynomials_f64x8, values_f64x8);
    // Undo the reciprocal: atan(|x|) = π/2 − atan(1/|x|) where |x| > 1
    result_f64x8 = _mm512_mask_sub_pd(result_f64x8, reciprocal_mask, _mm512_set1_pd(1.5707963267948966), result_f64x8);

    // Copy the input's sign bit onto the result. An `x < 0` compare would miss −0.0
    // and return +0.0 for it.
    __m512d const sign_bits_f64x8 = _mm512_and_pd(inputs, _mm512_set1_pd(-0.0));
    result_f64x8 = _mm512_xor_pd(result_f64x8, sign_bits_f64x8);
    return result_f64x8;
}
|
|
434
438
|
|
|
435
439
|
/**
|
|
@@ -438,126 +442,126 @@ NK_INTERNAL __m512d nk_atan_f64x8_skylake_(__m512d const inputs) {
|
|
|
438
442
|
*/
|
|
439
443
|
NK_INTERNAL __m512d nk_atan2_f64x8_skylake_(__m512d const ys_inputs, __m512d const xs_inputs) {
    // Polynomial coefficients for atan approximation (higher precision than f32)
    __m512d const coeff_19_f64x8 = _mm512_set1_pd(-1.88796008463073496563746e-05);
    __m512d const coeff_18_f64x8 = _mm512_set1_pd(+0.000209850076645816976906797);
    __m512d const coeff_17_f64x8 = _mm512_set1_pd(-0.00110611831486672482563471);
    __m512d const coeff_16_f64x8 = _mm512_set1_pd(+0.00370026744188713119232403);
    __m512d const coeff_15_f64x8 = _mm512_set1_pd(-0.00889896195887655491740809);
    __m512d const coeff_14_f64x8 = _mm512_set1_pd(+0.016599329773529201970117);
    __m512d const coeff_13_f64x8 = _mm512_set1_pd(-0.0254517624932312641616861);
    __m512d const coeff_12_f64x8 = _mm512_set1_pd(+0.0337852580001353069993897);
    __m512d const coeff_11_f64x8 = _mm512_set1_pd(-0.0407629191276836500001934);
    __m512d const coeff_10_f64x8 = _mm512_set1_pd(+0.0466667150077840625632675);
    __m512d const coeff_9_f64x8 = _mm512_set1_pd(-0.0523674852303482457616113);
    __m512d const coeff_8_f64x8 = _mm512_set1_pd(+0.0587666392926673580854313);
    __m512d const coeff_7_f64x8 = _mm512_set1_pd(-0.0666573579361080525984562);
    __m512d const coeff_6_f64x8 = _mm512_set1_pd(+0.0769219538311769618355029);
    __m512d const coeff_5_f64x8 = _mm512_set1_pd(-0.090908995008245008229153);
    __m512d const coeff_4_f64x8 = _mm512_set1_pd(+0.111111105648261418443745);
    __m512d const coeff_3_f64x8 = _mm512_set1_pd(-0.14285714266771329383765);
    __m512d const coeff_2_f64x8 = _mm512_set1_pd(+0.199999999996591265594148);
    __m512d const coeff_1_f64x8 = _mm512_set1_pd(-0.333333333333311110369124);

    // Quadrant adjustments normalizing to absolute values of x and y.
    // Note: the ordered compare is false for x = -0.0, so such lanes count as "x >= 0".
    __mmask8 const xs_negative_mask = _mm512_cmp_pd_mask(xs_inputs, _mm512_setzero_pd(), _CMP_LT_OS);
    __m512d xs_f64x8 = _mm512_abs_pd(xs_inputs);
    __m512d ys_f64x8 = _mm512_abs_pd(ys_inputs);

    // Ensure proper fraction where the numerator is smaller than the denominator:
    // where |y| > |x| the pair becomes (numerator, denominator) = (-|x|, |y|)
    __mmask8 const swap_mask = _mm512_cmp_pd_mask(ys_f64x8, xs_f64x8, _CMP_GT_OS);
    __m512d temps_f64x8 = xs_f64x8;
    xs_f64x8 = _mm512_mask_blend_pd(swap_mask, xs_f64x8, ys_f64x8);
    ys_f64x8 = _mm512_mask_sub_pd(ys_f64x8, swap_mask, _mm512_setzero_pd(), temps_f64x8);

    // Compute ratio_f64x8, ratio² and ratio³.
    // NOTE(review): when both magnitudes are zero, or both are infinite, this division is
    // 0/0 or ∞/∞ and the lane becomes NaN; no special-casing is done here.
    __m512d const ratio_f64x8 = _mm512_div_pd(ys_f64x8, xs_f64x8);
    __m512d const ratio_squared_f64x8 = _mm512_mul_pd(ratio_f64x8, ratio_f64x8);
    __m512d const ratio_cubed_f64x8 = _mm512_mul_pd(ratio_f64x8, ratio_squared_f64x8);

    // Polynomial evaluation: Horner scheme in ratio², highest-order coefficient first
    __m512d polynomials_f64x8 = coeff_19_f64x8;
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_18_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_17_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_16_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_15_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_14_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_13_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_12_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_11_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_10_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_9_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_8_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_7_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_6_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_5_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_4_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_3_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_2_f64x8);
    polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_1_f64x8);

    // atan(ratio) ≈ ratio + ratio³ · P(ratio²)
    __m512d results_f64x8 = _mm512_fmadd_pd(ratio_cubed_f64x8, polynomials_f64x8, ratio_f64x8);

    // Compute quadrant_f64x8 value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
    // -2 for x<0 && !swap, -1 for x<0 && swap
    __m512d quadrant_f64x8 = _mm512_setzero_pd();
    quadrant_f64x8 = _mm512_mask_blend_pd(xs_negative_mask, quadrant_f64x8, _mm512_set1_pd(-2.0));
    __m512d quadrant_incremented_f64x8 = _mm512_add_pd(quadrant_f64x8, _mm512_set1_pd(1.0));
    quadrant_f64x8 = _mm512_mask_blend_pd(swap_mask, quadrant_f64x8, quadrant_incremented_f64x8);

    // Adjust for quadrant_f64x8: result += quadrant_f64x8 * π/2
    results_f64x8 = _mm512_fmadd_pd(quadrant_f64x8, _mm512_set1_pd(1.5707963267948966), results_f64x8);

    // Undo the negative-x offset: negate exactly the lanes that were given the -2 / -1
    // quadrant above. This has to use the same `x < 0` mask as the quadrant selection.
    // XOR-ing with the raw sign bit of x would also flip lanes with x = -0.0, which were
    // treated as x >= 0, and turn atan2(1, -0.0) into -π/2 instead of +π/2.
    results_f64x8 = _mm512_mask_sub_pd(results_f64x8, xs_negative_mask, _mm512_setzero_pd(), results_f64x8);

    // Transfer sign from y (XOR with sign bit of y_input)
    __m512d ys_sign_f64x8 = _mm512_and_pd(ys_inputs, _mm512_set1_pd(-0.0));
    results_f64x8 = _mm512_xor_pd(results_f64x8, ys_sign_f64x8);

    return results_f64x8;
}
|
|
520
524
|
|
|
521
525
|
/**
 *  @brief Element-wise sine over a contiguous `f64` array, 8 lanes per step.
 *
 *  @param[in] ins Input angles in radians, passed unchanged to `nk_sin_f64x8_skylake_`.
 *  @param[in] n Number of elements read from `ins` and written to `outs`.
 *  @param[out] outs Receives one result per input element.
 */
NK_PUBLIC void nk_each_sin_f64_skylake(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
    nk_size_t offset = 0;

    // Whole 8-element vectors.
    while (offset + 8 <= n) {
        __m512d const inputs_f64x8 = _mm512_loadu_pd(ins + offset);
        _mm512_storeu_pd(outs + offset, nk_sin_f64x8_skylake_(inputs_f64x8));
        offset += 8;
    }

    // Nothing left over.
    if (offset >= n) return;

    // Fewer than 8 elements remain, so only the low `n - offset` bits of the 0xFFFF
    // source survive `_bzhi_u32` and the result fits the 8-bit mask. The masked load
    // zero-fills the inactive lanes and the masked store skips them.
    __mmask8 const tail_mask = (__mmask8)_bzhi_u32(0xFFFF, n - offset);
    __m512d const tail_inputs_f64x8 = _mm512_maskz_loadu_pd(tail_mask, ins + offset);
    __m512d const tail_outputs_f64x8 = nk_sin_f64x8_skylake_(tail_inputs_f64x8);
    _mm512_mask_storeu_pd(outs + offset, tail_mask, tail_outputs_f64x8);
}
|
|
535
539
|
/**
 *  @brief Element-wise cosine over a contiguous `f64` array, 8 lanes per step.
 *
 *  @param[in] ins Input angles in radians, passed unchanged to `nk_cos_f64x8_skylake_`.
 *  @param[in] n Number of elements read from `ins` and written to `outs`.
 *  @param[out] outs Receives one result per input element.
 */
NK_PUBLIC void nk_each_cos_f64_skylake(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
    nk_size_t offset = 0;

    // Whole 8-element vectors.
    while (offset + 8 <= n) {
        __m512d const inputs_f64x8 = _mm512_loadu_pd(ins + offset);
        _mm512_storeu_pd(outs + offset, nk_cos_f64x8_skylake_(inputs_f64x8));
        offset += 8;
    }

    // Nothing left over.
    if (offset >= n) return;

    // Fewer than 8 elements remain, so only the low `n - offset` bits of the 0xFFFF
    // source survive `_bzhi_u32` and the result fits the 8-bit mask. The masked load
    // zero-fills the inactive lanes and the masked store skips them.
    __mmask8 const tail_mask = (__mmask8)_bzhi_u32(0xFFFF, n - offset);
    __m512d const tail_inputs_f64x8 = _mm512_maskz_loadu_pd(tail_mask, ins + offset);
    __m512d const tail_outputs_f64x8 = nk_cos_f64x8_skylake_(tail_inputs_f64x8);
    _mm512_mask_storeu_pd(outs + offset, tail_mask, tail_outputs_f64x8);
}
|
|
549
553
|
/**
 *  @brief Element-wise arctangent over a contiguous `f64` array, 8 lanes per step.
 *
 *  @param[in] ins Input values, passed unchanged to `nk_atan_f64x8_skylake_`.
 *  @param[in] n Number of elements read from `ins` and written to `outs`.
 *  @param[out] outs Receives one result per input element.
 */
NK_PUBLIC void nk_each_atan_f64_skylake(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
    nk_size_t offset = 0;

    // Whole 8-element vectors.
    while (offset + 8 <= n) {
        __m512d const inputs_f64x8 = _mm512_loadu_pd(ins + offset);
        _mm512_storeu_pd(outs + offset, nk_atan_f64x8_skylake_(inputs_f64x8));
        offset += 8;
    }

    // Nothing left over.
    if (offset >= n) return;

    // Fewer than 8 elements remain, so only the low `n - offset` bits of the 0xFFFF
    // source survive `_bzhi_u32` and the result fits the 8-bit mask. The masked load
    // zero-fills the inactive lanes and the masked store skips them.
    __mmask8 const tail_mask = (__mmask8)_bzhi_u32(0xFFFF, n - offset);
    __m512d const tail_inputs_f64x8 = _mm512_maskz_loadu_pd(tail_mask, ins + offset);
    __m512d const tail_outputs_f64x8 = nk_atan_f64x8_skylake_(tail_inputs_f64x8);
    _mm512_mask_storeu_pd(outs + offset, tail_mask, tail_outputs_f64x8);
}
|
|
563
567
|
|
|
@@ -570,8 +574,8 @@ NK_PUBLIC void nk_each_atan_f64_skylake(nk_f64_t const *ins, nk_size_t n, nk_f64
|
|
|
570
574
|
NK_INTERNAL __m256i nk_sin_f16x16_skylake_(__m256i angles_f16x16) {
|
|
571
575
|
__m512 angles_f32x16 = _mm512_cvtph_ps(angles_f16x16);
|
|
572
576
|
// Cody-Waite range reduction constants
|
|
573
|
-
__m512
|
|
574
|
-
__m512
|
|
577
|
+
__m512 pi_high_f32x16 = _mm512_set1_ps(3.1415927f);
|
|
578
|
+
__m512 pi_low_f32x16 = _mm512_set1_ps(-8.742278e-8f);
|
|
575
579
|
__m512 pi_recip_f32x16 = _mm512_set1_ps(0.31830988618f);
|
|
576
580
|
__m512 c3_f32x16 = _mm512_set1_ps(-1.6666666641e-1f);
|
|
577
581
|
__m512 c5_f32x16 = _mm512_set1_ps(8.3333293855e-3f);
|
|
@@ -581,8 +585,8 @@ NK_INTERNAL __m256i nk_sin_f16x16_skylake_(__m256i angles_f16x16) {
|
|
|
581
585
|
// Use explicit rounding to match roundscale (MXCSR-independent)
|
|
582
586
|
__m512i multiple_i32x16 = _mm512_cvt_roundps_epi32(rounded_f32x16, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
|
|
583
587
|
|
|
584
|
-
angles_f32x16 = _mm512_fnmadd_ps(rounded_f32x16,
|
|
585
|
-
angles_f32x16 = _mm512_fnmadd_ps(rounded_f32x16,
|
|
588
|
+
angles_f32x16 = _mm512_fnmadd_ps(rounded_f32x16, pi_high_f32x16, angles_f32x16);
|
|
589
|
+
angles_f32x16 = _mm512_fnmadd_ps(rounded_f32x16, pi_low_f32x16, angles_f32x16);
|
|
586
590
|
|
|
587
591
|
__m512 x2_f32x16 = _mm512_mul_ps(angles_f32x16, angles_f32x16);
|
|
588
592
|
__m512 poly_f32x16 = _mm512_fmadd_ps(c5_f32x16, x2_f32x16, c3_f32x16);
|
|
@@ -601,8 +605,8 @@ NK_INTERNAL __m256i nk_sin_f16x16_skylake_(__m256i angles_f16x16) {
|
|
|
601
605
|
*/
|
|
602
606
|
NK_INTERNAL __m256i nk_cos_f16x16_skylake_(__m256i angles_f16x16) {
|
|
603
607
|
__m512 angles_f32x16 = _mm512_cvtph_ps(angles_f16x16);
|
|
604
|
-
__m512
|
|
605
|
-
__m512
|
|
608
|
+
__m512 pi_high_f32x16 = _mm512_set1_ps(3.1415927f);
|
|
609
|
+
__m512 pi_low_f32x16 = _mm512_set1_ps(-8.742278e-8f);
|
|
606
610
|
__m512 pi_half_f32x16 = _mm512_set1_ps(1.5707963268f);
|
|
607
611
|
__m512 pi_recip_f32x16 = _mm512_set1_ps(0.31830988618f);
|
|
608
612
|
__m512 half_f32x16 = _mm512_set1_ps(0.5f);
|
|
@@ -614,9 +618,9 @@ NK_INTERNAL __m256i nk_cos_f16x16_skylake_(__m256i angles_f16x16) {
|
|
|
614
618
|
// Use explicit rounding to match roundscale (MXCSR-independent)
|
|
615
619
|
__m512i multiple_i32x16 = _mm512_cvt_roundps_epi32(rounded_f32x16, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
|
|
616
620
|
|
|
617
|
-
__m512 shift_f32x16 = _mm512_fmadd_ps(rounded_f32x16,
|
|
621
|
+
__m512 shift_f32x16 = _mm512_fmadd_ps(rounded_f32x16, pi_high_f32x16, pi_half_f32x16);
|
|
618
622
|
angles_f32x16 = _mm512_sub_ps(angles_f32x16, shift_f32x16);
|
|
619
|
-
angles_f32x16 = _mm512_fnmadd_ps(rounded_f32x16,
|
|
623
|
+
angles_f32x16 = _mm512_fnmadd_ps(rounded_f32x16, pi_low_f32x16, angles_f32x16);
|
|
620
624
|
|
|
621
625
|
__m512 x2_f32x16 = _mm512_mul_ps(angles_f32x16, angles_f32x16);
|
|
622
626
|
__m512 poly_f32x16 = _mm512_fmadd_ps(c5_f32x16, x2_f32x16, c3_f32x16);
|