numkong 7.0.0 → 7.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +197 -124
- package/binding.gyp +34 -484
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
|
@@ -9,12 +9,12 @@
|
|
|
9
9
|
*
|
|
10
10
|
* @section haswell_trig_instructions Key AVX2 Trigonometry Instructions
|
|
11
11
|
*
|
|
12
|
-
* Intrinsic
|
|
13
|
-
* _mm256_fmadd_ps/pd
|
|
14
|
-
* _mm256_mul_ps/pd
|
|
15
|
-
* _mm256_blendv_ps/pd
|
|
16
|
-
* _mm256_round_ps/pd
|
|
17
|
-
* _mm256_div_ps
|
|
12
|
+
* Intrinsic Instruction Haswell Genoa
|
|
13
|
+
* _mm256_fmadd_ps/pd VFMADD (YMM, YMM, YMM) 5cy @ p01 4cy @ p01
|
|
14
|
+
* _mm256_mul_ps/pd VMULPS/PD (YMM, YMM, YMM) 5cy @ p01 3cy @ p01
|
|
15
|
+
* _mm256_blendv_ps/pd VBLENDVPS/PD (YMM, YMM, YMM) 2cy @ p015 1cy @ p01
|
|
16
|
+
* _mm256_round_ps/pd VROUNDPS/PD (YMM, YMM, I8) 6cy @ p01 3cy @ p23
|
|
17
|
+
* _mm256_div_ps VDIVPS (YMM, YMM, YMM) 13cy @ p0 11cy @ p01
|
|
18
18
|
*
|
|
19
19
|
* Polynomial evaluation uses Horner's method with FMA for sin/cos/atan approximation. For large
|
|
20
20
|
* arrays, out-of-order execution across loop iterations hides FMA latency better than Estrin's
|
|
@@ -46,501 +46,502 @@ extern "C" {
|
|
|
46
46
|
|
|
47
47
|
NK_INTERNAL __m256 nk_sin_f32x8_haswell_(__m256 const angles_radians) {
|
|
48
48
|
// Cody-Waite constants for argument reduction
|
|
49
|
-
__m256 const
|
|
50
|
-
__m256 const
|
|
51
|
-
__m256 const
|
|
49
|
+
__m256 const pi_high_f32x8 = _mm256_set1_ps(3.1415927f);
|
|
50
|
+
__m256 const pi_low_f32x8 = _mm256_set1_ps(-8.742278e-8f);
|
|
51
|
+
__m256 const pi_reciprocal_f32x8 = _mm256_set1_ps(0.31830988618379067154f); // 1/π
|
|
52
52
|
// Degree-9 minimax coefficients
|
|
53
|
-
__m256 const
|
|
54
|
-
__m256 const
|
|
55
|
-
__m256 const
|
|
56
|
-
__m256 const
|
|
53
|
+
__m256 const coeff_9_f32x8 = _mm256_set1_ps(+2.7557319224e-6f);
|
|
54
|
+
__m256 const coeff_7_f32x8 = _mm256_set1_ps(-1.9841269841e-4f);
|
|
55
|
+
__m256 const coeff_5_f32x8 = _mm256_set1_ps(+8.3333293855e-3f);
|
|
56
|
+
__m256 const coeff_3_f32x8 = _mm256_set1_ps(-1.6666666641e-1f);
|
|
57
57
|
|
|
58
|
-
// Compute (
|
|
59
|
-
__m256
|
|
60
|
-
__m256
|
|
61
|
-
// Use truncation (MXCSR-independent) since
|
|
62
|
-
__m256i
|
|
58
|
+
// Compute (multiples_of_pi_i32x8) = round(angle / π)
|
|
59
|
+
__m256 quotients_f32x8 = _mm256_mul_ps(angles_radians, pi_reciprocal_f32x8);
|
|
60
|
+
__m256 rounded_quotients_f32x8 = _mm256_round_ps(quotients_f32x8, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
|
|
61
|
+
// Use truncation (MXCSR-independent) since rounded_quotients_f32x8 is already integer-valued
|
|
62
|
+
__m256i multiples_of_pi_i32x8 = _mm256_cvttps_epi32(rounded_quotients_f32x8);
|
|
63
63
|
|
|
64
64
|
// Cody-Waite range reduction
|
|
65
|
-
__m256
|
|
66
|
-
|
|
67
|
-
__m256 const
|
|
68
|
-
__m256 const
|
|
65
|
+
__m256 angles_f32x8 = _mm256_fnmadd_ps(rounded_quotients_f32x8, pi_high_f32x8, angles_radians);
|
|
66
|
+
angles_f32x8 = _mm256_fnmadd_ps(rounded_quotients_f32x8, pi_low_f32x8, angles_f32x8);
|
|
67
|
+
__m256 const angles_squared_f32x8 = _mm256_mul_ps(angles_f32x8, angles_f32x8);
|
|
68
|
+
__m256 const angles_cubed_f32x8 = _mm256_mul_ps(angles_f32x8, angles_squared_f32x8);
|
|
69
69
|
|
|
70
70
|
// Degree-9 polynomial via Horner's method
|
|
71
|
-
__m256
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
__m256
|
|
76
|
-
|
|
77
|
-
// If
|
|
78
|
-
__m256i
|
|
79
|
-
__m256i
|
|
80
|
-
__m256
|
|
81
|
-
__m256
|
|
82
|
-
|
|
83
|
-
return
|
|
71
|
+
__m256 polynomials_f32x8 = coeff_9_f32x8;
|
|
72
|
+
polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, angles_squared_f32x8, coeff_7_f32x8);
|
|
73
|
+
polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, angles_squared_f32x8, coeff_5_f32x8);
|
|
74
|
+
polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, angles_squared_f32x8, coeff_3_f32x8);
|
|
75
|
+
__m256 results_f32x8 = _mm256_fmadd_ps(angles_cubed_f32x8, polynomials_f32x8, angles_f32x8);
|
|
76
|
+
|
|
77
|
+
// If multiples_of_pi_i32x8 is odd, flip the sign of the results_f32x8
|
|
78
|
+
__m256i parity_i32x8 = _mm256_and_si256(multiples_of_pi_i32x8, _mm256_set1_epi32(1));
|
|
79
|
+
__m256i odd_mask_i32x8 = _mm256_cmpeq_epi32(parity_i32x8, _mm256_set1_epi32(1));
|
|
80
|
+
__m256 float_mask_f32x8 = _mm256_castsi256_ps(odd_mask_i32x8);
|
|
81
|
+
__m256 negated_f32x8 = _mm256_sub_ps(_mm256_setzero_ps(), results_f32x8);
|
|
82
|
+
results_f32x8 = _mm256_blendv_ps(results_f32x8, negated_f32x8, float_mask_f32x8);
|
|
83
|
+
return results_f32x8;
|
|
84
84
|
}
|
|
85
85
|
|
|
86
86
|
NK_INTERNAL __m256 nk_cos_f32x8_haswell_(__m256 const angles_radians) {
|
|
87
87
|
// Cody-Waite constants for argument reduction
|
|
88
|
-
__m256 const
|
|
89
|
-
__m256 const
|
|
90
|
-
__m256 const
|
|
91
|
-
__m256 const
|
|
88
|
+
__m256 const pi_high_f32x8 = _mm256_set1_ps(3.1415927f);
|
|
89
|
+
__m256 const pi_low_f32x8 = _mm256_set1_ps(-8.742278e-8f);
|
|
90
|
+
__m256 const pi_half_f32x8 = _mm256_set1_ps(1.57079632679489661923f); // π/2
|
|
91
|
+
__m256 const pi_reciprocal_f32x8 = _mm256_set1_ps(0.31830988618379067154f); // 1/π
|
|
92
92
|
// Degree-9 minimax coefficients
|
|
93
|
-
__m256 const
|
|
94
|
-
__m256 const
|
|
95
|
-
__m256 const
|
|
96
|
-
__m256 const
|
|
93
|
+
__m256 const coeff_9_f32x8 = _mm256_set1_ps(+2.7557319224e-6f);
|
|
94
|
+
__m256 const coeff_7_f32x8 = _mm256_set1_ps(-1.9841269841e-4f);
|
|
95
|
+
__m256 const coeff_5_f32x8 = _mm256_set1_ps(+8.3333293855e-3f);
|
|
96
|
+
__m256 const coeff_3_f32x8 = _mm256_set1_ps(-1.6666666641e-1f);
|
|
97
97
|
|
|
98
|
-
// Compute (
|
|
99
|
-
__m256
|
|
100
|
-
__m256
|
|
101
|
-
// Use truncation (MXCSR-independent) since
|
|
102
|
-
__m256i
|
|
98
|
+
// Compute (multiples_of_pi_i32x8) = round((angle / π) - 0.5)
|
|
99
|
+
__m256 quotients_f32x8 = _mm256_fmsub_ps(angles_radians, pi_reciprocal_f32x8, _mm256_set1_ps(0.5f));
|
|
100
|
+
__m256 rounded_quotients_f32x8 = _mm256_round_ps(quotients_f32x8, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
|
|
101
|
+
// Use truncation (MXCSR-independent) since rounded_quotients_f32x8 is already integer-valued
|
|
102
|
+
__m256i multiples_of_pi_i32x8 = _mm256_cvttps_epi32(rounded_quotients_f32x8);
|
|
103
103
|
|
|
104
104
|
// Cody-Waite range reduction: angle = angle_radians - (multiples * pi + pi/2)
|
|
105
|
-
__m256 const
|
|
106
|
-
__m256
|
|
107
|
-
|
|
108
|
-
__m256 const
|
|
109
|
-
__m256 const
|
|
105
|
+
__m256 const offset_f32x8 = _mm256_fmadd_ps(rounded_quotients_f32x8, pi_high_f32x8, pi_half_f32x8);
|
|
106
|
+
__m256 angles_f32x8 = _mm256_sub_ps(angles_radians, offset_f32x8);
|
|
107
|
+
angles_f32x8 = _mm256_fnmadd_ps(rounded_quotients_f32x8, pi_low_f32x8, angles_f32x8);
|
|
108
|
+
__m256 const angles_squared_f32x8 = _mm256_mul_ps(angles_f32x8, angles_f32x8);
|
|
109
|
+
__m256 const angles_cubed_f32x8 = _mm256_mul_ps(angles_f32x8, angles_squared_f32x8);
|
|
110
110
|
|
|
111
111
|
// Degree-9 polynomial via Horner's method
|
|
112
|
-
__m256
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
__m256
|
|
117
|
-
|
|
118
|
-
// If
|
|
119
|
-
__m256i
|
|
120
|
-
__m256i
|
|
121
|
-
__m256
|
|
122
|
-
__m256
|
|
123
|
-
|
|
124
|
-
return
|
|
112
|
+
__m256 polynomials_f32x8 = coeff_9_f32x8;
|
|
113
|
+
polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, angles_squared_f32x8, coeff_7_f32x8);
|
|
114
|
+
polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, angles_squared_f32x8, coeff_5_f32x8);
|
|
115
|
+
polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, angles_squared_f32x8, coeff_3_f32x8);
|
|
116
|
+
__m256 results_f32x8 = _mm256_fmadd_ps(angles_cubed_f32x8, polynomials_f32x8, angles_f32x8);
|
|
117
|
+
|
|
118
|
+
// If multiples_of_pi_i32x8 is even, flip the sign of the results_f32x8
|
|
119
|
+
__m256i parity_i32x8 = _mm256_and_si256(multiples_of_pi_i32x8, _mm256_set1_epi32(1));
|
|
120
|
+
__m256i even_mask_i32x8 = _mm256_cmpeq_epi32(parity_i32x8, _mm256_setzero_si256());
|
|
121
|
+
__m256 float_mask_f32x8 = _mm256_castsi256_ps(even_mask_i32x8);
|
|
122
|
+
__m256 negated_f32x8 = _mm256_sub_ps(_mm256_setzero_ps(), results_f32x8);
|
|
123
|
+
results_f32x8 = _mm256_blendv_ps(results_f32x8, negated_f32x8, float_mask_f32x8);
|
|
124
|
+
return results_f32x8;
|
|
125
125
|
}
|
|
126
126
|
|
|
127
127
|
NK_INTERNAL __m256 nk_atan_f32x8_haswell_(__m256 const inputs) {
|
|
128
128
|
// Polynomial coefficients for atan approximation (8 terms)
|
|
129
129
|
// These coefficients approximate: atan(x) ≈ x + c8 × x³ + c7 × x⁵ + c6 × x⁷ + ... + c1 × x¹⁷
|
|
130
|
-
__m256 const
|
|
131
|
-
__m256 const
|
|
132
|
-
__m256 const
|
|
133
|
-
__m256 const
|
|
134
|
-
__m256 const
|
|
135
|
-
__m256 const
|
|
136
|
-
__m256 const
|
|
137
|
-
__m256 const
|
|
138
|
-
__m256 const
|
|
139
|
-
|
|
140
|
-
// Adjust for quadrant - detect negative
|
|
141
|
-
__m256
|
|
142
|
-
__m256
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
// Check if
|
|
146
|
-
__m256
|
|
147
|
-
__m256
|
|
148
|
-
|
|
130
|
+
__m256 const coeff_8_f32x8 = _mm256_set1_ps(-0.333331018686294555664062f);
|
|
131
|
+
__m256 const coeff_7_f32x8 = _mm256_set1_ps(+0.199926957488059997558594f);
|
|
132
|
+
__m256 const coeff_6_f32x8 = _mm256_set1_ps(-0.142027363181114196777344f);
|
|
133
|
+
__m256 const coeff_5_f32x8 = _mm256_set1_ps(+0.106347933411598205566406f);
|
|
134
|
+
__m256 const coeff_4_f32x8 = _mm256_set1_ps(-0.0748900920152664184570312f);
|
|
135
|
+
__m256 const coeff_3_f32x8 = _mm256_set1_ps(+0.0425049886107444763183594f);
|
|
136
|
+
__m256 const coeff_2_f32x8 = _mm256_set1_ps(-0.0159569028764963150024414f);
|
|
137
|
+
__m256 const coeff_1_f32x8 = _mm256_set1_ps(+0.00282363896258175373077393f);
|
|
138
|
+
__m256 const sign_mask_f32x8 = _mm256_set1_ps(-0.0f);
|
|
139
|
+
|
|
140
|
+
// Adjust for quadrant - detect negative values_f32x8
|
|
141
|
+
__m256 values_f32x8 = inputs;
|
|
142
|
+
__m256 negative_mask_f32x8 = _mm256_cmp_ps(values_f32x8, _mm256_setzero_ps(), _CMP_LT_OS);
|
|
143
|
+
values_f32x8 = _mm256_andnot_ps(sign_mask_f32x8, values_f32x8); // abs(values_f32x8)
|
|
144
|
+
|
|
145
|
+
// Check if values_f32x8 > 1 (need reciprocal)
|
|
146
|
+
__m256 reciprocal_mask_f32x8 = _mm256_cmp_ps(values_f32x8, _mm256_set1_ps(1.0f), _CMP_GT_OS);
|
|
147
|
+
__m256 reciprocal_values_f32x8 = _mm256_div_ps(_mm256_set1_ps(1.0f), values_f32x8);
|
|
148
|
+
values_f32x8 = _mm256_blendv_ps(values_f32x8, reciprocal_values_f32x8, reciprocal_mask_f32x8);
|
|
149
149
|
|
|
150
150
|
// Argument reduction
|
|
151
|
-
__m256 const
|
|
152
|
-
__m256 const
|
|
151
|
+
__m256 const values_squared_f32x8 = _mm256_mul_ps(values_f32x8, values_f32x8);
|
|
152
|
+
__m256 const values_cubed_f32x8 = _mm256_mul_ps(values_f32x8, values_squared_f32x8);
|
|
153
153
|
|
|
154
154
|
// Polynomial evaluation using Horner's method.
|
|
155
155
|
// For large arrays, out-of-order execution across loop iterations already hides
|
|
156
156
|
// FMA latency. Estrin's scheme was tested but showed ~20% regression because
|
|
157
157
|
// the extra power computations (y², y⁴) hurt throughput more than the reduced
|
|
158
158
|
// dependency depth helps latency.
|
|
159
|
-
__m256
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
// Compute
|
|
169
|
-
__m256
|
|
170
|
-
|
|
171
|
-
// Adjust for reciprocal:
|
|
172
|
-
__m256
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
// Adjust for negative:
|
|
176
|
-
__m256
|
|
177
|
-
|
|
178
|
-
return
|
|
159
|
+
__m256 polynomials_f32x8 = coeff_1_f32x8;
|
|
160
|
+
polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, values_squared_f32x8, coeff_2_f32x8);
|
|
161
|
+
polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, values_squared_f32x8, coeff_3_f32x8);
|
|
162
|
+
polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, values_squared_f32x8, coeff_4_f32x8);
|
|
163
|
+
polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, values_squared_f32x8, coeff_5_f32x8);
|
|
164
|
+
polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, values_squared_f32x8, coeff_6_f32x8);
|
|
165
|
+
polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, values_squared_f32x8, coeff_7_f32x8);
|
|
166
|
+
polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, values_squared_f32x8, coeff_8_f32x8);
|
|
167
|
+
|
|
168
|
+
// Compute result_f32x8: atan(x) ≈ x + x³ * P(x²)
|
|
169
|
+
__m256 result_f32x8 = _mm256_fmadd_ps(values_cubed_f32x8, polynomials_f32x8, values_f32x8);
|
|
170
|
+
|
|
171
|
+
// Adjust for reciprocal: result_f32x8 = π/2 - result_f32x8
|
|
172
|
+
__m256 adjusted_f32x8 = _mm256_sub_ps(_mm256_set1_ps(1.5707963267948966f), result_f32x8);
|
|
173
|
+
result_f32x8 = _mm256_blendv_ps(result_f32x8, adjusted_f32x8, reciprocal_mask_f32x8);
|
|
174
|
+
|
|
175
|
+
// Adjust for negative: result_f32x8 = -result_f32x8
|
|
176
|
+
__m256 negated_f32x8 = _mm256_sub_ps(_mm256_setzero_ps(), result_f32x8);
|
|
177
|
+
result_f32x8 = _mm256_blendv_ps(result_f32x8, negated_f32x8, negative_mask_f32x8);
|
|
178
|
+
return result_f32x8;
|
|
179
179
|
}
|
|
180
180
|
|
|
181
181
|
NK_INTERNAL __m256 nk_atan2_f32x8_haswell_(__m256 const ys_inputs, __m256 const xs_inputs) {
|
|
182
182
|
// Polynomial coefficients (same as atan)
|
|
183
|
-
__m256 const
|
|
184
|
-
__m256 const
|
|
185
|
-
__m256 const
|
|
186
|
-
__m256 const
|
|
187
|
-
__m256 const
|
|
188
|
-
__m256 const
|
|
189
|
-
__m256 const
|
|
190
|
-
__m256 const
|
|
191
|
-
__m256 const
|
|
183
|
+
__m256 const coeff_8_f32x8 = _mm256_set1_ps(-0.333331018686294555664062f);
|
|
184
|
+
__m256 const coeff_7_f32x8 = _mm256_set1_ps(+0.199926957488059997558594f);
|
|
185
|
+
__m256 const coeff_6_f32x8 = _mm256_set1_ps(-0.142027363181114196777344f);
|
|
186
|
+
__m256 const coeff_5_f32x8 = _mm256_set1_ps(+0.106347933411598205566406f);
|
|
187
|
+
__m256 const coeff_4_f32x8 = _mm256_set1_ps(-0.0748900920152664184570312f);
|
|
188
|
+
__m256 const coeff_3_f32x8 = _mm256_set1_ps(+0.0425049886107444763183594f);
|
|
189
|
+
__m256 const coeff_2_f32x8 = _mm256_set1_ps(-0.0159569028764963150024414f);
|
|
190
|
+
__m256 const coeff_1_f32x8 = _mm256_set1_ps(+0.00282363896258175373077393f);
|
|
191
|
+
__m256 const sign_mask_f32x8 = _mm256_set1_ps(-0.0f);
|
|
192
192
|
|
|
193
193
|
// Quadrant adjustments normalizing to absolute values of x and y
|
|
194
|
-
__m256
|
|
195
|
-
__m256
|
|
196
|
-
__m256
|
|
194
|
+
__m256 xs_negative_mask_f32x8 = _mm256_cmp_ps(xs_inputs, _mm256_setzero_ps(), _CMP_LT_OS);
|
|
195
|
+
__m256 xs_f32x8 = _mm256_andnot_ps(sign_mask_f32x8, xs_inputs); // abs(xs_inputs)
|
|
196
|
+
__m256 ys_f32x8 = _mm256_andnot_ps(sign_mask_f32x8, ys_inputs); // abs(ys_inputs)
|
|
197
197
|
|
|
198
198
|
// Ensure proper fraction where the numerator is smaller than the denominator
|
|
199
|
-
__m256
|
|
200
|
-
__m256
|
|
201
|
-
|
|
202
|
-
__m256
|
|
203
|
-
|
|
199
|
+
__m256 swap_mask_f32x8 = _mm256_cmp_ps(ys_f32x8, xs_f32x8, _CMP_GT_OS);
|
|
200
|
+
__m256 temps_f32x8 = xs_f32x8;
|
|
201
|
+
xs_f32x8 = _mm256_blendv_ps(xs_f32x8, ys_f32x8, swap_mask_f32x8);
|
|
202
|
+
__m256 neg_temps_f32x8 = _mm256_sub_ps(_mm256_setzero_ps(), temps_f32x8);
|
|
203
|
+
ys_f32x8 = _mm256_blendv_ps(ys_f32x8, neg_temps_f32x8, swap_mask_f32x8);
|
|
204
204
|
|
|
205
|
-
// Compute
|
|
206
|
-
__m256 const
|
|
207
|
-
__m256 const
|
|
208
|
-
__m256 const
|
|
205
|
+
// Compute ratio_f32x8 and powers
|
|
206
|
+
__m256 const ratio_f32x8 = _mm256_div_ps(ys_f32x8, xs_f32x8);
|
|
207
|
+
__m256 const ratio_squared_f32x8 = _mm256_mul_ps(ratio_f32x8, ratio_f32x8);
|
|
208
|
+
__m256 const ratio_cubed_f32x8 = _mm256_mul_ps(ratio_f32x8, ratio_squared_f32x8);
|
|
209
209
|
|
|
210
210
|
// Polynomial evaluation using Horner's method
|
|
211
|
-
__m256
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
// Compute the result using masks for
|
|
221
|
-
__m256
|
|
222
|
-
|
|
223
|
-
// Compute
|
|
211
|
+
__m256 polynomials_f32x8 = coeff_1_f32x8;
|
|
212
|
+
polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, ratio_squared_f32x8, coeff_2_f32x8);
|
|
213
|
+
polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, ratio_squared_f32x8, coeff_3_f32x8);
|
|
214
|
+
polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, ratio_squared_f32x8, coeff_4_f32x8);
|
|
215
|
+
polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, ratio_squared_f32x8, coeff_5_f32x8);
|
|
216
|
+
polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, ratio_squared_f32x8, coeff_6_f32x8);
|
|
217
|
+
polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, ratio_squared_f32x8, coeff_7_f32x8);
|
|
218
|
+
polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, ratio_squared_f32x8, coeff_8_f32x8);
|
|
219
|
+
|
|
220
|
+
// Compute the result using masks for quadrant_f32x8 adjustments
|
|
221
|
+
__m256 results_f32x8 = _mm256_fmadd_ps(ratio_cubed_f32x8, polynomials_f32x8, ratio_f32x8);
|
|
222
|
+
|
|
223
|
+
// Compute quadrant_f32x8 value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
|
|
224
224
|
// -2 for x<0 && !swap, -1 for x<0 && swap
|
|
225
|
-
__m256
|
|
226
|
-
__m256
|
|
227
|
-
|
|
228
|
-
__m256
|
|
229
|
-
__m256
|
|
230
|
-
|
|
225
|
+
__m256 quadrant_f32x8 = _mm256_setzero_ps();
|
|
226
|
+
__m256 neg_two_f32x8 = _mm256_set1_ps(-2.0f);
|
|
227
|
+
quadrant_f32x8 = _mm256_blendv_ps(quadrant_f32x8, neg_two_f32x8, xs_negative_mask_f32x8);
|
|
228
|
+
__m256 one_f32x8 = _mm256_set1_ps(1.0f);
|
|
229
|
+
__m256 quadrant_incremented_f32x8 = _mm256_add_ps(quadrant_f32x8, one_f32x8);
|
|
230
|
+
quadrant_f32x8 = _mm256_blendv_ps(quadrant_f32x8, quadrant_incremented_f32x8, swap_mask_f32x8);
|
|
231
231
|
|
|
232
|
-
// Adjust for
|
|
233
|
-
__m256
|
|
234
|
-
|
|
232
|
+
// Adjust for quadrant_f32x8: result += quadrant_f32x8 * π/2
|
|
233
|
+
__m256 pi_half_f32x8 = _mm256_set1_ps(1.5707963267948966f);
|
|
234
|
+
results_f32x8 = _mm256_fmadd_ps(quadrant_f32x8, pi_half_f32x8, results_f32x8);
|
|
235
235
|
|
|
236
236
|
// Transfer sign from x (XOR with sign bit of x_input)
|
|
237
|
-
__m256
|
|
238
|
-
|
|
237
|
+
__m256 xs_sign_bits_f32x8 = _mm256_and_ps(xs_inputs, sign_mask_f32x8);
|
|
238
|
+
results_f32x8 = _mm256_xor_ps(results_f32x8, xs_sign_bits_f32x8);
|
|
239
239
|
|
|
240
240
|
// Transfer sign from y (XOR with sign bit of y_input)
|
|
241
|
-
__m256
|
|
242
|
-
|
|
241
|
+
__m256 ys_sign_bits_f32x8 = _mm256_and_ps(ys_inputs, sign_mask_f32x8);
|
|
242
|
+
results_f32x8 = _mm256_xor_ps(results_f32x8, ys_sign_bits_f32x8);
|
|
243
243
|
|
|
244
|
-
return
|
|
244
|
+
return results_f32x8;
|
|
245
245
|
}
|
|
246
246
|
|
|
247
247
|
/**
 *  @brief Approximates sine for four packed f64 angles given in radians.
 *
 *  Steps: (1) q = round(angle / π); (2) reduced = angle - q·π, with π split into a high and
 *  a low constant and each part subtracted by its own fused multiply-add; (3) the reduced angle
 *  is negated in lanes where q is odd; (4) the odd polynomial `x + x³ · P(x²)` is evaluated,
 *  with P split into two halves that are combined through x⁸.
 *
 *  The parity of q is tested in floating point. Converting q to int32 first would make
 *  `_mm256_cvttpd_epi32` return 0x80000000 (an even value) for every quotient outside the
 *  int32 range, silently dropping the sign flip for large angles.
 *
 *  @param angles_radians Four input angles, in radians.
 *  @return Four sine approximations. Lanes whose input compares equal to zero (+0.0 or -0.0)
 *          are returned as +0.0; NaN lanes are not masked.
 */
NK_INTERNAL __m256d nk_sin_f64x4_haswell_(__m256d const angles_radians) {
    // Constants for argument reduction
    __m256d const pi_high_f64x4 = _mm256_set1_pd(3.141592653589793116);         // High-digits part of π
    __m256d const pi_low_f64x4 = _mm256_set1_pd(1.2246467991473532072e-16);     // Low-digits part of π
    __m256d const pi_reciprocal_f64x4 = _mm256_set1_pd(0.31830988618379067154); // 1/π

    // Polynomial coefficients for sine approximation (minimax polynomial)
    __m256d const coeff_0_f64x4 = _mm256_set1_pd(+0.00833333333333332974823815);
    __m256d const coeff_1_f64x4 = _mm256_set1_pd(-0.000198412698412696162806809);
    __m256d const coeff_2_f64x4 = _mm256_set1_pd(+2.75573192239198747630416e-06);
    __m256d const coeff_3_f64x4 = _mm256_set1_pd(-2.50521083763502045810755e-08);
    __m256d const coeff_4_f64x4 = _mm256_set1_pd(+1.60590430605664501629054e-10);
    __m256d const coeff_5_f64x4 = _mm256_set1_pd(-7.64712219118158833288484e-13);
    __m256d const coeff_6_f64x4 = _mm256_set1_pd(+2.81009972710863200091251e-15);
    __m256d const coeff_7_f64x4 = _mm256_set1_pd(-7.97255955009037868891952e-18);
    __m256d const coeff_8_f64x4 = _mm256_set1_pd(-0.166666666666666657414808);

    // Compute rounded_quotients_f64x4 = round(angle / π)
    __m256d const quotients_f64x4 = _mm256_mul_pd(angles_radians, pi_reciprocal_f64x4);
    __m256d const rounded_quotients_f64x4 = _mm256_round_pd(quotients_f64x4,
                                                            _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);

    // Reduce the angle: angle - (rounded_quotients_f64x4 * π_high + rounded_quotients_f64x4 * π_low)
    __m256d angles_f64x4 = angles_radians;
    angles_f64x4 = _mm256_fnmadd_pd(rounded_quotients_f64x4, pi_high_f64x4, angles_f64x4);
    angles_f64x4 = _mm256_fnmadd_pd(rounded_quotients_f64x4, pi_low_f64x4, angles_f64x4);

    // If rounded_quotients_f64x4 is odd, negate the angle.
    // An integer-valued q is odd exactly when q/2 is not an integer; halving is exact, so the
    // test below yields the same mask as an integer `q & 1` whenever q fits in int32, and stays
    // meaningful beyond that range. NaN lanes compare "ordered not-equal" as false and are left
    // untouched, which matches the previous integer path (indefinite value, bit 0 clear).
    __m256d const halved_quotients_f64x4 = _mm256_mul_pd(rounded_quotients_f64x4, _mm256_set1_pd(0.5));
    __m256d const floored_halves_f64x4 = _mm256_round_pd(halved_quotients_f64x4,
                                                         _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
    __m256d const odd_mask_f64x4 = _mm256_cmp_pd(halved_quotients_f64x4, floored_halves_f64x4, _CMP_NEQ_OQ);
    __m256d const negated_angles_f64x4 = _mm256_sub_pd(_mm256_setzero_pd(), angles_f64x4);
    angles_f64x4 = _mm256_blendv_pd(angles_f64x4, negated_angles_f64x4, odd_mask_f64x4);

    // Powers of the reduced angle: x², x³, x⁴, x⁸
    __m256d const angles_squared_f64x4 = _mm256_mul_pd(angles_f64x4, angles_f64x4);
    __m256d const angles_cubed_f64x4 = _mm256_mul_pd(angles_f64x4, angles_squared_f64x4);
    __m256d const angles_quadratic_f64x4 = _mm256_mul_pd(angles_squared_f64x4, angles_squared_f64x4);
    __m256d const angles_octic_f64x4 = _mm256_mul_pd(angles_quadratic_f64x4, angles_quadratic_f64x4);

    // Compute higher-degree polynomial terms
    __m256d const poly_67_f64x4 = _mm256_fmadd_pd(angles_squared_f64x4, coeff_7_f64x4, coeff_6_f64x4);
    __m256d const poly_45_f64x4 = _mm256_fmadd_pd(angles_squared_f64x4, coeff_5_f64x4, coeff_4_f64x4);
    __m256d const poly_4567_f64x4 = _mm256_fmadd_pd(angles_quadratic_f64x4, poly_67_f64x4, poly_45_f64x4);

    // Compute lower-degree polynomial terms
    __m256d const poly_23_f64x4 = _mm256_fmadd_pd(angles_squared_f64x4, coeff_3_f64x4, coeff_2_f64x4);
    __m256d const poly_01_f64x4 = _mm256_fmadd_pd(angles_squared_f64x4, coeff_1_f64x4, coeff_0_f64x4);
    __m256d const poly_0123_f64x4 = _mm256_fmadd_pd(angles_quadratic_f64x4, poly_23_f64x4, poly_01_f64x4);

    // Combine polynomial terms: x + x³ * (coeff_8 + x² * (poly_0123 + x⁸ * poly_4567))
    __m256d results_f64x4 = _mm256_fmadd_pd(angles_octic_f64x4, poly_4567_f64x4, poly_0123_f64x4);
    results_f64x4 = _mm256_fmadd_pd(results_f64x4, angles_squared_f64x4, coeff_8_f64x4);
    results_f64x4 = _mm256_fmadd_pd(results_f64x4, angles_cubed_f64x4, angles_f64x4);

    // Clear every lane whose input compares equal to zero (either sign), producing +0.0 there.
    // The predicate is "not-equal or unordered", so NaN inputs keep their computed result.
    __m256d const non_zero_mask_f64x4 = _mm256_cmp_pd(angles_radians, _mm256_setzero_pd(), _CMP_NEQ_UQ);
    results_f64x4 = _mm256_and_pd(results_f64x4, non_zero_mask_f64x4);
    return results_f64x4;
}
|
|
310
311
|
|
|
311
312
|
/**
 *  @brief Approximates cosine for four packed f64 angles given in radians.
 *
 *  Steps: (1) k = round(angle / π - 0.5); (2) reduced = angle - (2k + 1)·(π/2), with π/2 split
 *  into a high and a low constant; (3) the reduced angle is negated in lanes where k is even;
 *  (4) the same odd polynomial `x + x³ · P(x²)` used by the sine kernel is evaluated.
 *
 *  The parity of k is tested in floating point. The integer test `(2k + 1) & 2` on the result of
 *  `_mm256_cvttpd_epi32` sees 0x80000000 (bit 1 clear) for every value outside the int32 range,
 *  so large angles would always be negated regardless of k.
 *
 *  @param angles_radians Four input angles, in radians.
 *  @return Four cosine approximations.
 */
NK_INTERNAL __m256d nk_cos_f64x4_haswell_(__m256d const angles_radians) {
    // Constants for argument reduction
    __m256d const pi_high_half_f64x4 = _mm256_set1_pd(3.141592653589793116 * 0.5);      // High-digits part of π/2
    __m256d const pi_low_half_f64x4 = _mm256_set1_pd(1.2246467991473532072e-16 * 0.5); // Low-digits part of π/2
    __m256d const pi_reciprocal_f64x4 = _mm256_set1_pd(0.31830988618379067154);        // 1/π

    // Polynomial coefficients for cosine approximation
    __m256d const coeff_0_f64x4 = _mm256_set1_pd(+0.00833333333333332974823815);
    __m256d const coeff_1_f64x4 = _mm256_set1_pd(-0.000198412698412696162806809);
    __m256d const coeff_2_f64x4 = _mm256_set1_pd(+2.75573192239198747630416e-06);
    __m256d const coeff_3_f64x4 = _mm256_set1_pd(-2.50521083763502045810755e-08);
    __m256d const coeff_4_f64x4 = _mm256_set1_pd(+1.60590430605664501629054e-10);
    __m256d const coeff_5_f64x4 = _mm256_set1_pd(-7.64712219118158833288484e-13);
    __m256d const coeff_6_f64x4 = _mm256_set1_pd(+2.81009972710863200091251e-15);
    __m256d const coeff_7_f64x4 = _mm256_set1_pd(-7.97255955009037868891952e-18);
    __m256d const coeff_8_f64x4 = _mm256_set1_pd(-0.166666666666666657414808);

    // Compute rounded_quotients_f64x4 = 2 * round(angle / π - 0.5) + 1
    // Use fmsub: a*b - c = angles_f64x4 * (1/π) - 0.5
    __m256d const quotients_f64x4 = _mm256_fmsub_pd(angles_radians, pi_reciprocal_f64x4, _mm256_set1_pd(0.5));
    __m256d const half_turns_f64x4 = _mm256_round_pd(quotients_f64x4, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    __m256d const rounded_quotients_f64x4 = _mm256_fmadd_pd( //
        _mm256_set1_pd(2.0),                                 //
        half_turns_f64x4,                                    //
        _mm256_set1_pd(1.0));

    // Reduce the angle: angle - (rounded_quotients_f64x4 * π_high_half + rounded_quotients_f64x4 * π_low_half)
    __m256d angles_f64x4 = angles_radians;
    angles_f64x4 = _mm256_fnmadd_pd(rounded_quotients_f64x4, pi_high_half_f64x4, angles_f64x4);
    angles_f64x4 = _mm256_fnmadd_pd(rounded_quotients_f64x4, pi_low_half_f64x4, angles_f64x4);

    // If (rounded_quotients_f64x4 & 2) == 0, negate the angle.
    // With rounded_quotients = 2k + 1, bit 1 is clear exactly when k is even, and an integer-valued k
    // is even exactly when k/2 is an integer. Halving is exact, so this floating-point test yields
    // the same mask as the integer one whenever 2k + 1 fits in int32, and stays meaningful beyond.
    __m256d const halved_turns_f64x4 = _mm256_mul_pd(half_turns_f64x4, _mm256_set1_pd(0.5));
    __m256d const floored_halves_f64x4 = _mm256_round_pd(halved_turns_f64x4,
                                                         _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
    __m256d const flip_mask_f64x4 = _mm256_cmp_pd(halved_turns_f64x4, floored_halves_f64x4, _CMP_EQ_OQ);
    __m256d const negated_angles_f64x4 = _mm256_sub_pd(_mm256_setzero_pd(), angles_f64x4);
    angles_f64x4 = _mm256_blendv_pd(angles_f64x4, negated_angles_f64x4, flip_mask_f64x4);

    // Powers of the reduced angle: x², x³, x⁴, x⁸
    __m256d const angles_squared_f64x4 = _mm256_mul_pd(angles_f64x4, angles_f64x4);
    __m256d const angles_cubed_f64x4 = _mm256_mul_pd(angles_f64x4, angles_squared_f64x4);
    __m256d const angles_quadratic_f64x4 = _mm256_mul_pd(angles_squared_f64x4, angles_squared_f64x4);
    __m256d const angles_octic_f64x4 = _mm256_mul_pd(angles_quadratic_f64x4, angles_quadratic_f64x4);

    // Compute higher-degree polynomial terms
    __m256d const poly_67_f64x4 = _mm256_fmadd_pd(angles_squared_f64x4, coeff_7_f64x4, coeff_6_f64x4);
    __m256d const poly_45_f64x4 = _mm256_fmadd_pd(angles_squared_f64x4, coeff_5_f64x4, coeff_4_f64x4);
    __m256d const poly_4567_f64x4 = _mm256_fmadd_pd(angles_quadratic_f64x4, poly_67_f64x4, poly_45_f64x4);

    // Compute lower-degree polynomial terms
    __m256d const poly_23_f64x4 = _mm256_fmadd_pd(angles_squared_f64x4, coeff_3_f64x4, coeff_2_f64x4);
    __m256d const poly_01_f64x4 = _mm256_fmadd_pd(angles_squared_f64x4, coeff_1_f64x4, coeff_0_f64x4);
    __m256d const poly_0123_f64x4 = _mm256_fmadd_pd(angles_quadratic_f64x4, poly_23_f64x4, poly_01_f64x4);

    // Combine polynomial terms: x + x³ * (coeff_8 + x² * (poly_0123 + x⁸ * poly_4567))
    __m256d results_f64x4 = _mm256_fmadd_pd(angles_octic_f64x4, poly_4567_f64x4, poly_0123_f64x4);
    results_f64x4 = _mm256_fmadd_pd(results_f64x4, angles_squared_f64x4, coeff_8_f64x4);
    results_f64x4 = _mm256_fmadd_pd(results_f64x4, angles_cubed_f64x4, angles_f64x4);
    return results_f64x4;
}
|
|
372
373
|
|
|
373
374
|
/**
 *  @brief Approximates arctangent for four packed f64 values.
 *
 *  Works on magnitudes: lanes above 1 are replaced by their reciprocal, the polynomial
 *  `x + x³ · P(x²)` (19 coefficients in P) is evaluated, reciprocal lanes are mapped back
 *  through `π/2 - result`, and lanes that were negative on input are negated at the end.
 *
 *  @param inputs Four input values.
 *  @return Four arctangent approximations, in radians.
 */
NK_INTERNAL __m256d nk_atan_f64x4_haswell_(__m256d const inputs) {
    // Coefficients of P, ordered from the highest power down to the constant term,
    // which is the order in which Horner's scheme consumes them.
    double const horner_coefficients[19] = {
        -1.88796008463073496563746e-05, +0.000209850076645816976906797, -0.00110611831486672482563471,
        +0.00370026744188713119232403,  -0.00889896195887655491740809,  +0.016599329773529201970117,
        -0.0254517624932312641616861,   +0.0337852580001353069993897,   -0.0407629191276836500001934,
        +0.0466667150077840625632675,   -0.0523674852303482457616113,   +0.0587666392926673580854313,
        -0.0666573579361080525984562,   +0.0769219538311769618355029,   -0.090908995008245008229153,
        +0.111111105648261418443745,    -0.14285714266771329383765,     +0.199999999996591265594148,
        -0.333333333333311110369124,
    };
    __m256d const zeros_f64x4 = _mm256_setzero_pd();
    __m256d const ones_f64x4 = _mm256_set1_pd(1.0);

    // Remember which lanes were negative, then continue with magnitudes only
    __m256d const was_negative_f64x4 = _mm256_cmp_pd(inputs, zeros_f64x4, _CMP_LT_OS);
    __m256d const magnitudes_f64x4 = _mm256_andnot_pd(_mm256_set1_pd(-0.0), inputs);

    // Lanes above 1 switch to the reciprocal.
    // Note: For f64, we keep VDIVPD since RCPPD doesn't exist and Newton-Raphson
    // would need 2 iterations for sufficient precision (~44 bits needed for f64)
    __m256d const above_one_f64x4 = _mm256_cmp_pd(magnitudes_f64x4, ones_f64x4, _CMP_GT_OS);
    __m256d const reduced_f64x4 = _mm256_blendv_pd(magnitudes_f64x4, _mm256_div_pd(ones_f64x4, magnitudes_f64x4),
                                                   above_one_f64x4);

    // Powers needed by the polynomial
    __m256d const squares_f64x4 = _mm256_mul_pd(reduced_f64x4, reduced_f64x4);
    __m256d const cubes_f64x4 = _mm256_mul_pd(reduced_f64x4, squares_f64x4);

    // Horner's method, one fused multiply-add per coefficient.
    // For large arrays, out-of-order execution across loop iterations already hides
    // FMA latency. Estrin's scheme was tested but showed minimal improvement (~1%)
    // while adding complexity. Keeping Horner for maintainability.
    __m256d horner_f64x4 = _mm256_set1_pd(horner_coefficients[0]);
    for (int term = 1; term < 19; ++term) {
        horner_f64x4 = _mm256_fmadd_pd(horner_f64x4, squares_f64x4, _mm256_set1_pd(horner_coefficients[term]));
    }

    // x + x³ · P(x²)
    __m256d const folded_f64x4 = _mm256_fmadd_pd(cubes_f64x4, horner_f64x4, reduced_f64x4);

    // Undo the reciprocal: π/2 - result in the lanes that used it
    __m256d const complemented_f64x4 = _mm256_sub_pd(_mm256_set1_pd(1.5707963267948966), folded_f64x4);
    __m256d const unsigned_f64x4 = _mm256_blendv_pd(folded_f64x4, complemented_f64x4, above_one_f64x4);

    // Restore the sign: 0 - result in the lanes that were negative
    __m256d const flipped_f64x4 = _mm256_sub_pd(zeros_f64x4, unsigned_f64x4);
    return _mm256_blendv_pd(unsigned_f64x4, flipped_f64x4, was_negative_f64x4);
}
|
|
449
450
|
|
|
450
451
|
/**
 *  @brief Approximates the two-argument arctangent for four packed f64 (y, x) pairs.
 *
 *  Both inputs are reduced to magnitudes and, where |y| > |x|, swapped (with the new numerator
 *  negated) so that the ratio fed to the polynomial has magnitude at most 1. The polynomial
 *  `r + r³ · P(r²)` uses the same 19 coefficients as the single-argument kernel. A multiple of
 *  π/2 selected from the sign of x and the swap decision is then added, and the sign bits of
 *  both original inputs are XOR-ed into the result.
 *
 *  @param ys_inputs Four y values (numerators).
 *  @param xs_inputs Four x values (denominators).
 *  @return Four angle approximations, in radians.
 */
NK_INTERNAL __m256d nk_atan2_f64x4_haswell_(__m256d const ys_inputs, __m256d const xs_inputs) {
    // Coefficients of P, ordered from the highest power down to the constant term,
    // which is the order in which Horner's scheme consumes them.
    double const horner_coefficients[19] = {
        -1.88796008463073496563746e-05, +0.000209850076645816976906797, -0.00110611831486672482563471,
        +0.00370026744188713119232403,  -0.00889896195887655491740809,  +0.016599329773529201970117,
        -0.0254517624932312641616861,   +0.0337852580001353069993897,   -0.0407629191276836500001934,
        +0.0466667150077840625632675,   -0.0523674852303482457616113,   +0.0587666392926673580854313,
        -0.0666573579361080525984562,   +0.0769219538311769618355029,   -0.090908995008245008229153,
        +0.111111105648261418443745,    -0.14285714266771329383765,     +0.199999999996591265594148,
        -0.333333333333311110369124,
    };
    __m256d const sign_bit_f64x4 = _mm256_set1_pd(-0.0);
    __m256d const zeros_f64x4 = _mm256_setzero_pd();

    // Record the sign of x, then work with magnitudes
    __m256d const x_negative_f64x4 = _mm256_cmp_pd(xs_inputs, zeros_f64x4, _CMP_LT_OS);
    __m256d const abs_xs_f64x4 = _mm256_andnot_pd(sign_bit_f64x4, xs_inputs);
    __m256d const abs_ys_f64x4 = _mm256_andnot_pd(sign_bit_f64x4, ys_inputs);

    // Where |y| > |x|, divide -|x| by |y| instead, keeping the numerator the smaller magnitude
    __m256d const swapped_f64x4 = _mm256_cmp_pd(abs_ys_f64x4, abs_xs_f64x4, _CMP_GT_OS);
    __m256d const denominators_f64x4 = _mm256_blendv_pd(abs_xs_f64x4, abs_ys_f64x4, swapped_f64x4);
    __m256d const numerators_f64x4 = _mm256_blendv_pd(abs_ys_f64x4, _mm256_sub_pd(zeros_f64x4, abs_xs_f64x4),
                                                      swapped_f64x4);

    // Ratio and the powers needed by the polynomial
    __m256d const ratios_f64x4 = _mm256_div_pd(numerators_f64x4, denominators_f64x4);
    __m256d const squares_f64x4 = _mm256_mul_pd(ratios_f64x4, ratios_f64x4);
    __m256d const cubes_f64x4 = _mm256_mul_pd(ratios_f64x4, squares_f64x4);

    // Horner's method, one fused multiply-add per coefficient
    __m256d horner_f64x4 = _mm256_set1_pd(horner_coefficients[0]);
    for (int term = 1; term < 19; ++term) {
        horner_f64x4 = _mm256_fmadd_pd(horner_f64x4, squares_f64x4, _mm256_set1_pd(horner_coefficients[term]));
    }

    // r + r³ · P(r²)
    __m256d const partial_f64x4 = _mm256_fmadd_pd(cubes_f64x4, horner_f64x4, ratios_f64x4);

    // Multiple of π/2 to add: 0 for x>=0 && !swap, 1 for x>=0 && swap,
    // -2 for x<0 && !swap, -1 for x<0 && swap
    __m256d const base_quadrants_f64x4 = _mm256_blendv_pd(zeros_f64x4, _mm256_set1_pd(-2.0), x_negative_f64x4);
    __m256d const bumped_quadrants_f64x4 = _mm256_add_pd(base_quadrants_f64x4, _mm256_set1_pd(1.0));
    __m256d const quadrants_f64x4 = _mm256_blendv_pd(base_quadrants_f64x4, bumped_quadrants_f64x4, swapped_f64x4);
    __m256d const shifted_f64x4 = _mm256_fmadd_pd(quadrants_f64x4, _mm256_set1_pd(1.5707963267948966),
                                                  partial_f64x4);

    // Transfer the signs of x and y: XOR-ing with each sign bit in turn equals one XOR
    // with the sign bit of (x XOR y)
    __m256d const combined_signs_f64x4 = _mm256_and_pd(_mm256_xor_pd(xs_inputs, ys_inputs), sign_bit_f64x4);
    return _mm256_xor_pd(shifted_f64x4, combined_signs_f64x4);
}
|
|
537
538
|
|
|
538
539
|
NK_PUBLIC void nk_each_sin_f32_haswell(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
|
|
539
540
|
nk_size_t i = 0;
|
|
540
541
|
for (; i + 8 <= n; i += 8) {
|
|
541
|
-
__m256
|
|
542
|
-
__m256
|
|
543
|
-
_mm256_storeu_ps(outs + i,
|
|
542
|
+
__m256 angles_f32x8 = _mm256_loadu_ps(ins + i);
|
|
543
|
+
__m256 results_f32x8 = nk_sin_f32x8_haswell_(angles_f32x8);
|
|
544
|
+
_mm256_storeu_ps(outs + i, results_f32x8);
|
|
544
545
|
}
|
|
545
546
|
if (i < n) {
|
|
546
547
|
nk_size_t remaining = n - i;
|
|
@@ -555,9 +556,9 @@ NK_PUBLIC void nk_each_sin_f32_haswell(nk_f32_t const *ins, nk_size_t n, nk_f32_
|
|
|
555
556
|
NK_PUBLIC void nk_each_cos_f32_haswell(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
|
|
556
557
|
nk_size_t i = 0;
|
|
557
558
|
for (; i + 8 <= n; i += 8) {
|
|
558
|
-
__m256
|
|
559
|
-
__m256
|
|
560
|
-
_mm256_storeu_ps(outs + i,
|
|
559
|
+
__m256 angles_f32x8 = _mm256_loadu_ps(ins + i);
|
|
560
|
+
__m256 results_f32x8 = nk_cos_f32x8_haswell_(angles_f32x8);
|
|
561
|
+
_mm256_storeu_ps(outs + i, results_f32x8);
|
|
561
562
|
}
|
|
562
563
|
if (i < n) {
|
|
563
564
|
nk_size_t remaining = n - i;
|
|
@@ -572,9 +573,9 @@ NK_PUBLIC void nk_each_cos_f32_haswell(nk_f32_t const *ins, nk_size_t n, nk_f32_
|
|
|
572
573
|
NK_PUBLIC void nk_each_atan_f32_haswell(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
|
|
573
574
|
nk_size_t i = 0;
|
|
574
575
|
for (; i + 8 <= n; i += 8) {
|
|
575
|
-
__m256
|
|
576
|
-
__m256
|
|
577
|
-
_mm256_storeu_ps(outs + i,
|
|
576
|
+
__m256 values_f32x8 = _mm256_loadu_ps(ins + i);
|
|
577
|
+
__m256 results_f32x8 = nk_atan_f32x8_haswell_(values_f32x8);
|
|
578
|
+
_mm256_storeu_ps(outs + i, results_f32x8);
|
|
578
579
|
}
|
|
579
580
|
if (i < n) {
|
|
580
581
|
nk_size_t remaining = n - i;
|
|
@@ -589,9 +590,9 @@ NK_PUBLIC void nk_each_atan_f32_haswell(nk_f32_t const *ins, nk_size_t n, nk_f32
|
|
|
589
590
|
NK_PUBLIC void nk_each_sin_f64_haswell(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
|
|
590
591
|
nk_size_t i = 0;
|
|
591
592
|
for (; i + 4 <= n; i += 4) {
|
|
592
|
-
__m256d
|
|
593
|
-
__m256d
|
|
594
|
-
_mm256_storeu_pd(outs + i,
|
|
593
|
+
__m256d angles_f64x4 = _mm256_loadu_pd(ins + i);
|
|
594
|
+
__m256d results_f64x4 = nk_sin_f64x4_haswell_(angles_f64x4);
|
|
595
|
+
_mm256_storeu_pd(outs + i, results_f64x4);
|
|
595
596
|
}
|
|
596
597
|
if (i < n) {
|
|
597
598
|
nk_size_t remaining = n - i;
|
|
@@ -606,9 +607,9 @@ NK_PUBLIC void nk_each_sin_f64_haswell(nk_f64_t const *ins, nk_size_t n, nk_f64_
|
|
|
606
607
|
NK_PUBLIC void nk_each_cos_f64_haswell(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
|
|
607
608
|
nk_size_t i = 0;
|
|
608
609
|
for (; i + 4 <= n; i += 4) {
|
|
609
|
-
__m256d
|
|
610
|
-
__m256d
|
|
611
|
-
_mm256_storeu_pd(outs + i,
|
|
610
|
+
__m256d angles_f64x4 = _mm256_loadu_pd(ins + i);
|
|
611
|
+
__m256d results_f64x4 = nk_cos_f64x4_haswell_(angles_f64x4);
|
|
612
|
+
_mm256_storeu_pd(outs + i, results_f64x4);
|
|
612
613
|
}
|
|
613
614
|
if (i < n) {
|
|
614
615
|
nk_size_t remaining = n - i;
|
|
@@ -623,9 +624,9 @@ NK_PUBLIC void nk_each_cos_f64_haswell(nk_f64_t const *ins, nk_size_t n, nk_f64_
|
|
|
623
624
|
NK_PUBLIC void nk_each_atan_f64_haswell(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
|
|
624
625
|
nk_size_t i = 0;
|
|
625
626
|
for (; i + 4 <= n; i += 4) {
|
|
626
|
-
__m256d
|
|
627
|
-
__m256d
|
|
628
|
-
_mm256_storeu_pd(outs + i,
|
|
627
|
+
__m256d values_f64x4 = _mm256_loadu_pd(ins + i);
|
|
628
|
+
__m256d results_f64x4 = nk_atan_f64x4_haswell_(values_f64x4);
|
|
629
|
+
_mm256_storeu_pd(outs + i, results_f64x4);
|
|
629
630
|
}
|
|
630
631
|
if (i < n) {
|
|
631
632
|
nk_size_t remaining = n - i;
|