numkong 7.0.0 → 7.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +197 -124
- package/binding.gyp +34 -484
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
|
@@ -9,20 +9,19 @@
|
|
|
9
9
|
*
|
|
10
10
|
* @section trigonometry_neon_instructions ARM NEON Instructions
|
|
11
11
|
*
|
|
12
|
-
* Intrinsic
|
|
13
|
-
*
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
17
|
-
*
|
|
18
|
-
*
|
|
19
|
-
*
|
|
20
|
-
*
|
|
21
|
-
*
|
|
22
|
-
*
|
|
23
|
-
*
|
|
24
|
-
*
|
|
25
|
-
* vdivq_f64 FDIV (V.2D, V.2D, V.2D) 15cy 0.5/cy 0.5/cy
|
|
12
|
+
* Intrinsic Instruction A76 M5
|
|
13
|
+
* vfmaq_f32 FMLA (V.4S, V.4S, V.4S) 4cy @ 2p 3cy @ 4p
|
|
14
|
+
* vfmsq_f32 FMLS (V.4S, V.4S, V.4S) 4cy @ 2p 3cy @ 4p
|
|
15
|
+
* vmulq_f32 FMUL (V.4S, V.4S, V.4S) 3cy @ 2p 3cy @ 4p
|
|
16
|
+
* vaddq_f32 FADD (V.4S, V.4S, V.4S) 2cy @ 2p 2cy @ 4p
|
|
17
|
+
* vsubq_f32 FSUB (V.4S, V.4S, V.4S) 2cy @ 2p 2cy @ 4p
|
|
18
|
+
* vcvtnq_s32_f32 FCVTNS (V.4S, V.4S) 3cy @ 2p 3cy @ 4p
|
|
19
|
+
* vcvtq_f32_s32 SCVTF (V.4S, V.4S) 3cy @ 2p 3cy @ 4p
|
|
20
|
+
* vbslq_f32 BSL (V.16B, V.16B, V.16B) 1cy @ 2p 2cy @ 4p
|
|
21
|
+
* vrecpeq_f32 FRECPE (V.4S, V.4S) 2cy @ 2p 3cy @ 1p
|
|
22
|
+
* vrecpsq_f32 FRECPS (V.4S, V.4S, V.4S) 4cy @ 2p 3cy @ 2p
|
|
23
|
+
* vfmaq_f64 FMLA (V.2D, V.2D, V.2D) 4cy @ 2p 3cy @ 4p
|
|
24
|
+
* vdivq_f64 FDIV (V.2D, V.2D, V.2D) 12cy @ 1p 7cy @ 1p
|
|
26
25
|
*
|
|
27
26
|
* Polynomial approximations for sin/cos/atan are FMA-dominated. On 4-pipe cores (Apple M4+,
|
|
28
27
|
* Graviton3+, Oryon), FMA throughput is 4/cy with 4cy latency.
|
|
@@ -56,478 +55,478 @@ extern "C" {
|
|
|
56
55
|
|
|
57
56
|
NK_INTERNAL float32x4_t nk_sin_f32x4_neon_(float32x4_t const angles_radians) {
|
|
58
57
|
// Cody-Waite constants for argument reduction
|
|
59
|
-
float32x4_t const
|
|
60
|
-
float32x4_t const
|
|
61
|
-
float32x4_t const
|
|
58
|
+
float32x4_t const pi_high_f32x4 = vdupq_n_f32(3.1415927f);
|
|
59
|
+
float32x4_t const pi_low_f32x4 = vdupq_n_f32(-8.742278e-8f);
|
|
60
|
+
float32x4_t const pi_reciprocal_f32x4 = vdupq_n_f32(0.31830988618379067154f);
|
|
62
61
|
// Degree-9 minimax coefficients
|
|
63
|
-
float32x4_t const
|
|
64
|
-
float32x4_t const
|
|
65
|
-
float32x4_t const
|
|
66
|
-
float32x4_t const
|
|
62
|
+
float32x4_t const coeff_9_f32x4 = vdupq_n_f32(+2.7557319224e-6f);
|
|
63
|
+
float32x4_t const coeff_7_f32x4 = vdupq_n_f32(-1.9841269841e-4f);
|
|
64
|
+
float32x4_t const coeff_5_f32x4 = vdupq_n_f32(+8.3333293855e-3f);
|
|
65
|
+
float32x4_t const coeff_3_f32x4 = vdupq_n_f32(-1.6666666641e-1f);
|
|
67
66
|
|
|
68
|
-
// Compute (
|
|
69
|
-
float32x4_t
|
|
70
|
-
int32x4_t
|
|
71
|
-
float32x4_t
|
|
67
|
+
// Compute multiples_of_pi_i32x4 = round(angle / π) using vcvtnq, which rounds to nearest
|
|
68
|
+
float32x4_t quotients_f32x4 = vmulq_f32(angles_radians, pi_reciprocal_f32x4);
|
|
69
|
+
int32x4_t multiples_of_pi_i32x4 = vcvtnq_s32_f32(quotients_f32x4);
|
|
70
|
+
float32x4_t rounded_quotients_f32x4 = vcvtq_f32_s32(multiples_of_pi_i32x4);
|
|
72
71
|
|
|
73
72
|
// Cody-Waite range reduction
|
|
74
|
-
float32x4_t
|
|
75
|
-
|
|
76
|
-
float32x4_t const
|
|
77
|
-
float32x4_t const
|
|
73
|
+
float32x4_t angles_f32x4 = vfmsq_f32(angles_radians, rounded_quotients_f32x4, pi_high_f32x4);
|
|
74
|
+
angles_f32x4 = vfmsq_f32(angles_f32x4, rounded_quotients_f32x4, pi_low_f32x4);
|
|
75
|
+
float32x4_t const angles_squared_f32x4 = vmulq_f32(angles_f32x4, angles_f32x4);
|
|
76
|
+
float32x4_t const angles_cubed_f32x4 = vmulq_f32(angles_f32x4, angles_squared_f32x4);
|
|
78
77
|
|
|
79
78
|
// Degree-9 polynomial via Horner's method
|
|
80
|
-
float32x4_t
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
float32x4_t
|
|
85
|
-
|
|
86
|
-
// If
|
|
87
|
-
int32x4_t
|
|
88
|
-
uint32x4_t
|
|
89
|
-
float32x4_t
|
|
90
|
-
|
|
91
|
-
return
|
|
79
|
+
float32x4_t polynomials_f32x4 = coeff_9_f32x4;
|
|
80
|
+
polynomials_f32x4 = vfmaq_f32(coeff_7_f32x4, polynomials_f32x4, angles_squared_f32x4);
|
|
81
|
+
polynomials_f32x4 = vfmaq_f32(coeff_5_f32x4, polynomials_f32x4, angles_squared_f32x4);
|
|
82
|
+
polynomials_f32x4 = vfmaq_f32(coeff_3_f32x4, polynomials_f32x4, angles_squared_f32x4);
|
|
83
|
+
float32x4_t results_f32x4 = vfmaq_f32(angles_f32x4, angles_cubed_f32x4, polynomials_f32x4);
|
|
84
|
+
|
|
85
|
+
// If multiples_of_pi_i32x4 is odd, flip the sign
|
|
86
|
+
int32x4_t parity_i32x4 = vandq_s32(multiples_of_pi_i32x4, vdupq_n_s32(1));
|
|
87
|
+
uint32x4_t odd_mask_u32x4 = vceqq_s32(parity_i32x4, vdupq_n_s32(1));
|
|
88
|
+
float32x4_t negated_f32x4 = vnegq_f32(results_f32x4);
|
|
89
|
+
results_f32x4 = vbslq_f32(odd_mask_u32x4, negated_f32x4, results_f32x4);
|
|
90
|
+
return results_f32x4;
|
|
92
91
|
}
|
|
93
92
|
|
|
94
93
|
NK_INTERNAL float32x4_t nk_cos_f32x4_neon_(float32x4_t const angles_radians) {
|
|
95
94
|
// Cody-Waite constants for argument reduction
|
|
96
|
-
float32x4_t const
|
|
97
|
-
float32x4_t const
|
|
98
|
-
float32x4_t const
|
|
99
|
-
float32x4_t const
|
|
95
|
+
float32x4_t const pi_high_f32x4 = vdupq_n_f32(3.1415927f);
|
|
96
|
+
float32x4_t const pi_low_f32x4 = vdupq_n_f32(-8.742278e-8f);
|
|
97
|
+
float32x4_t const pi_half_f32x4 = vdupq_n_f32(1.57079632679489661923f);
|
|
98
|
+
float32x4_t const pi_reciprocal_f32x4 = vdupq_n_f32(0.31830988618379067154f);
|
|
100
99
|
// Degree-9 minimax coefficients
|
|
101
|
-
float32x4_t const
|
|
102
|
-
float32x4_t const
|
|
103
|
-
float32x4_t const
|
|
104
|
-
float32x4_t const
|
|
100
|
+
float32x4_t const coeff_9_f32x4 = vdupq_n_f32(+2.7557319224e-6f);
|
|
101
|
+
float32x4_t const coeff_7_f32x4 = vdupq_n_f32(-1.9841269841e-4f);
|
|
102
|
+
float32x4_t const coeff_5_f32x4 = vdupq_n_f32(+8.3333293855e-3f);
|
|
103
|
+
float32x4_t const coeff_3_f32x4 = vdupq_n_f32(-1.6666666641e-1f);
|
|
105
104
|
|
|
106
105
|
// Compute round((angle / π) - 0.5)
|
|
107
|
-
float32x4_t
|
|
108
|
-
int32x4_t
|
|
109
|
-
float32x4_t
|
|
106
|
+
float32x4_t quotients_f32x4 = vsubq_f32(vmulq_f32(angles_radians, pi_reciprocal_f32x4), vdupq_n_f32(0.5f));
|
|
107
|
+
int32x4_t multiples_of_pi_i32x4 = vcvtnq_s32_f32(quotients_f32x4);
|
|
108
|
+
float32x4_t rounded_quotients_f32x4 = vcvtq_f32_s32(multiples_of_pi_i32x4);
|
|
110
109
|
|
|
111
|
-
// Cody-Waite range reduction: angle = (angle - pi/2) - rounded * (
|
|
112
|
-
float32x4_t
|
|
113
|
-
float32x4_t
|
|
114
|
-
|
|
115
|
-
float32x4_t const
|
|
116
|
-
float32x4_t const
|
|
110
|
+
// Cody-Waite range reduction: angle = (angle - pi/2) - rounded * (pi_high + pi_low)
|
|
111
|
+
float32x4_t shifted_f32x4 = vsubq_f32(angles_radians, pi_half_f32x4);
|
|
112
|
+
float32x4_t angles_f32x4 = vfmsq_f32(shifted_f32x4, rounded_quotients_f32x4, pi_high_f32x4);
|
|
113
|
+
angles_f32x4 = vfmsq_f32(angles_f32x4, rounded_quotients_f32x4, pi_low_f32x4);
|
|
114
|
+
float32x4_t const angles_squared_f32x4 = vmulq_f32(angles_f32x4, angles_f32x4);
|
|
115
|
+
float32x4_t const angles_cubed_f32x4 = vmulq_f32(angles_f32x4, angles_squared_f32x4);
|
|
117
116
|
|
|
118
117
|
// Degree-9 polynomial via Horner's method
|
|
119
|
-
float32x4_t
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
float32x4_t
|
|
124
|
-
|
|
125
|
-
// If
|
|
126
|
-
int32x4_t
|
|
127
|
-
uint32x4_t
|
|
128
|
-
float32x4_t
|
|
129
|
-
|
|
130
|
-
return
|
|
118
|
+
float32x4_t polynomials_f32x4 = coeff_9_f32x4;
|
|
119
|
+
polynomials_f32x4 = vfmaq_f32(coeff_7_f32x4, polynomials_f32x4, angles_squared_f32x4);
|
|
120
|
+
polynomials_f32x4 = vfmaq_f32(coeff_5_f32x4, polynomials_f32x4, angles_squared_f32x4);
|
|
121
|
+
polynomials_f32x4 = vfmaq_f32(coeff_3_f32x4, polynomials_f32x4, angles_squared_f32x4);
|
|
122
|
+
float32x4_t results_f32x4 = vfmaq_f32(angles_f32x4, angles_cubed_f32x4, polynomials_f32x4);
|
|
123
|
+
|
|
124
|
+
// If multiples_of_pi_i32x4 is even, flip the sign
|
|
125
|
+
int32x4_t parity_i32x4 = vandq_s32(multiples_of_pi_i32x4, vdupq_n_s32(1));
|
|
126
|
+
uint32x4_t even_mask_u32x4 = vceqq_s32(parity_i32x4, vdupq_n_s32(0));
|
|
127
|
+
float32x4_t negated_f32x4 = vnegq_f32(results_f32x4);
|
|
128
|
+
results_f32x4 = vbslq_f32(even_mask_u32x4, negated_f32x4, results_f32x4);
|
|
129
|
+
return results_f32x4;
|
|
131
130
|
}
|
|
132
131
|
|
|
133
132
|
NK_INTERNAL float32x4_t nk_atan_f32x4_neon_(float32x4_t const inputs) {
|
|
134
133
|
// Polynomial coefficients for atan approximation (8 terms)
|
|
135
|
-
float32x4_t const
|
|
136
|
-
float32x4_t const
|
|
137
|
-
float32x4_t const
|
|
138
|
-
float32x4_t const
|
|
139
|
-
float32x4_t const
|
|
140
|
-
float32x4_t const
|
|
141
|
-
float32x4_t const
|
|
142
|
-
float32x4_t const
|
|
143
|
-
float32x4_t const
|
|
144
|
-
|
|
145
|
-
// Detect negative
|
|
146
|
-
float32x4_t const
|
|
147
|
-
uint32x4_t
|
|
148
|
-
float32x4_t
|
|
149
|
-
|
|
150
|
-
// Check if
|
|
151
|
-
uint32x4_t
|
|
134
|
+
float32x4_t const coeff_8_f32x4 = vdupq_n_f32(-0.333331018686294555664062f);
|
|
135
|
+
float32x4_t const coeff_7_f32x4 = vdupq_n_f32(+0.199926957488059997558594f);
|
|
136
|
+
float32x4_t const coeff_6_f32x4 = vdupq_n_f32(-0.142027363181114196777344f);
|
|
137
|
+
float32x4_t const coeff_5_f32x4 = vdupq_n_f32(+0.106347933411598205566406f);
|
|
138
|
+
float32x4_t const coeff_4_f32x4 = vdupq_n_f32(-0.0748900920152664184570312f);
|
|
139
|
+
float32x4_t const coeff_3_f32x4 = vdupq_n_f32(+0.0425049886107444763183594f);
|
|
140
|
+
float32x4_t const coeff_2_f32x4 = vdupq_n_f32(-0.0159569028764963150024414f);
|
|
141
|
+
float32x4_t const coeff_1_f32x4 = vdupq_n_f32(+0.00282363896258175373077393f);
|
|
142
|
+
float32x4_t const half_pi_f32x4 = vdupq_n_f32(1.5707963267948966f);
|
|
143
|
+
|
|
144
|
+
// Detect negative inputs and store their absolute values in values_f32x4
|
|
145
|
+
float32x4_t const zeros_f32x4 = vdupq_n_f32(0);
|
|
146
|
+
uint32x4_t negative_mask_u32x4 = vcltq_f32(inputs, zeros_f32x4);
|
|
147
|
+
float32x4_t values_f32x4 = vabsq_f32(inputs);
|
|
148
|
+
|
|
149
|
+
// Check if values_f32x4 > 1 (need reciprocal)
|
|
150
|
+
uint32x4_t reciprocal_mask_u32x4 = vcgtq_f32(values_f32x4, vdupq_n_f32(1.0f));
|
|
152
151
|
|
|
153
152
|
// Fast reciprocal using vrecpeq + Newton-Raphson (faster than vdivq on many Arm cores)
|
|
154
|
-
float32x4_t
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
153
|
+
float32x4_t recip_f32x4 = vrecpeq_f32(values_f32x4);
|
|
154
|
+
recip_f32x4 = vmulq_f32(recip_f32x4, vrecpsq_f32(values_f32x4, recip_f32x4));
|
|
155
|
+
recip_f32x4 = vmulq_f32(recip_f32x4, vrecpsq_f32(values_f32x4, recip_f32x4));
|
|
156
|
+
values_f32x4 = vbslq_f32(reciprocal_mask_u32x4, recip_f32x4, values_f32x4);
|
|
158
157
|
|
|
159
158
|
// Compute powers
|
|
160
|
-
float32x4_t const
|
|
161
|
-
float32x4_t const
|
|
159
|
+
float32x4_t const values_squared_f32x4 = vmulq_f32(values_f32x4, values_f32x4);
|
|
160
|
+
float32x4_t const values_cubed_f32x4 = vmulq_f32(values_f32x4, values_squared_f32x4);
|
|
162
161
|
|
|
163
162
|
// Polynomial evaluation using Horner's method
|
|
164
|
-
float32x4_t
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
// Compute
|
|
174
|
-
float32x4_t
|
|
175
|
-
|
|
176
|
-
// Adjust for reciprocal:
|
|
177
|
-
float32x4_t
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
// Adjust for negative:
|
|
181
|
-
float32x4_t
|
|
182
|
-
|
|
183
|
-
return
|
|
163
|
+
float32x4_t polynomials_f32x4 = coeff_1_f32x4;
|
|
164
|
+
polynomials_f32x4 = vfmaq_f32(coeff_2_f32x4, polynomials_f32x4, values_squared_f32x4);
|
|
165
|
+
polynomials_f32x4 = vfmaq_f32(coeff_3_f32x4, polynomials_f32x4, values_squared_f32x4);
|
|
166
|
+
polynomials_f32x4 = vfmaq_f32(coeff_4_f32x4, polynomials_f32x4, values_squared_f32x4);
|
|
167
|
+
polynomials_f32x4 = vfmaq_f32(coeff_5_f32x4, polynomials_f32x4, values_squared_f32x4);
|
|
168
|
+
polynomials_f32x4 = vfmaq_f32(coeff_6_f32x4, polynomials_f32x4, values_squared_f32x4);
|
|
169
|
+
polynomials_f32x4 = vfmaq_f32(coeff_7_f32x4, polynomials_f32x4, values_squared_f32x4);
|
|
170
|
+
polynomials_f32x4 = vfmaq_f32(coeff_8_f32x4, polynomials_f32x4, values_squared_f32x4);
|
|
171
|
+
|
|
172
|
+
// Compute result_f32x4: atan(x) ≈ x + x³ * P(x²)
|
|
173
|
+
float32x4_t result_f32x4 = vfmaq_f32(values_f32x4, values_cubed_f32x4, polynomials_f32x4);
|
|
174
|
+
|
|
175
|
+
// Adjust for reciprocal: result_f32x4 = π/2 - result_f32x4
|
|
176
|
+
float32x4_t adjusted_f32x4 = vsubq_f32(half_pi_f32x4, result_f32x4);
|
|
177
|
+
result_f32x4 = vbslq_f32(reciprocal_mask_u32x4, adjusted_f32x4, result_f32x4);
|
|
178
|
+
|
|
179
|
+
// Adjust for negative: result_f32x4 = -result_f32x4
|
|
180
|
+
float32x4_t negated_f32x4 = vnegq_f32(result_f32x4);
|
|
181
|
+
result_f32x4 = vbslq_f32(negative_mask_u32x4, negated_f32x4, result_f32x4);
|
|
182
|
+
return result_f32x4;
|
|
184
183
|
}
|
|
185
184
|
|
|
186
185
|
NK_INTERNAL float32x4_t nk_atan2_f32x4_neon_(float32x4_t const ys_inputs, float32x4_t const xs_inputs) {
|
|
187
186
|
// Polynomial coefficients (same as atan)
|
|
188
|
-
float32x4_t const
|
|
189
|
-
float32x4_t const
|
|
190
|
-
float32x4_t const
|
|
191
|
-
float32x4_t const
|
|
192
|
-
float32x4_t const
|
|
193
|
-
float32x4_t const
|
|
194
|
-
float32x4_t const
|
|
195
|
-
float32x4_t const
|
|
196
|
-
float32x4_t const
|
|
197
|
-
float32x4_t const
|
|
187
|
+
float32x4_t const coeff_8_f32x4 = vdupq_n_f32(-0.333331018686294555664062f);
|
|
188
|
+
float32x4_t const coeff_7_f32x4 = vdupq_n_f32(+0.199926957488059997558594f);
|
|
189
|
+
float32x4_t const coeff_6_f32x4 = vdupq_n_f32(-0.142027363181114196777344f);
|
|
190
|
+
float32x4_t const coeff_5_f32x4 = vdupq_n_f32(+0.106347933411598205566406f);
|
|
191
|
+
float32x4_t const coeff_4_f32x4 = vdupq_n_f32(-0.0748900920152664184570312f);
|
|
192
|
+
float32x4_t const coeff_3_f32x4 = vdupq_n_f32(+0.0425049886107444763183594f);
|
|
193
|
+
float32x4_t const coeff_2_f32x4 = vdupq_n_f32(-0.0159569028764963150024414f);
|
|
194
|
+
float32x4_t const coeff_1_f32x4 = vdupq_n_f32(+0.00282363896258175373077393f);
|
|
195
|
+
float32x4_t const half_pi_f32x4 = vdupq_n_f32(1.5707963267948966f);
|
|
196
|
+
float32x4_t const zeros_f32x4 = vdupq_n_f32(0);
|
|
198
197
|
|
|
199
198
|
// Quadrant adjustments - take absolute values
|
|
200
|
-
uint32x4_t
|
|
201
|
-
float32x4_t
|
|
202
|
-
float32x4_t
|
|
199
|
+
uint32x4_t xs_negative_mask_u32x4 = vcltq_f32(xs_inputs, zeros_f32x4);
|
|
200
|
+
float32x4_t xs_f32x4 = vabsq_f32(xs_inputs);
|
|
201
|
+
float32x4_t ys_f32x4 = vabsq_f32(ys_inputs);
|
|
203
202
|
|
|
204
203
|
// Ensure proper fraction where numerator < denominator
|
|
205
|
-
uint32x4_t
|
|
206
|
-
float32x4_t
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
// Fast reciprocal for division:
|
|
211
|
-
float32x4_t
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
float32x4_t const
|
|
215
|
-
float32x4_t const
|
|
216
|
-
float32x4_t const
|
|
204
|
+
uint32x4_t swap_mask_u32x4 = vcgtq_f32(ys_f32x4, xs_f32x4);
|
|
205
|
+
float32x4_t temps_f32x4 = xs_f32x4;
|
|
206
|
+
xs_f32x4 = vbslq_f32(swap_mask_u32x4, ys_f32x4, xs_f32x4);
|
|
207
|
+
ys_f32x4 = vbslq_f32(swap_mask_u32x4, vnegq_f32(temps_f32x4), ys_f32x4);
|
|
208
|
+
|
|
209
|
+
// Fast reciprocal for division: ratio_f32x4 = ys_f32x4 / xs_f32x4 ≈ ys_f32x4 * recip_f32x4, where recip_f32x4 ≈ 1 / xs_f32x4
|
|
210
|
+
float32x4_t recip_f32x4 = vrecpeq_f32(xs_f32x4);
|
|
211
|
+
recip_f32x4 = vmulq_f32(recip_f32x4, vrecpsq_f32(xs_f32x4, recip_f32x4));
|
|
212
|
+
recip_f32x4 = vmulq_f32(recip_f32x4, vrecpsq_f32(xs_f32x4, recip_f32x4));
|
|
213
|
+
float32x4_t const ratio_f32x4 = vmulq_f32(ys_f32x4, recip_f32x4);
|
|
214
|
+
float32x4_t const ratio_squared_f32x4 = vmulq_f32(ratio_f32x4, ratio_f32x4);
|
|
215
|
+
float32x4_t const ratio_cubed_f32x4 = vmulq_f32(ratio_f32x4, ratio_squared_f32x4);
|
|
217
216
|
|
|
218
217
|
// Polynomial evaluation using Horner's method
|
|
219
|
-
float32x4_t
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
218
|
+
float32x4_t polynomials_f32x4 = coeff_1_f32x4;
|
|
219
|
+
polynomials_f32x4 = vfmaq_f32(coeff_2_f32x4, polynomials_f32x4, ratio_squared_f32x4);
|
|
220
|
+
polynomials_f32x4 = vfmaq_f32(coeff_3_f32x4, polynomials_f32x4, ratio_squared_f32x4);
|
|
221
|
+
polynomials_f32x4 = vfmaq_f32(coeff_4_f32x4, polynomials_f32x4, ratio_squared_f32x4);
|
|
222
|
+
polynomials_f32x4 = vfmaq_f32(coeff_5_f32x4, polynomials_f32x4, ratio_squared_f32x4);
|
|
223
|
+
polynomials_f32x4 = vfmaq_f32(coeff_6_f32x4, polynomials_f32x4, ratio_squared_f32x4);
|
|
224
|
+
polynomials_f32x4 = vfmaq_f32(coeff_7_f32x4, polynomials_f32x4, ratio_squared_f32x4);
|
|
225
|
+
polynomials_f32x4 = vfmaq_f32(coeff_8_f32x4, polynomials_f32x4, ratio_squared_f32x4);
|
|
227
226
|
|
|
228
227
|
// Compute the result
|
|
229
|
-
float32x4_t
|
|
228
|
+
float32x4_t results_f32x4 = vfmaq_f32(ratio_f32x4, ratio_cubed_f32x4, polynomials_f32x4);
|
|
230
229
|
|
|
231
|
-
// Compute
|
|
230
|
+
// Compute quadrant_f32x4 value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
|
|
232
231
|
// -2 for x<0 && !swap, -1 for x<0 && swap
|
|
233
|
-
float32x4_t
|
|
234
|
-
float32x4_t
|
|
235
|
-
|
|
236
|
-
float32x4_t
|
|
237
|
-
|
|
232
|
+
float32x4_t quadrant_f32x4 = vdupq_n_f32(0.0f);
|
|
233
|
+
float32x4_t neg_two_f32x4 = vdupq_n_f32(-2.0f);
|
|
234
|
+
quadrant_f32x4 = vbslq_f32(xs_negative_mask_u32x4, neg_two_f32x4, quadrant_f32x4);
|
|
235
|
+
float32x4_t quadrant_incremented_f32x4 = vaddq_f32(quadrant_f32x4, vdupq_n_f32(1.0f));
|
|
236
|
+
quadrant_f32x4 = vbslq_f32(swap_mask_u32x4, quadrant_incremented_f32x4, quadrant_f32x4);
|
|
238
237
|
|
|
239
|
-
// Adjust for
|
|
240
|
-
|
|
238
|
+
// Adjust for quadrant_f32x4: result += quadrant_f32x4 * π/2
|
|
239
|
+
results_f32x4 = vfmaq_f32(results_f32x4, quadrant_f32x4, half_pi_f32x4);
|
|
241
240
|
|
|
242
241
|
// Transfer sign from x and y by XOR with sign bits
|
|
243
|
-
uint32x4_t
|
|
244
|
-
uint32x4_t
|
|
245
|
-
uint32x4_t
|
|
246
|
-
uint32x4_t
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
return
|
|
242
|
+
uint32x4_t sign_mask_u32x4 = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
|
|
243
|
+
uint32x4_t xs_sign_u32x4 = vandq_u32(vreinterpretq_u32_f32(xs_inputs), sign_mask_u32x4);
|
|
244
|
+
uint32x4_t ys_sign_u32x4 = vandq_u32(vreinterpretq_u32_f32(ys_inputs), sign_mask_u32x4);
|
|
245
|
+
uint32x4_t result_bits_u32x4 = vreinterpretq_u32_f32(results_f32x4);
|
|
246
|
+
result_bits_u32x4 = veorq_u32(result_bits_u32x4, xs_sign_u32x4);
|
|
247
|
+
result_bits_u32x4 = veorq_u32(result_bits_u32x4, ys_sign_u32x4);
|
|
248
|
+
results_f32x4 = vreinterpretq_f32_u32(result_bits_u32x4);
|
|
249
|
+
|
|
250
|
+
return results_f32x4;
|
|
252
251
|
}
|
|
253
252
|
|
|
254
253
|
/**
 *  @brief Approximates the sine of two f64 lanes.
 *
 *  Each angle is reduced by the nearest integer multiple of π, with π split into a
 *  high and a low part so the subtraction is done in two fused steps. When that
 *  multiple is odd the reduced angle is negated. The result is the odd polynomial
 *  r + c3·r³ + c5·r⁵ + … + c19·r¹⁹, with the inner sum grouped Estrin-style.
 *  Lanes whose input compares equal to zero return the input itself, so the sign
 *  of zero is kept.
 *
 *  @param angles_radians Two input angles.
 *  @return Per-lane sine approximation.
 */
NK_INTERNAL float64x2_t nk_sin_f64x2_neon_(float64x2_t const angles_radians) {
    // π as a high part plus a low correction, and 1/π for the quotient
    float64x2_t const pi_hi_f64x2 = vdupq_n_f64(3.141592653589793116);
    float64x2_t const pi_lo_f64x2 = vdupq_n_f64(1.2246467991473532072e-16);
    float64x2_t const inv_pi_f64x2 = vdupq_n_f64(0.31830988618379067154);

    // Polynomial coefficients, named after the power of the reduced angle they scale
    float64x2_t const c3_f64x2 = vdupq_n_f64(-0.166666666666666657414808);
    float64x2_t const c5_f64x2 = vdupq_n_f64(+0.00833333333333332974823815);
    float64x2_t const c7_f64x2 = vdupq_n_f64(-0.000198412698412696162806809);
    float64x2_t const c9_f64x2 = vdupq_n_f64(+2.75573192239198747630416e-06);
    float64x2_t const c11_f64x2 = vdupq_n_f64(-2.50521083763502045810755e-08);
    float64x2_t const c13_f64x2 = vdupq_n_f64(+1.60590430605664501629054e-10);
    float64x2_t const c15_f64x2 = vdupq_n_f64(-7.64712219118158833288484e-13);
    float64x2_t const c17_f64x2 = vdupq_n_f64(+2.81009972710863200091251e-15);
    float64x2_t const c19_f64x2 = vdupq_n_f64(-7.97255955009037868891952e-18);

    // k = round(angle / π), kept both as integer (for parity) and as float (for the reduction)
    int64x2_t const k_i64x2 = vcvtnq_s64_f64(vmulq_f64(angles_radians, inv_pi_f64x2));
    float64x2_t const k_f64x2 = vcvtq_f64_s64(k_i64x2);

    // Two-step reduction: r = angle - k·π_hi - k·π_lo
    float64x2_t reduced_f64x2 = vfmsq_f64(angles_radians, k_f64x2, pi_hi_f64x2);
    reduced_f64x2 = vfmsq_f64(reduced_f64x2, k_f64x2, pi_lo_f64x2);

    // sin(r + k·π) = (-1)^k · sin(r): fold the sign into the odd polynomial's argument
    uint64x2_t const k_is_odd_u64x2 = vceqq_s64(vandq_s64(k_i64x2, vdupq_n_s64(1)), vdupq_n_s64(1));
    reduced_f64x2 = vbslq_f64(k_is_odd_u64x2, vnegq_f64(reduced_f64x2), reduced_f64x2);

    // Powers of the reduced angle: r², r³, r⁴, r⁸
    float64x2_t const r2_f64x2 = vmulq_f64(reduced_f64x2, reduced_f64x2);
    float64x2_t const r3_f64x2 = vmulq_f64(reduced_f64x2, r2_f64x2);
    float64x2_t const r4_f64x2 = vmulq_f64(r2_f64x2, r2_f64x2);
    float64x2_t const r8_f64x2 = vmulq_f64(r4_f64x2, r4_f64x2);

    // Upper half of the inner sum: (c13 + c15·r²) + r⁴·(c17 + c19·r²)
    float64x2_t const pair_17_19_f64x2 = vfmaq_f64(c17_f64x2, r2_f64x2, c19_f64x2);
    float64x2_t const pair_13_15_f64x2 = vfmaq_f64(c13_f64x2, r2_f64x2, c15_f64x2);
    float64x2_t const upper_f64x2 = vfmaq_f64(pair_13_15_f64x2, r4_f64x2, pair_17_19_f64x2);

    // Lower half of the inner sum: (c5 + c7·r²) + r⁴·(c9 + c11·r²)
    float64x2_t const pair_9_11_f64x2 = vfmaq_f64(c9_f64x2, r2_f64x2, c11_f64x2);
    float64x2_t const pair_5_7_f64x2 = vfmaq_f64(c5_f64x2, r2_f64x2, c7_f64x2);
    float64x2_t const lower_f64x2 = vfmaq_f64(pair_5_7_f64x2, r4_f64x2, pair_9_11_f64x2);

    // sin(r) ≈ r + r³·(c3 + r²·(lower + r⁸·upper))
    float64x2_t sines_f64x2 = vfmaq_f64(lower_f64x2, r8_f64x2, upper_f64x2);
    sines_f64x2 = vfmaq_f64(c3_f64x2, sines_f64x2, r2_f64x2);
    sines_f64x2 = vfmaq_f64(reduced_f64x2, sines_f64x2, r3_f64x2);

    // Lanes that compare equal to zero pass the input through, keeping the sign of zero
    uint64x2_t const input_is_zero_u64x2 = vceqq_f64(angles_radians, vdupq_n_f64(0));
    return vbslq_f64(input_is_zero_u64x2, angles_radians, sines_f64x2);
}
|
|
311
310
|
|
|
312
311
|
/**
 *  @brief Approximates the cosine of two f64 lanes.
 *
 *  The cosine is evaluated as a sine of a shifted angle. An odd integer
 *  q = 2·round(angle/π − 0.5) + 1 is computed, q·(π/2) is subtracted from the
 *  angle in two fused steps (π/2 split into high and low parts), and the reduced
 *  angle is negated when bit 1 of q is clear. The reduced angle is then fed to the
 *  odd polynomial r + c3·r³ + … + c19·r¹⁹, grouped Estrin-style.
 *
 *  @param angles_radians Two input angles.
 *  @return Per-lane cosine approximation.
 */
NK_INTERNAL float64x2_t nk_cos_f64x2_neon_(float64x2_t const angles_radians) {
    // π/2 as a high part plus a low correction, and 1/π for the quotient
    float64x2_t const half_pi_hi_f64x2 = vdupq_n_f64(3.141592653589793116 * 0.5);
    float64x2_t const half_pi_lo_f64x2 = vdupq_n_f64(1.2246467991473532072e-16 * 0.5);
    float64x2_t const inv_pi_f64x2 = vdupq_n_f64(0.31830988618379067154);

    // Odd-polynomial coefficients, named after the power of the reduced angle they scale
    float64x2_t const c3_f64x2 = vdupq_n_f64(-0.166666666666666657414808);
    float64x2_t const c5_f64x2 = vdupq_n_f64(+0.00833333333333332974823815);
    float64x2_t const c7_f64x2 = vdupq_n_f64(-0.000198412698412696162806809);
    float64x2_t const c9_f64x2 = vdupq_n_f64(+2.75573192239198747630416e-06);
    float64x2_t const c11_f64x2 = vdupq_n_f64(-2.50521083763502045810755e-08);
    float64x2_t const c13_f64x2 = vdupq_n_f64(+1.60590430605664501629054e-10);
    float64x2_t const c15_f64x2 = vdupq_n_f64(-7.64712219118158833288484e-13);
    float64x2_t const c17_f64x2 = vdupq_n_f64(+2.81009972710863200091251e-15);
    float64x2_t const c19_f64x2 = vdupq_n_f64(-7.97255955009037868891952e-18);

    // q = 2·round(angle/π − 0.5) + 1, as float (for the reduction) and integer (for the bit test)
    float64x2_t const shifted_f64x2 = vsubq_f64(vmulq_f64(angles_radians, inv_pi_f64x2), vdupq_n_f64(0.5));
    float64x2_t const nearest_f64x2 = vcvtq_f64_s64(vcvtnq_s64_f64(shifted_f64x2));
    float64x2_t const q_f64x2 = vfmaq_f64(vdupq_n_f64(1.0), vdupq_n_f64(2.0), nearest_f64x2);
    int64x2_t const q_i64x2 = vcvtnq_s64_f64(q_f64x2);

    // Two-step reduction: r = angle − q·(π/2)_hi − q·(π/2)_lo
    float64x2_t reduced_f64x2 = vfmsq_f64(angles_radians, q_f64x2, half_pi_hi_f64x2);
    reduced_f64x2 = vfmsq_f64(reduced_f64x2, q_f64x2, half_pi_lo_f64x2);

    // When (q & 2) == 0 the sine of the reduced angle has the opposite sign of the cosine: negate r
    uint64x2_t const negate_u64x2 = vceqq_s64(vandq_s64(q_i64x2, vdupq_n_s64(2)), vdupq_n_s64(0));
    reduced_f64x2 = vbslq_f64(negate_u64x2, vnegq_f64(reduced_f64x2), reduced_f64x2);

    // Powers of the reduced angle: r², r³, r⁴, r⁸
    float64x2_t const r2_f64x2 = vmulq_f64(reduced_f64x2, reduced_f64x2);
    float64x2_t const r3_f64x2 = vmulq_f64(reduced_f64x2, r2_f64x2);
    float64x2_t const r4_f64x2 = vmulq_f64(r2_f64x2, r2_f64x2);
    float64x2_t const r8_f64x2 = vmulq_f64(r4_f64x2, r4_f64x2);

    // Upper half of the inner sum: (c13 + c15·r²) + r⁴·(c17 + c19·r²)
    float64x2_t const pair_17_19_f64x2 = vfmaq_f64(c17_f64x2, r2_f64x2, c19_f64x2);
    float64x2_t const pair_13_15_f64x2 = vfmaq_f64(c13_f64x2, r2_f64x2, c15_f64x2);
    float64x2_t const upper_f64x2 = vfmaq_f64(pair_13_15_f64x2, r4_f64x2, pair_17_19_f64x2);

    // Lower half of the inner sum: (c5 + c7·r²) + r⁴·(c9 + c11·r²)
    float64x2_t const pair_9_11_f64x2 = vfmaq_f64(c9_f64x2, r2_f64x2, c11_f64x2);
    float64x2_t const pair_5_7_f64x2 = vfmaq_f64(c5_f64x2, r2_f64x2, c7_f64x2);
    float64x2_t const lower_f64x2 = vfmaq_f64(pair_5_7_f64x2, r4_f64x2, pair_9_11_f64x2);

    // r + r³·(c3 + r²·(lower + r⁸·upper))
    float64x2_t cosines_f64x2 = vfmaq_f64(lower_f64x2, r8_f64x2, upper_f64x2);
    cosines_f64x2 = vfmaq_f64(c3_f64x2, cosines_f64x2, r2_f64x2);
    cosines_f64x2 = vfmaq_f64(reduced_f64x2, cosines_f64x2, r3_f64x2);
    return cosines_f64x2;
}
|
|
366
365
|
|
|
367
366
|
/**
 *  @brief Approximates the arctangent of two f64 lanes.
 *
 *  Works on |x|: lanes with |x| > 1 are replaced by 1/|x| (true division) and
 *  later mapped back through π/2 − atan(1/|x|). The reduced value v is fed to
 *  v + v³·P(v²), where P is a 19-coefficient polynomial evaluated with Horner's
 *  method. The sign bit of the input is finally copied onto the result, so negative
 *  inputs give negative results and atan(-0.0) yields -0.0 (a `< 0` comparison
 *  would lose the sign of a negative zero).
 *
 *  @param inputs Two input values.
 *  @return Per-lane arctangent approximation, in radians.
 */
NK_INTERNAL float64x2_t nk_atan_f64x2_neon_(float64x2_t const inputs) {
    // Polynomial coefficients for atan approximation (19 terms)
    float64x2_t const coeff_19_f64x2 = vdupq_n_f64(-1.88796008463073496563746e-05);
    float64x2_t const coeff_18_f64x2 = vdupq_n_f64(+0.000209850076645816976906797);
    float64x2_t const coeff_17_f64x2 = vdupq_n_f64(-0.00110611831486672482563471);
    float64x2_t const coeff_16_f64x2 = vdupq_n_f64(+0.00370026744188713119232403);
    float64x2_t const coeff_15_f64x2 = vdupq_n_f64(-0.00889896195887655491740809);
    float64x2_t const coeff_14_f64x2 = vdupq_n_f64(+0.016599329773529201970117);
    float64x2_t const coeff_13_f64x2 = vdupq_n_f64(-0.0254517624932312641616861);
    float64x2_t const coeff_12_f64x2 = vdupq_n_f64(+0.0337852580001353069993897);
    float64x2_t const coeff_11_f64x2 = vdupq_n_f64(-0.0407629191276836500001934);
    float64x2_t const coeff_10_f64x2 = vdupq_n_f64(+0.0466667150077840625632675);
    float64x2_t const coeff_9_f64x2 = vdupq_n_f64(-0.0523674852303482457616113);
    float64x2_t const coeff_8_f64x2 = vdupq_n_f64(+0.0587666392926673580854313);
    float64x2_t const coeff_7_f64x2 = vdupq_n_f64(-0.0666573579361080525984562);
    float64x2_t const coeff_6_f64x2 = vdupq_n_f64(+0.0769219538311769618355029);
    float64x2_t const coeff_5_f64x2 = vdupq_n_f64(-0.090908995008245008229153);
    float64x2_t const coeff_4_f64x2 = vdupq_n_f64(+0.111111105648261418443745);
    float64x2_t const coeff_3_f64x2 = vdupq_n_f64(-0.14285714266771329383765);
    float64x2_t const coeff_2_f64x2 = vdupq_n_f64(+0.199999999996591265594148);
    float64x2_t const coeff_1_f64x2 = vdupq_n_f64(-0.333333333333311110369124);
    float64x2_t const half_pi_f64x2 = vdupq_n_f64(1.5707963267948966);

    // Work on the magnitude; the sign is restored at the very end
    float64x2_t values_f64x2 = vabsq_f64(inputs);

    // Check if values_f64x2 > 1 (need reciprocal) - use division for f64 precision
    uint64x2_t reciprocal_mask_u64x2 = vcgtq_f64(values_f64x2, vdupq_n_f64(1.0));
    float64x2_t reciprocal_values_f64x2 = vdivq_f64(vdupq_n_f64(1.0), values_f64x2);
    values_f64x2 = vbslq_f64(reciprocal_mask_u64x2, reciprocal_values_f64x2, values_f64x2);

    // Compute powers
    float64x2_t const values_squared_f64x2 = vmulq_f64(values_f64x2, values_f64x2);
    float64x2_t const values_cubed_f64x2 = vmulq_f64(values_f64x2, values_squared_f64x2);

    // Polynomial evaluation using Horner's method, highest coefficient first
    float64x2_t polynomials_f64x2 = coeff_19_f64x2;
    polynomials_f64x2 = vfmaq_f64(coeff_18_f64x2, polynomials_f64x2, values_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_17_f64x2, polynomials_f64x2, values_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_16_f64x2, polynomials_f64x2, values_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_15_f64x2, polynomials_f64x2, values_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_14_f64x2, polynomials_f64x2, values_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_13_f64x2, polynomials_f64x2, values_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_12_f64x2, polynomials_f64x2, values_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_11_f64x2, polynomials_f64x2, values_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_10_f64x2, polynomials_f64x2, values_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_9_f64x2, polynomials_f64x2, values_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_8_f64x2, polynomials_f64x2, values_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_7_f64x2, polynomials_f64x2, values_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_6_f64x2, polynomials_f64x2, values_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_5_f64x2, polynomials_f64x2, values_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_4_f64x2, polynomials_f64x2, values_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_3_f64x2, polynomials_f64x2, values_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_2_f64x2, polynomials_f64x2, values_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_1_f64x2, polynomials_f64x2, values_squared_f64x2);

    // atan(v) ≈ v + v³·P(v²)
    float64x2_t result_f64x2 = vfmaq_f64(values_f64x2, values_cubed_f64x2, polynomials_f64x2);

    // Adjust for reciprocal: result_f64x2 = π/2 - result_f64x2
    float64x2_t adjusted_f64x2 = vsubq_f64(half_pi_f64x2, result_f64x2);
    result_f64x2 = vbslq_f64(reciprocal_mask_u64x2, adjusted_f64x2, result_f64x2);

    // Copy the input's sign bit onto the result (bitwise select on the sign bit only).
    // Unlike a `inputs < 0` mask this also keeps the sign of a negative zero.
    uint64x2_t const sign_mask_u64x2 = vreinterpretq_u64_f64(vdupq_n_f64(-0.0));
    result_f64x2 = vbslq_f64(sign_mask_u64x2, inputs, result_f64x2);
    return result_f64x2;
}
|
|
437
436
|
|
|
438
437
|
/**
 *  @brief Approximates atan2(y, x) for two f64 lanes.
 *
 *  Both inputs are reduced to magnitudes. When |y| > |x| the pair is swapped to
 *  (-|x|, |y|) so the ratio passed to the polynomial stays within [-1, 1]. The
 *  ratio t is evaluated as t + t³·P(t²) with the same 19 coefficients as the
 *  single-argument atan, then a multiple of π/2 chosen by the quadrant is added.
 *  Finally the result's sign is flipped for lanes with x < 0 and again for lanes
 *  whose y has its sign bit set.
 *
 *  The x flip is derived from the same `x < 0` comparison that selects the
 *  quadrant. Using the raw sign bit of x instead would disagree with the quadrant
 *  for x = -0.0 and return -π/2 for atan2(positive, -0.0).
 *
 *  NOTE: lanes where both magnitudes are zero, or both are infinite, go through a
 *  0/0 or ∞/∞ division and therefore produce NaN; no special-casing is done here.
 *
 *  @param ys_inputs Two y (numerator) values.
 *  @param xs_inputs Two x (denominator) values.
 *  @return Per-lane angle approximation, in radians.
 */
NK_INTERNAL float64x2_t nk_atan2_f64x2_neon_(float64x2_t const ys_inputs, float64x2_t const xs_inputs) {
    // Polynomial coefficients (same as atan)
    float64x2_t const coeff_19_f64x2 = vdupq_n_f64(-1.88796008463073496563746e-05);
    float64x2_t const coeff_18_f64x2 = vdupq_n_f64(+0.000209850076645816976906797);
    float64x2_t const coeff_17_f64x2 = vdupq_n_f64(-0.00110611831486672482563471);
    float64x2_t const coeff_16_f64x2 = vdupq_n_f64(+0.00370026744188713119232403);
    float64x2_t const coeff_15_f64x2 = vdupq_n_f64(-0.00889896195887655491740809);
    float64x2_t const coeff_14_f64x2 = vdupq_n_f64(+0.016599329773529201970117);
    float64x2_t const coeff_13_f64x2 = vdupq_n_f64(-0.0254517624932312641616861);
    float64x2_t const coeff_12_f64x2 = vdupq_n_f64(+0.0337852580001353069993897);
    float64x2_t const coeff_11_f64x2 = vdupq_n_f64(-0.0407629191276836500001934);
    float64x2_t const coeff_10_f64x2 = vdupq_n_f64(+0.0466667150077840625632675);
    float64x2_t const coeff_9_f64x2 = vdupq_n_f64(-0.0523674852303482457616113);
    float64x2_t const coeff_8_f64x2 = vdupq_n_f64(+0.0587666392926673580854313);
    float64x2_t const coeff_7_f64x2 = vdupq_n_f64(-0.0666573579361080525984562);
    float64x2_t const coeff_6_f64x2 = vdupq_n_f64(+0.0769219538311769618355029);
    float64x2_t const coeff_5_f64x2 = vdupq_n_f64(-0.090908995008245008229153);
    float64x2_t const coeff_4_f64x2 = vdupq_n_f64(+0.111111105648261418443745);
    float64x2_t const coeff_3_f64x2 = vdupq_n_f64(-0.14285714266771329383765);
    float64x2_t const coeff_2_f64x2 = vdupq_n_f64(+0.199999999996591265594148);
    float64x2_t const coeff_1_f64x2 = vdupq_n_f64(-0.333333333333311110369124);
    float64x2_t const half_pi_f64x2 = vdupq_n_f64(1.5707963267948966);
    float64x2_t const zeros_f64x2 = vdupq_n_f64(0);

    // Quadrant adjustments - take absolute values
    uint64x2_t xs_negative_mask_u64x2 = vcltq_f64(xs_inputs, zeros_f64x2);
    float64x2_t xs_f64x2 = vabsq_f64(xs_inputs);
    float64x2_t ys_f64x2 = vabsq_f64(ys_inputs);

    // Ensure proper fraction where numerator < denominator:
    // swapped lanes become (y, x) := (-|x|, |y|)
    uint64x2_t swap_mask_u64x2 = vcgtq_f64(ys_f64x2, xs_f64x2);
    float64x2_t temps_f64x2 = xs_f64x2;
    xs_f64x2 = vbslq_f64(swap_mask_u64x2, ys_f64x2, xs_f64x2);
    ys_f64x2 = vbslq_f64(swap_mask_u64x2, vnegq_f64(temps_f64x2), ys_f64x2);

    // Division for f64 precision
    float64x2_t const ratio_f64x2 = vdivq_f64(ys_f64x2, xs_f64x2);
    float64x2_t const ratio_squared_f64x2 = vmulq_f64(ratio_f64x2, ratio_f64x2);
    float64x2_t const ratio_cubed_f64x2 = vmulq_f64(ratio_f64x2, ratio_squared_f64x2);

    // Polynomial evaluation using Horner's method, highest coefficient first
    float64x2_t polynomials_f64x2 = coeff_19_f64x2;
    polynomials_f64x2 = vfmaq_f64(coeff_18_f64x2, polynomials_f64x2, ratio_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_17_f64x2, polynomials_f64x2, ratio_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_16_f64x2, polynomials_f64x2, ratio_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_15_f64x2, polynomials_f64x2, ratio_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_14_f64x2, polynomials_f64x2, ratio_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_13_f64x2, polynomials_f64x2, ratio_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_12_f64x2, polynomials_f64x2, ratio_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_11_f64x2, polynomials_f64x2, ratio_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_10_f64x2, polynomials_f64x2, ratio_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_9_f64x2, polynomials_f64x2, ratio_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_8_f64x2, polynomials_f64x2, ratio_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_7_f64x2, polynomials_f64x2, ratio_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_6_f64x2, polynomials_f64x2, ratio_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_5_f64x2, polynomials_f64x2, ratio_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_4_f64x2, polynomials_f64x2, ratio_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_3_f64x2, polynomials_f64x2, ratio_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_2_f64x2, polynomials_f64x2, ratio_squared_f64x2);
    polynomials_f64x2 = vfmaq_f64(coeff_1_f64x2, polynomials_f64x2, ratio_squared_f64x2);

    // Compute the result: t + t³·P(t²)
    float64x2_t results_f64x2 = vfmaq_f64(ratio_f64x2, ratio_cubed_f64x2, polynomials_f64x2);

    // Compute quadrant_f64x2 value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
    // -2 for x<0 && !swap, -1 for x<0 && swap
    float64x2_t quadrant_f64x2 = vdupq_n_f64(0.0);
    float64x2_t neg_two_f64x2 = vdupq_n_f64(-2.0);
    quadrant_f64x2 = vbslq_f64(xs_negative_mask_u64x2, neg_two_f64x2, quadrant_f64x2);
    float64x2_t quadrant_incremented_f64x2 = vaddq_f64(quadrant_f64x2, vdupq_n_f64(1.0));
    quadrant_f64x2 = vbslq_f64(swap_mask_u64x2, quadrant_incremented_f64x2, quadrant_f64x2);

    // Adjust for quadrant_f64x2: result += quadrant_f64x2 * π/2
    results_f64x2 = vfmaq_f64(results_f64x2, quadrant_f64x2, half_pi_f64x2);

    // Transfer sign from x and y by XOR with sign bits.
    // The x flip is gated by the `x < 0` mask so that it always agrees with the quadrant
    // chosen above; x = -0.0 has its sign bit set but is not `< 0`, and must not flip.
    uint64x2_t sign_mask_u64x2 = vreinterpretq_u64_f64(vdupq_n_f64(-0.0));
    uint64x2_t xs_sign_u64x2 = vandq_u64(xs_negative_mask_u64x2, sign_mask_u64x2);
    uint64x2_t ys_sign_u64x2 = vandq_u64(vreinterpretq_u64_f64(ys_inputs), sign_mask_u64x2);
    uint64x2_t result_bits_u64x2 = vreinterpretq_u64_f64(results_f64x2);
    result_bits_u64x2 = veorq_u64(result_bits_u64x2, xs_sign_u64x2);
    result_bits_u64x2 = veorq_u64(result_bits_u64x2, ys_sign_u64x2);
    results_f64x2 = vreinterpretq_f64_u64(result_bits_u64x2);

    return results_f64x2;
}
|
|
524
523
|
|
|
525
524
|
NK_PUBLIC void nk_each_sin_f32_neon(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
|
|
526
525
|
nk_size_t i = 0;
|
|
527
526
|
for (; i + 4 <= n; i += 4) {
|
|
528
|
-
float32x4_t
|
|
529
|
-
float32x4_t
|
|
530
|
-
vst1q_f32(outs + i,
|
|
527
|
+
float32x4_t angles_f32x4 = vld1q_f32(ins + i);
|
|
528
|
+
float32x4_t results_f32x4 = nk_sin_f32x4_neon_(angles_f32x4);
|
|
529
|
+
vst1q_f32(outs + i, results_f32x4);
|
|
531
530
|
}
|
|
532
531
|
if (i < n) {
|
|
533
532
|
nk_size_t remaining = n - i;
|
|
@@ -542,9 +541,9 @@ NK_PUBLIC void nk_each_sin_f32_neon(nk_f32_t const *ins, nk_size_t n, nk_f32_t *
|
|
|
542
541
|
NK_PUBLIC void nk_each_cos_f32_neon(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
|
|
543
542
|
nk_size_t i = 0;
|
|
544
543
|
for (; i + 4 <= n; i += 4) {
|
|
545
|
-
float32x4_t
|
|
546
|
-
float32x4_t
|
|
547
|
-
vst1q_f32(outs + i,
|
|
544
|
+
float32x4_t angles_f32x4 = vld1q_f32(ins + i);
|
|
545
|
+
float32x4_t results_f32x4 = nk_cos_f32x4_neon_(angles_f32x4);
|
|
546
|
+
vst1q_f32(outs + i, results_f32x4);
|
|
548
547
|
}
|
|
549
548
|
if (i < n) {
|
|
550
549
|
nk_size_t remaining = n - i;
|
|
@@ -559,9 +558,9 @@ NK_PUBLIC void nk_each_cos_f32_neon(nk_f32_t const *ins, nk_size_t n, nk_f32_t *
|
|
|
559
558
|
NK_PUBLIC void nk_each_atan_f32_neon(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
|
|
560
559
|
nk_size_t i = 0;
|
|
561
560
|
for (; i + 4 <= n; i += 4) {
|
|
562
|
-
float32x4_t
|
|
563
|
-
float32x4_t
|
|
564
|
-
vst1q_f32(outs + i,
|
|
561
|
+
float32x4_t values_f32x4 = vld1q_f32(ins + i);
|
|
562
|
+
float32x4_t results_f32x4 = nk_atan_f32x4_neon_(values_f32x4);
|
|
563
|
+
vst1q_f32(outs + i, results_f32x4);
|
|
565
564
|
}
|
|
566
565
|
if (i < n) {
|
|
567
566
|
nk_size_t remaining = n - i;
|
|
@@ -576,9 +575,9 @@ NK_PUBLIC void nk_each_atan_f32_neon(nk_f32_t const *ins, nk_size_t n, nk_f32_t
|
|
|
576
575
|
NK_PUBLIC void nk_each_sin_f64_neon(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
|
|
577
576
|
nk_size_t i = 0;
|
|
578
577
|
for (; i + 2 <= n; i += 2) {
|
|
579
|
-
float64x2_t
|
|
580
|
-
float64x2_t
|
|
581
|
-
vst1q_f64(outs + i,
|
|
578
|
+
float64x2_t angles_f64x2 = vld1q_f64(ins + i);
|
|
579
|
+
float64x2_t results_f64x2 = nk_sin_f64x2_neon_(angles_f64x2);
|
|
580
|
+
vst1q_f64(outs + i, results_f64x2);
|
|
582
581
|
}
|
|
583
582
|
if (i < n) {
|
|
584
583
|
nk_size_t remaining = n - i;
|
|
@@ -593,9 +592,9 @@ NK_PUBLIC void nk_each_sin_f64_neon(nk_f64_t const *ins, nk_size_t n, nk_f64_t *
|
|
|
593
592
|
NK_PUBLIC void nk_each_cos_f64_neon(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
|
|
594
593
|
nk_size_t i = 0;
|
|
595
594
|
for (; i + 2 <= n; i += 2) {
|
|
596
|
-
float64x2_t
|
|
597
|
-
float64x2_t
|
|
598
|
-
vst1q_f64(outs + i,
|
|
595
|
+
float64x2_t angles_f64x2 = vld1q_f64(ins + i);
|
|
596
|
+
float64x2_t results_f64x2 = nk_cos_f64x2_neon_(angles_f64x2);
|
|
597
|
+
vst1q_f64(outs + i, results_f64x2);
|
|
599
598
|
}
|
|
600
599
|
if (i < n) {
|
|
601
600
|
nk_size_t remaining = n - i;
|
|
@@ -610,9 +609,9 @@ NK_PUBLIC void nk_each_cos_f64_neon(nk_f64_t const *ins, nk_size_t n, nk_f64_t *
|
|
|
610
609
|
NK_PUBLIC void nk_each_atan_f64_neon(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
|
|
611
610
|
nk_size_t i = 0;
|
|
612
611
|
for (; i + 2 <= n; i += 2) {
|
|
613
|
-
float64x2_t
|
|
614
|
-
float64x2_t
|
|
615
|
-
vst1q_f64(outs + i,
|
|
612
|
+
float64x2_t values_f64x2 = vld1q_f64(ins + i);
|
|
613
|
+
float64x2_t results_f64x2 = nk_atan_f64x2_neon_(values_f64x2);
|
|
614
|
+
vst1q_f64(outs + i, results_f64x2);
|
|
616
615
|
}
|
|
617
616
|
if (i < n) {
|
|
618
617
|
nk_size_t remaining = n - i;
|