numkong 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +495 -0
- package/binding.gyp +540 -0
- package/c/dispatch.h +512 -0
- package/c/dispatch_bf16.c +389 -0
- package/c/dispatch_bf16c.c +52 -0
- package/c/dispatch_e2m3.c +263 -0
- package/c/dispatch_e3m2.c +243 -0
- package/c/dispatch_e4m3.c +276 -0
- package/c/dispatch_e5m2.c +272 -0
- package/c/dispatch_f16.c +376 -0
- package/c/dispatch_f16c.c +58 -0
- package/c/dispatch_f32.c +378 -0
- package/c/dispatch_f32c.c +99 -0
- package/c/dispatch_f64.c +296 -0
- package/c/dispatch_f64c.c +98 -0
- package/c/dispatch_i16.c +96 -0
- package/c/dispatch_i32.c +89 -0
- package/c/dispatch_i4.c +150 -0
- package/c/dispatch_i64.c +86 -0
- package/c/dispatch_i8.c +289 -0
- package/c/dispatch_other.c +330 -0
- package/c/dispatch_u1.c +148 -0
- package/c/dispatch_u16.c +124 -0
- package/c/dispatch_u32.c +118 -0
- package/c/dispatch_u4.c +150 -0
- package/c/dispatch_u64.c +102 -0
- package/c/dispatch_u8.c +303 -0
- package/c/numkong.c +950 -0
- package/include/README.md +573 -0
- package/include/module.modulemap +129 -0
- package/include/numkong/attention/sapphireamx.h +1361 -0
- package/include/numkong/attention/sme.h +2066 -0
- package/include/numkong/attention.h +49 -0
- package/include/numkong/capabilities.h +748 -0
- package/include/numkong/cast/README.md +262 -0
- package/include/numkong/cast/haswell.h +975 -0
- package/include/numkong/cast/icelake.h +470 -0
- package/include/numkong/cast/neon.h +1192 -0
- package/include/numkong/cast/rvv.h +1021 -0
- package/include/numkong/cast/sapphire.h +262 -0
- package/include/numkong/cast/serial.h +2262 -0
- package/include/numkong/cast/skylake.h +856 -0
- package/include/numkong/cast/v128relaxed.h +180 -0
- package/include/numkong/cast.h +230 -0
- package/include/numkong/curved/README.md +223 -0
- package/include/numkong/curved/genoa.h +182 -0
- package/include/numkong/curved/haswell.h +276 -0
- package/include/numkong/curved/neon.h +205 -0
- package/include/numkong/curved/neonbfdot.h +212 -0
- package/include/numkong/curved/neonhalf.h +212 -0
- package/include/numkong/curved/rvv.h +305 -0
- package/include/numkong/curved/serial.h +207 -0
- package/include/numkong/curved/skylake.h +457 -0
- package/include/numkong/curved/smef64.h +506 -0
- package/include/numkong/curved.h +517 -0
- package/include/numkong/curved.hpp +144 -0
- package/include/numkong/dot/README.md +425 -0
- package/include/numkong/dot/alder.h +563 -0
- package/include/numkong/dot/genoa.h +315 -0
- package/include/numkong/dot/haswell.h +1688 -0
- package/include/numkong/dot/icelake.h +883 -0
- package/include/numkong/dot/neon.h +818 -0
- package/include/numkong/dot/neonbfdot.h +244 -0
- package/include/numkong/dot/neonfhm.h +360 -0
- package/include/numkong/dot/neonhalf.h +198 -0
- package/include/numkong/dot/neonsdot.h +508 -0
- package/include/numkong/dot/rvv.h +714 -0
- package/include/numkong/dot/rvvbb.h +72 -0
- package/include/numkong/dot/rvvbf16.h +123 -0
- package/include/numkong/dot/rvvhalf.h +129 -0
- package/include/numkong/dot/sapphire.h +141 -0
- package/include/numkong/dot/serial.h +838 -0
- package/include/numkong/dot/sierra.h +405 -0
- package/include/numkong/dot/skylake.h +1084 -0
- package/include/numkong/dot/sve.h +379 -0
- package/include/numkong/dot/svebfdot.h +74 -0
- package/include/numkong/dot/svehalf.h +123 -0
- package/include/numkong/dot/v128relaxed.h +1258 -0
- package/include/numkong/dot.h +1070 -0
- package/include/numkong/dot.hpp +94 -0
- package/include/numkong/dots/README.md +496 -0
- package/include/numkong/dots/alder.h +114 -0
- package/include/numkong/dots/genoa.h +94 -0
- package/include/numkong/dots/haswell.h +295 -0
- package/include/numkong/dots/icelake.h +171 -0
- package/include/numkong/dots/neon.h +120 -0
- package/include/numkong/dots/neonbfdot.h +58 -0
- package/include/numkong/dots/neonfhm.h +94 -0
- package/include/numkong/dots/neonhalf.h +57 -0
- package/include/numkong/dots/neonsdot.h +108 -0
- package/include/numkong/dots/rvv.h +2486 -0
- package/include/numkong/dots/sapphireamx.h +3973 -0
- package/include/numkong/dots/serial.h +2844 -0
- package/include/numkong/dots/sierra.h +97 -0
- package/include/numkong/dots/skylake.h +196 -0
- package/include/numkong/dots/sme.h +5372 -0
- package/include/numkong/dots/smebi32.h +461 -0
- package/include/numkong/dots/smef64.h +1318 -0
- package/include/numkong/dots/smehalf.h +47 -0
- package/include/numkong/dots/v128relaxed.h +294 -0
- package/include/numkong/dots.h +2804 -0
- package/include/numkong/dots.hpp +639 -0
- package/include/numkong/each/README.md +469 -0
- package/include/numkong/each/haswell.h +1658 -0
- package/include/numkong/each/icelake.h +272 -0
- package/include/numkong/each/neon.h +1104 -0
- package/include/numkong/each/neonbfdot.h +212 -0
- package/include/numkong/each/neonhalf.h +410 -0
- package/include/numkong/each/rvv.h +1121 -0
- package/include/numkong/each/sapphire.h +477 -0
- package/include/numkong/each/serial.h +260 -0
- package/include/numkong/each/skylake.h +1562 -0
- package/include/numkong/each.h +2146 -0
- package/include/numkong/each.hpp +434 -0
- package/include/numkong/geospatial/README.md +147 -0
- package/include/numkong/geospatial/haswell.h +593 -0
- package/include/numkong/geospatial/neon.h +571 -0
- package/include/numkong/geospatial/rvv.h +701 -0
- package/include/numkong/geospatial/serial.h +309 -0
- package/include/numkong/geospatial/skylake.h +577 -0
- package/include/numkong/geospatial/v128relaxed.h +613 -0
- package/include/numkong/geospatial.h +453 -0
- package/include/numkong/geospatial.hpp +235 -0
- package/include/numkong/matrix.hpp +336 -0
- package/include/numkong/maxsim/README.md +187 -0
- package/include/numkong/maxsim/alder.h +511 -0
- package/include/numkong/maxsim/genoa.h +115 -0
- package/include/numkong/maxsim/haswell.h +553 -0
- package/include/numkong/maxsim/icelake.h +480 -0
- package/include/numkong/maxsim/neonsdot.h +394 -0
- package/include/numkong/maxsim/sapphireamx.h +877 -0
- package/include/numkong/maxsim/serial.h +490 -0
- package/include/numkong/maxsim/sme.h +929 -0
- package/include/numkong/maxsim/v128relaxed.h +280 -0
- package/include/numkong/maxsim.h +571 -0
- package/include/numkong/maxsim.hpp +133 -0
- package/include/numkong/mesh/README.md +227 -0
- package/include/numkong/mesh/haswell.h +2235 -0
- package/include/numkong/mesh/neon.h +1329 -0
- package/include/numkong/mesh/neonbfdot.h +842 -0
- package/include/numkong/mesh/neonhalf.h +616 -0
- package/include/numkong/mesh/rvv.h +916 -0
- package/include/numkong/mesh/serial.h +742 -0
- package/include/numkong/mesh/skylake.h +1135 -0
- package/include/numkong/mesh/v128relaxed.h +1052 -0
- package/include/numkong/mesh.h +652 -0
- package/include/numkong/mesh.hpp +762 -0
- package/include/numkong/numkong.h +78 -0
- package/include/numkong/numkong.hpp +57 -0
- package/include/numkong/probability/README.md +173 -0
- package/include/numkong/probability/haswell.h +267 -0
- package/include/numkong/probability/neon.h +225 -0
- package/include/numkong/probability/rvv.h +409 -0
- package/include/numkong/probability/serial.h +169 -0
- package/include/numkong/probability/skylake.h +324 -0
- package/include/numkong/probability.h +383 -0
- package/include/numkong/probability.hpp +120 -0
- package/include/numkong/random.h +50 -0
- package/include/numkong/random.hpp +285 -0
- package/include/numkong/reduce/README.md +547 -0
- package/include/numkong/reduce/alder.h +632 -0
- package/include/numkong/reduce/genoa.h +201 -0
- package/include/numkong/reduce/haswell.h +3783 -0
- package/include/numkong/reduce/icelake.h +549 -0
- package/include/numkong/reduce/neon.h +3841 -0
- package/include/numkong/reduce/neonbfdot.h +353 -0
- package/include/numkong/reduce/neonfhm.h +665 -0
- package/include/numkong/reduce/neonhalf.h +157 -0
- package/include/numkong/reduce/neonsdot.h +357 -0
- package/include/numkong/reduce/rvv.h +3407 -0
- package/include/numkong/reduce/serial.h +757 -0
- package/include/numkong/reduce/sierra.h +338 -0
- package/include/numkong/reduce/skylake.h +3792 -0
- package/include/numkong/reduce/v128relaxed.h +2302 -0
- package/include/numkong/reduce.h +1597 -0
- package/include/numkong/reduce.hpp +633 -0
- package/include/numkong/scalar/README.md +89 -0
- package/include/numkong/scalar/haswell.h +113 -0
- package/include/numkong/scalar/neon.h +122 -0
- package/include/numkong/scalar/neonhalf.h +70 -0
- package/include/numkong/scalar/rvv.h +211 -0
- package/include/numkong/scalar/sapphire.h +63 -0
- package/include/numkong/scalar/serial.h +332 -0
- package/include/numkong/scalar/v128relaxed.h +56 -0
- package/include/numkong/scalar.h +683 -0
- package/include/numkong/set/README.md +179 -0
- package/include/numkong/set/haswell.h +334 -0
- package/include/numkong/set/icelake.h +485 -0
- package/include/numkong/set/neon.h +364 -0
- package/include/numkong/set/rvv.h +226 -0
- package/include/numkong/set/rvvbb.h +117 -0
- package/include/numkong/set/serial.h +174 -0
- package/include/numkong/set/sve.h +185 -0
- package/include/numkong/set/v128relaxed.h +240 -0
- package/include/numkong/set.h +457 -0
- package/include/numkong/set.hpp +114 -0
- package/include/numkong/sets/README.md +149 -0
- package/include/numkong/sets/haswell.h +63 -0
- package/include/numkong/sets/icelake.h +66 -0
- package/include/numkong/sets/neon.h +61 -0
- package/include/numkong/sets/serial.h +43 -0
- package/include/numkong/sets/smebi32.h +1099 -0
- package/include/numkong/sets/v128relaxed.h +58 -0
- package/include/numkong/sets.h +339 -0
- package/include/numkong/sparse/README.md +156 -0
- package/include/numkong/sparse/icelake.h +463 -0
- package/include/numkong/sparse/neon.h +288 -0
- package/include/numkong/sparse/serial.h +117 -0
- package/include/numkong/sparse/sve2.h +507 -0
- package/include/numkong/sparse/turin.h +322 -0
- package/include/numkong/sparse.h +363 -0
- package/include/numkong/sparse.hpp +113 -0
- package/include/numkong/spatial/README.md +435 -0
- package/include/numkong/spatial/alder.h +607 -0
- package/include/numkong/spatial/genoa.h +290 -0
- package/include/numkong/spatial/haswell.h +960 -0
- package/include/numkong/spatial/icelake.h +586 -0
- package/include/numkong/spatial/neon.h +773 -0
- package/include/numkong/spatial/neonbfdot.h +165 -0
- package/include/numkong/spatial/neonhalf.h +118 -0
- package/include/numkong/spatial/neonsdot.h +261 -0
- package/include/numkong/spatial/rvv.h +984 -0
- package/include/numkong/spatial/rvvbf16.h +123 -0
- package/include/numkong/spatial/rvvhalf.h +117 -0
- package/include/numkong/spatial/sapphire.h +343 -0
- package/include/numkong/spatial/serial.h +346 -0
- package/include/numkong/spatial/sierra.h +323 -0
- package/include/numkong/spatial/skylake.h +606 -0
- package/include/numkong/spatial/sve.h +224 -0
- package/include/numkong/spatial/svebfdot.h +122 -0
- package/include/numkong/spatial/svehalf.h +109 -0
- package/include/numkong/spatial/v128relaxed.h +717 -0
- package/include/numkong/spatial.h +1425 -0
- package/include/numkong/spatial.hpp +183 -0
- package/include/numkong/spatials/README.md +580 -0
- package/include/numkong/spatials/alder.h +94 -0
- package/include/numkong/spatials/genoa.h +94 -0
- package/include/numkong/spatials/haswell.h +219 -0
- package/include/numkong/spatials/icelake.h +113 -0
- package/include/numkong/spatials/neon.h +109 -0
- package/include/numkong/spatials/neonbfdot.h +60 -0
- package/include/numkong/spatials/neonfhm.h +92 -0
- package/include/numkong/spatials/neonhalf.h +58 -0
- package/include/numkong/spatials/neonsdot.h +109 -0
- package/include/numkong/spatials/rvv.h +1960 -0
- package/include/numkong/spatials/sapphireamx.h +1149 -0
- package/include/numkong/spatials/serial.h +226 -0
- package/include/numkong/spatials/sierra.h +96 -0
- package/include/numkong/spatials/skylake.h +184 -0
- package/include/numkong/spatials/sme.h +1901 -0
- package/include/numkong/spatials/smef64.h +465 -0
- package/include/numkong/spatials/v128relaxed.h +240 -0
- package/include/numkong/spatials.h +3021 -0
- package/include/numkong/spatials.hpp +508 -0
- package/include/numkong/tensor.hpp +1592 -0
- package/include/numkong/trigonometry/README.md +184 -0
- package/include/numkong/trigonometry/haswell.h +652 -0
- package/include/numkong/trigonometry/neon.h +639 -0
- package/include/numkong/trigonometry/rvv.h +699 -0
- package/include/numkong/trigonometry/serial.h +703 -0
- package/include/numkong/trigonometry/skylake.h +721 -0
- package/include/numkong/trigonometry/v128relaxed.h +666 -0
- package/include/numkong/trigonometry.h +467 -0
- package/include/numkong/trigonometry.hpp +166 -0
- package/include/numkong/types.h +1384 -0
- package/include/numkong/types.hpp +5603 -0
- package/include/numkong/vector.hpp +698 -0
- package/javascript/README.md +246 -0
- package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
- package/javascript/dist/cjs/numkong-wasm.js +617 -0
- package/javascript/dist/cjs/numkong.d.ts +343 -0
- package/javascript/dist/cjs/numkong.js +523 -0
- package/javascript/dist/cjs/package.json +3 -0
- package/javascript/dist/cjs/types.d.ts +284 -0
- package/javascript/dist/cjs/types.js +653 -0
- package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
- package/javascript/dist/esm/numkong-wasm.js +595 -0
- package/javascript/dist/esm/numkong.d.ts +343 -0
- package/javascript/dist/esm/numkong.js +452 -0
- package/javascript/dist/esm/package.json +3 -0
- package/javascript/dist/esm/types.d.ts +284 -0
- package/javascript/dist/esm/types.js +630 -0
- package/javascript/dist-package-cjs.json +3 -0
- package/javascript/dist-package-esm.json +3 -0
- package/javascript/node-gyp-build.d.ts +1 -0
- package/javascript/numkong-wasm.ts +756 -0
- package/javascript/numkong.c +689 -0
- package/javascript/numkong.ts +575 -0
- package/javascript/tsconfig-base.json +39 -0
- package/javascript/tsconfig-cjs.json +8 -0
- package/javascript/tsconfig-esm.json +8 -0
- package/javascript/types.ts +674 -0
- package/package.json +87 -0
|
@@ -0,0 +1,703 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief SWAR-accelerated Trigonometric Functions for SIMD-free CPUs.
|
|
3
|
+
* @file include/numkong/trigonometry/serial.h
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date December 27, 2025
|
|
6
|
+
*
|
|
7
|
+
* @sa include/numkong/trigonometry.h
|
|
8
|
+
* @see https://sleef.org
|
|
9
|
+
*/
|
|
10
|
+
#ifndef NK_TRIGONOMETRY_SERIAL_H
|
|
11
|
+
#define NK_TRIGONOMETRY_SERIAL_H
|
|
12
|
+
|
|
13
|
+
#include "numkong/types.h"
|
|
14
|
+
#include "numkong/cast/serial.h" // `nk_f16_to_f32_serial`
|
|
15
|
+
#include "numkong/scalar/serial.h" // `nk_f32_fma_serial`, `nk_f64_fma_serial`
|
|
16
|
+
|
|
17
|
+
#if defined(__cplusplus)
|
|
18
|
+
extern "C" {
|
|
19
|
+
#endif
|
|
20
|
+
|
|
21
|
+
/**
|
|
22
|
+
* @brief Computes an approximate sine of the given angle in radians with @b 3-ULP error bound for [-2π, 2π].
|
|
23
|
+
* @see Based on @b `xfastsinf_u3500` in SLEEF library.
|
|
24
|
+
* @param[in] angle The input angle in radians.
|
|
25
|
+
* @return The approximate sine of the input angle in [-1, 1] range.
|
|
26
|
+
*/
|
|
27
|
+
NK_PUBLIC nk_f32_t nk_f32_sin(nk_f32_t const angle_radians) {
|
|
28
|
+
|
|
29
|
+
// Cody-Waite constants for argument reduction (pi split into hi + lo)
|
|
30
|
+
nk_f32_t const pi_hi = 3.1415927f;
|
|
31
|
+
nk_f32_t const pi_lo = -8.742278e-8f;
|
|
32
|
+
nk_f32_t const pi_reciprocal = 0.31830988618379067154f; /// 1/π
|
|
33
|
+
|
|
34
|
+
// Degree-9 minimax coefficients: sin(x) ≈ x + c3*x³ + c5*x⁵ + c7*x⁷ + c9*x⁹
|
|
35
|
+
nk_f32_t const coeff_9 = +2.7557319224e-6f;
|
|
36
|
+
nk_f32_t const coeff_7 = -1.9841269841e-4f;
|
|
37
|
+
nk_f32_t const coeff_5 = +8.3333293855e-3f;
|
|
38
|
+
nk_f32_t const coeff_3 = -1.6666666641e-1f;
|
|
39
|
+
|
|
40
|
+
// Compute (multiple_of_pi) = round(angle / π)
|
|
41
|
+
nk_f32_t const quotient = angle_radians * pi_reciprocal;
|
|
42
|
+
int const multiple_of_pi = (int)(quotient < 0 ? quotient - 0.5f : quotient + 0.5f);
|
|
43
|
+
|
|
44
|
+
// Cody-Waite range reduction: angle = angle_radians - multiple * (pi_hi + pi_lo)
|
|
45
|
+
nk_f32_t angle = angle_radians - multiple_of_pi * pi_hi;
|
|
46
|
+
angle -= multiple_of_pi * pi_lo;
|
|
47
|
+
nk_f32_t const angle_squared = angle * angle;
|
|
48
|
+
nk_f32_t const angle_cubed = angle * angle_squared;
|
|
49
|
+
|
|
50
|
+
// Degree-9 polynomial via Horner's method
|
|
51
|
+
nk_f32_t polynomial = coeff_9;
|
|
52
|
+
polynomial = polynomial * angle_squared + coeff_7;
|
|
53
|
+
polynomial = polynomial * angle_squared + coeff_5;
|
|
54
|
+
polynomial = polynomial * angle_squared + coeff_3;
|
|
55
|
+
nk_f32_t result = polynomial * angle_cubed + angle;
|
|
56
|
+
|
|
57
|
+
// If multiple_of_pi is odd, flip the sign of the result
|
|
58
|
+
if ((multiple_of_pi & 1) != 0) result = -result;
|
|
59
|
+
return result;
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
/**
|
|
63
|
+
* @brief Computes an approximate cosine of the given angle in radians with @b 3-ULP error bound for [-2π, 2π].
|
|
64
|
+
* @see Based on @b `xfastcosf_u3500` in SLEEF library.
|
|
65
|
+
* @param[in] angle The input angle in radians.
|
|
66
|
+
* @return The approximate cosine of the input angle in [-1, 1] range.
|
|
67
|
+
*/
|
|
68
|
+
NK_PUBLIC nk_f32_t nk_f32_cos(nk_f32_t const angle_radians) {
|
|
69
|
+
|
|
70
|
+
// Cody-Waite constants for argument reduction (pi split into hi + lo)
|
|
71
|
+
nk_f32_t const pi_hi = 3.1415927f;
|
|
72
|
+
nk_f32_t const pi_lo = -8.742278e-8f;
|
|
73
|
+
nk_f32_t const pi_half = 1.57079632679489661923f; /// π/2
|
|
74
|
+
nk_f32_t const pi_reciprocal = 0.31830988618379067154f; /// 1/π
|
|
75
|
+
|
|
76
|
+
// Degree-9 minimax coefficients: sin(x) ≈ x + c3*x³ + c5*x⁵ + c7*x⁷ + c9*x⁹
|
|
77
|
+
nk_f32_t const coeff_9 = +2.7557319224e-6f;
|
|
78
|
+
nk_f32_t const coeff_7 = -1.9841269841e-4f;
|
|
79
|
+
nk_f32_t const coeff_5 = +8.3333293855e-3f;
|
|
80
|
+
nk_f32_t const coeff_3 = -1.6666666641e-1f;
|
|
81
|
+
|
|
82
|
+
// Compute (multiple_of_pi) = round(angle / π - 0.5)
|
|
83
|
+
nk_f32_t const quotient = angle_radians * pi_reciprocal - 0.5f;
|
|
84
|
+
int const multiple_of_pi = (int)(quotient < 0 ? quotient - 0.5f : quotient + 0.5f);
|
|
85
|
+
|
|
86
|
+
// Cody-Waite range reduction: angle = angle_radians - (multiple * pi + pi/2)
|
|
87
|
+
nk_f32_t const offset = pi_half + multiple_of_pi * pi_hi;
|
|
88
|
+
nk_f32_t angle = angle_radians - offset;
|
|
89
|
+
angle -= multiple_of_pi * pi_lo;
|
|
90
|
+
nk_f32_t const angle_squared = angle * angle;
|
|
91
|
+
nk_f32_t const angle_cubed = angle * angle_squared;
|
|
92
|
+
|
|
93
|
+
// Degree-9 polynomial via Horner's method
|
|
94
|
+
nk_f32_t polynomial = coeff_9;
|
|
95
|
+
polynomial = polynomial * angle_squared + coeff_7;
|
|
96
|
+
polynomial = polynomial * angle_squared + coeff_5;
|
|
97
|
+
polynomial = polynomial * angle_squared + coeff_3;
|
|
98
|
+
nk_f32_t result = polynomial * angle_cubed + angle;
|
|
99
|
+
|
|
100
|
+
// If multiple_of_pi is even, flip the sign of the result
|
|
101
|
+
if ((multiple_of_pi & 1) == 0) result = -result;
|
|
102
|
+
return result;
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
/**
|
|
106
|
+
* @brief Computes the arc-tangent of a value with @b 0-ULP error bound.
|
|
107
|
+
* @see Based on @b `xatanf` in SLEEF library.
|
|
108
|
+
* @param input The input value.
|
|
109
|
+
* @return The arc-tangent of the input value in [-π/2, π/2] radians range.
|
|
110
|
+
*/
|
|
111
|
+
NK_PUBLIC nk_f32_t nk_f32_atan(nk_f32_t const input) {
|
|
112
|
+
// Polynomial coefficients for atan approximation
|
|
113
|
+
nk_f32_t const coeff_8 = -0.333331018686294555664062f;
|
|
114
|
+
nk_f32_t const coeff_7 = +0.199926957488059997558594f;
|
|
115
|
+
nk_f32_t const coeff_6 = -0.142027363181114196777344f;
|
|
116
|
+
nk_f32_t const coeff_5 = +0.106347933411598205566406f;
|
|
117
|
+
nk_f32_t const coeff_4 = -0.0748900920152664184570312f;
|
|
118
|
+
nk_f32_t const coeff_3 = +0.0425049886107444763183594f;
|
|
119
|
+
nk_f32_t const coeff_2 = -0.0159569028764963150024414f;
|
|
120
|
+
nk_f32_t const coeff_1 = +0.00282363896258175373077393f;
|
|
121
|
+
|
|
122
|
+
// Quadrant adjustment
|
|
123
|
+
int quadrant = 0;
|
|
124
|
+
nk_f32_t value = input;
|
|
125
|
+
if (value < 0.0f) value = -value, quadrant |= 2;
|
|
126
|
+
if (value > 1.0f) value = 1.0f / value, quadrant |= 1;
|
|
127
|
+
|
|
128
|
+
// Argument reduction
|
|
129
|
+
nk_f32_t const value_squared = value * value;
|
|
130
|
+
nk_f32_t const value_cubed = value * value_squared;
|
|
131
|
+
|
|
132
|
+
// Polynomial evaluation using FMA for improved precision
|
|
133
|
+
nk_f32_t polynomial = coeff_1;
|
|
134
|
+
polynomial = nk_f32_fma_serial(polynomial, value_squared, coeff_2);
|
|
135
|
+
polynomial = nk_f32_fma_serial(polynomial, value_squared, coeff_3);
|
|
136
|
+
polynomial = nk_f32_fma_serial(polynomial, value_squared, coeff_4);
|
|
137
|
+
polynomial = nk_f32_fma_serial(polynomial, value_squared, coeff_5);
|
|
138
|
+
polynomial = nk_f32_fma_serial(polynomial, value_squared, coeff_6);
|
|
139
|
+
polynomial = nk_f32_fma_serial(polynomial, value_squared, coeff_7);
|
|
140
|
+
polynomial = nk_f32_fma_serial(polynomial, value_squared, coeff_8);
|
|
141
|
+
|
|
142
|
+
// Adjust for quadrant
|
|
143
|
+
nk_f32_t result = nk_f32_fma_serial(polynomial, value_cubed, value);
|
|
144
|
+
nk_f32_t const pi_half = 1.5707963267948966f; // π/2
|
|
145
|
+
if ((quadrant & 1) != 0) result = pi_half - result;
|
|
146
|
+
if ((quadrant & 2) != 0) result = -result;
|
|
147
|
+
return result;
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
/**
 * @brief Classes a floating-point value can fall into.
 *
 * Every enumerator other than `nk_float_unknown_k` occupies its own bit, so several
 * classes can be OR-ed together into a mask and tested with a bitwise AND.
 */
typedef enum nk_float_class_t {
    nk_float_unknown_k = 0,
    nk_float_nan_k = (1 << 1),

    // Non-negative values
    nk_float_positive_zero_k = (1 << 10),
    nk_float_positive_finite_k = (1 << 11),
    nk_float_positive_infinity_k = (1 << 12),

    // Negative values
    nk_float_negative_zero_k = (1 << 20),
    nk_float_negative_finite_k = (1 << 21),
    nk_float_negative_infinity_k = (1 << 22),
} nk_float_class_t;
|
|
163
|
+
|
|
164
|
+
/**
 * @brief Classifies a single-precision value by inspecting its bit pattern.
 * @param[in] input The value to classify.
 * @return The zero, infinity, NaN, or finite class matching the input; never `nk_float_unknown_k`.
 */
NK_PUBLIC nk_float_class_t nk_f32_classify(nk_f32_t const input) {
    nk_fui32_t pun;
    pun.f = input;

    // Values with exactly one bit pattern each
    switch (pun.u) {
    case 0x00000000u: return nk_float_positive_zero_k;     // +0
    case 0x80000000u: return nk_float_negative_zero_k;     // -0
    case 0x7F800000u: return nk_float_positive_infinity_k; // +∞
    case 0xFF800000u: return nk_float_negative_infinity_k; // -∞
    default: break;
    }

    // NaN: all exponent bits set together with a non-zero mantissa
    nk_u32_t const exponent_bits = pun.u & 0x7F800000u;
    nk_u32_t const mantissa_bits = pun.u & 0x007FFFFFu;
    if (exponent_bits == 0x7F800000u && mantissa_bits != 0) return nk_float_nan_k;

    // Everything left is a non-zero finite number, told apart by its sign
    if (input > 0.0f) return nk_float_positive_finite_k;
    return nk_float_negative_finite_k;
}
|
|
184
|
+
|
|
185
|
+
/**
 * @brief Checks whether a class shares at least one bit with a mask of classes.
 * @param[in] class_ The class to test.
 * @param[in] belongs_to Bitwise OR of the `nk_float_class_t` values to test against.
 * @return 1 if any bit of @p class_ is set in @p belongs_to, 0 otherwise.
 */
NK_PUBLIC int nk_float_class_belongs_to(nk_float_class_t const class_, int const belongs_to) {
    return (class_ & belongs_to) ? 1 : 0;
}
|
|
188
|
+
|
|
189
|
+
/**
|
|
190
|
+
* @brief Computes the arc-tangent of (y/x) with @b 0-ULP error bound.
|
|
191
|
+
* @see Based on @b `xatan2f` in SLEEF library.
|
|
192
|
+
* @param y_input The input sine value.
|
|
193
|
+
* @param x_input The input cosine value.
|
|
194
|
+
* @return The arc-tangent of (y_input/x_input) in [-π, π] radians range.
|
|
195
|
+
*/
|
|
196
|
+
NK_PUBLIC nk_f32_t nk_f32_atan2(nk_f32_t const y_input, nk_f32_t const x_input) {
|
|
197
|
+
|
|
198
|
+
// Polynomial coefficients for atan2 approximation
|
|
199
|
+
nk_f32_t const coeff_8 = -0.333331018686294555664062f;
|
|
200
|
+
nk_f32_t const coeff_7 = +0.199926957488059997558594f;
|
|
201
|
+
nk_f32_t const coeff_6 = -0.142027363181114196777344f;
|
|
202
|
+
nk_f32_t const coeff_5 = +0.106347933411598205566406f;
|
|
203
|
+
nk_f32_t const coeff_4 = -0.0748900920152664184570312f;
|
|
204
|
+
nk_f32_t const coeff_3 = +0.0425049886107444763183594f;
|
|
205
|
+
nk_f32_t const coeff_2 = -0.0159569028764963150024414f;
|
|
206
|
+
nk_f32_t const coeff_1 = +0.00282363896258175373077393f;
|
|
207
|
+
|
|
208
|
+
// Convert to bit representation
|
|
209
|
+
nk_fui32_t const x_bits = *(nk_fui32_t *)&x_input;
|
|
210
|
+
nk_fui32_t const y_bits = *(nk_fui32_t *)&y_input;
|
|
211
|
+
nk_fui32_t x_abs, y_abs;
|
|
212
|
+
y_abs.u = y_bits.u & 0x7FFFFFFFu;
|
|
213
|
+
|
|
214
|
+
// Quadrant adjustment
|
|
215
|
+
int quadrant = 0;
|
|
216
|
+
if (x_input < 0.0f) { x_abs.f = -x_input, quadrant = -2; }
|
|
217
|
+
else { x_abs.f = x_input; }
|
|
218
|
+
// Ensure proper fraction where the numerator is smaller than the denominator
|
|
219
|
+
if (y_abs.f > x_abs.f) {
|
|
220
|
+
nk_f32_t temp = x_abs.f;
|
|
221
|
+
x_abs.f = y_abs.f;
|
|
222
|
+
y_abs.f = -temp;
|
|
223
|
+
quadrant += 1;
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
// Argument reduction
|
|
227
|
+
nk_f32_t const ratio = y_abs.f / x_abs.f;
|
|
228
|
+
nk_f32_t const ratio_squared = ratio * ratio;
|
|
229
|
+
nk_f32_t const ratio_cubed = ratio * ratio_squared;
|
|
230
|
+
|
|
231
|
+
// Polynomial evaluation using FMA for improved precision
|
|
232
|
+
nk_f32_t polynomial = coeff_1;
|
|
233
|
+
polynomial = nk_f32_fma_serial(polynomial, ratio_squared, coeff_2);
|
|
234
|
+
polynomial = nk_f32_fma_serial(polynomial, ratio_squared, coeff_3);
|
|
235
|
+
polynomial = nk_f32_fma_serial(polynomial, ratio_squared, coeff_4);
|
|
236
|
+
polynomial = nk_f32_fma_serial(polynomial, ratio_squared, coeff_5);
|
|
237
|
+
polynomial = nk_f32_fma_serial(polynomial, ratio_squared, coeff_6);
|
|
238
|
+
polynomial = nk_f32_fma_serial(polynomial, ratio_squared, coeff_7);
|
|
239
|
+
polynomial = nk_f32_fma_serial(polynomial, ratio_squared, coeff_8);
|
|
240
|
+
|
|
241
|
+
// Compute the result using FMA
|
|
242
|
+
nk_f32_t const pi_half = 1.5707963267948966f; // π/2
|
|
243
|
+
nk_f32_t result = nk_f32_fma_serial(polynomial, ratio_cubed, ratio);
|
|
244
|
+
result = nk_f32_fma_serial((nk_f32_t)quadrant, pi_half, result); // quadrant * (π/2)
|
|
245
|
+
|
|
246
|
+
// Adjust sign
|
|
247
|
+
nk_i32_t const negative_zero = 0x80000000;
|
|
248
|
+
nk_fui32_t result_bits;
|
|
249
|
+
result_bits.f = result;
|
|
250
|
+
result_bits.u ^= x_bits.u & negative_zero;
|
|
251
|
+
result_bits.u ^= y_bits.u & negative_zero;
|
|
252
|
+
return result_bits.f;
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
/**
|
|
256
|
+
* @brief Computes the sine of the given angle in radians with @b 0-ULP error bound in [-2π, 2π].
|
|
257
|
+
* @see Based on @b `xsin` in SLEEF library.
|
|
258
|
+
* @param[in] angle The input angle in radians.
|
|
259
|
+
* @return The approximate cosine of the input angle.
|
|
260
|
+
*/
|
|
261
|
+
NK_PUBLIC nk_f64_t nk_f64_sin(nk_f64_t const angle_radians) {
|
|
262
|
+
|
|
263
|
+
// Constants for argument reduction
|
|
264
|
+
nk_f64_t const pi_high = 3.141592653589793116; // High-digits part of π
|
|
265
|
+
nk_f64_t const pi_low = 1.2246467991473532072e-16; // Low-digits part of π
|
|
266
|
+
nk_f64_t const pi_reciprocal = 0.318309886183790671537767526745028724; // 1/π
|
|
267
|
+
nk_i64_t const negative_zero = 0x8000000000000000LL; // Hexadecimal value of -0.0 in IEEE 754
|
|
268
|
+
|
|
269
|
+
// Polynomial coefficients for sine/cosine approximation (minimax polynomial)
|
|
270
|
+
nk_f64_t const coeff_0 = +0.00833333333333332974823815;
|
|
271
|
+
nk_f64_t const coeff_1 = -0.000198412698412696162806809;
|
|
272
|
+
nk_f64_t const coeff_2 = +2.75573192239198747630416e-06;
|
|
273
|
+
nk_f64_t const coeff_3 = -2.50521083763502045810755e-08;
|
|
274
|
+
nk_f64_t const coeff_4 = +1.60590430605664501629054e-10;
|
|
275
|
+
nk_f64_t const coeff_5 = -7.64712219118158833288484e-13;
|
|
276
|
+
nk_f64_t const coeff_6 = +2.81009972710863200091251e-15;
|
|
277
|
+
nk_f64_t const coeff_7 = -7.97255955009037868891952e-18;
|
|
278
|
+
nk_f64_t const coeff_8 = -0.166666666666666657414808;
|
|
279
|
+
|
|
280
|
+
// Compute (multiple_of_pi) = round(angle / π)
|
|
281
|
+
nk_f64_t const quotient = angle_radians * pi_reciprocal;
|
|
282
|
+
int const multiple_of_pi = (int)(quotient < 0 ? quotient - 0.5 : quotient + 0.5);
|
|
283
|
+
|
|
284
|
+
// Reduce the angle to: (angle - (multiple_of_pi * π)) ∈ [0, π]
|
|
285
|
+
nk_f64_t angle = angle_radians;
|
|
286
|
+
angle = angle - (multiple_of_pi * pi_high);
|
|
287
|
+
angle = angle - (multiple_of_pi * pi_low);
|
|
288
|
+
if ((multiple_of_pi & 1) != 0) angle = -angle;
|
|
289
|
+
nk_f64_t const angle_squared = angle * angle;
|
|
290
|
+
nk_f64_t const angle_cubed = angle * angle_squared;
|
|
291
|
+
nk_f64_t const angle_quartic = angle_squared * angle_squared;
|
|
292
|
+
nk_f64_t const angle_octic = angle_quartic * angle_quartic;
|
|
293
|
+
|
|
294
|
+
// Compute higher-degree polynomial terms using FMA
|
|
295
|
+
nk_f64_t const poly_67 = nk_f64_fma_serial(angle_squared, coeff_7, coeff_6);
|
|
296
|
+
nk_f64_t const poly_45 = nk_f64_fma_serial(angle_squared, coeff_5, coeff_4);
|
|
297
|
+
nk_f64_t const poly_4567 = nk_f64_fma_serial(angle_quartic, poly_67, poly_45);
|
|
298
|
+
|
|
299
|
+
// Compute lower-degree polynomial terms using FMA
|
|
300
|
+
nk_f64_t const poly_23 = nk_f64_fma_serial(angle_squared, coeff_3, coeff_2);
|
|
301
|
+
nk_f64_t const poly_01 = nk_f64_fma_serial(angle_squared, coeff_1, coeff_0);
|
|
302
|
+
nk_f64_t const poly_0123 = nk_f64_fma_serial(angle_quartic, poly_23, poly_01);
|
|
303
|
+
|
|
304
|
+
// Combine polynomial terms using FMA
|
|
305
|
+
nk_f64_t result = nk_f64_fma_serial(angle_octic, poly_4567, poly_0123);
|
|
306
|
+
result = nk_f64_fma_serial(result, angle_squared, coeff_8);
|
|
307
|
+
result = nk_f64_fma_serial(result, angle_cubed, angle);
|
|
308
|
+
|
|
309
|
+
// Handle the special case of negative zero input
|
|
310
|
+
nk_fui64_t converter;
|
|
311
|
+
converter.f = angle_radians;
|
|
312
|
+
if ((nk_i64_t)converter.u == negative_zero) result = angle;
|
|
313
|
+
return result;
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
/**
|
|
317
|
+
* @brief Computes the cosine of the given angle in radians with @b 0-ULP error bound in [-2π, 2π].
|
|
318
|
+
* @see Based on @b `xcos` in SLEEF library.
|
|
319
|
+
* @param[in] angle The input angle in radians.
|
|
320
|
+
* @return The approximate cosine of the input angle in [-1, 1] range.
|
|
321
|
+
*/
|
|
322
|
+
NK_PUBLIC nk_f64_t nk_f64_cos(nk_f64_t const angle_radians) {
|
|
323
|
+
|
|
324
|
+
// Constants for argument reduction
|
|
325
|
+
nk_f64_t const pi_high_half = 3.141592653589793116 * 0.5; // High-digits part of π
|
|
326
|
+
nk_f64_t const pi_low_half = 1.2246467991473532072e-16 * 0.5; // Low-digits part of π
|
|
327
|
+
nk_f64_t const pi_reciprocal = 0.318309886183790671537767526745028724; // 1/π
|
|
328
|
+
|
|
329
|
+
// Polynomial coefficients for sine/cosine approximation (minimax polynomial)
|
|
330
|
+
nk_f64_t const coeff_0 = +0.00833333333333332974823815;
|
|
331
|
+
nk_f64_t const coeff_1 = -0.000198412698412696162806809;
|
|
332
|
+
nk_f64_t const coeff_2 = +2.75573192239198747630416e-06;
|
|
333
|
+
nk_f64_t const coeff_3 = -2.50521083763502045810755e-08;
|
|
334
|
+
nk_f64_t const coeff_4 = +1.60590430605664501629054e-10;
|
|
335
|
+
nk_f64_t const coeff_5 = -7.64712219118158833288484e-13;
|
|
336
|
+
nk_f64_t const coeff_6 = +2.81009972710863200091251e-15;
|
|
337
|
+
nk_f64_t const coeff_7 = -7.97255955009037868891952e-18;
|
|
338
|
+
nk_f64_t const coeff_8 = -0.166666666666666657414808;
|
|
339
|
+
|
|
340
|
+
// Compute (multiple_of_pi) = 2 * round(angle / π - 0.5) + 1
|
|
341
|
+
nk_f64_t const quotient = angle_radians * pi_reciprocal - 0.5;
|
|
342
|
+
int const multiple_of_pi = 2 * (int)(quotient < 0 ? quotient - 0.5 : quotient + 0.5) + 1;
|
|
343
|
+
|
|
344
|
+
// Reduce the angle to: (angle - (multiple_of_pi * π)) in [-π/2, π/2]
|
|
345
|
+
nk_f64_t angle = angle_radians;
|
|
346
|
+
angle = angle - (multiple_of_pi * pi_high_half);
|
|
347
|
+
angle = angle - (multiple_of_pi * pi_low_half);
|
|
348
|
+
if ((multiple_of_pi & 2) == 0) angle = -angle;
|
|
349
|
+
nk_f64_t const angle_squared = angle * angle;
|
|
350
|
+
nk_f64_t const angle_cubed = angle * angle_squared;
|
|
351
|
+
nk_f64_t const angle_quartic = angle_squared * angle_squared;
|
|
352
|
+
nk_f64_t const angle_octic = angle_quartic * angle_quartic;
|
|
353
|
+
|
|
354
|
+
// Compute higher-degree polynomial terms using FMA
|
|
355
|
+
nk_f64_t const poly_67 = nk_f64_fma_serial(angle_squared, coeff_7, coeff_6);
|
|
356
|
+
nk_f64_t const poly_45 = nk_f64_fma_serial(angle_squared, coeff_5, coeff_4);
|
|
357
|
+
nk_f64_t const poly_4567 = nk_f64_fma_serial(angle_quartic, poly_67, poly_45);
|
|
358
|
+
|
|
359
|
+
// Compute lower-degree polynomial terms using FMA
|
|
360
|
+
nk_f64_t const poly_23 = nk_f64_fma_serial(angle_squared, coeff_3, coeff_2);
|
|
361
|
+
nk_f64_t const poly_01 = nk_f64_fma_serial(angle_squared, coeff_1, coeff_0);
|
|
362
|
+
nk_f64_t const poly_0123 = nk_f64_fma_serial(angle_quartic, poly_23, poly_01);
|
|
363
|
+
|
|
364
|
+
// Combine polynomial terms using FMA
|
|
365
|
+
nk_f64_t result = nk_f64_fma_serial(angle_octic, poly_4567, poly_0123);
|
|
366
|
+
result = nk_f64_fma_serial(result, angle_squared, coeff_8);
|
|
367
|
+
result = nk_f64_fma_serial(result, angle_cubed, angle);
|
|
368
|
+
return result;
|
|
369
|
+
}
|
|
370
|
+
|
|
371
|
+
/**
|
|
372
|
+
* @brief Computes the arc-tangent of a value with @b 0-ULP error bound.
|
|
373
|
+
* @see Based on @b `xatan` in SLEEF library.
|
|
374
|
+
* @param input The input value.
|
|
375
|
+
* @return The arc-tangent of the input value in [-π/2, π/2] radians range.
|
|
376
|
+
*/
|
|
377
|
+
NK_PUBLIC nk_f64_t nk_f64_atan(nk_f64_t const input) {
|
|
378
|
+
// Polynomial coefficients for atan approximation
|
|
379
|
+
nk_f64_t const coeff_19 = -1.88796008463073496563746e-05;
|
|
380
|
+
nk_f64_t const coeff_18 = +0.000209850076645816976906797;
|
|
381
|
+
nk_f64_t const coeff_17 = -0.00110611831486672482563471;
|
|
382
|
+
nk_f64_t const coeff_16 = +0.00370026744188713119232403;
|
|
383
|
+
nk_f64_t const coeff_15 = -0.00889896195887655491740809;
|
|
384
|
+
nk_f64_t const coeff_14 = +0.016599329773529201970117;
|
|
385
|
+
nk_f64_t const coeff_13 = -0.0254517624932312641616861;
|
|
386
|
+
nk_f64_t const coeff_12 = +0.0337852580001353069993897;
|
|
387
|
+
nk_f64_t const coeff_11 = -0.0407629191276836500001934;
|
|
388
|
+
nk_f64_t const coeff_10 = +0.0466667150077840625632675;
|
|
389
|
+
nk_f64_t const coeff_9 = -0.0523674852303482457616113;
|
|
390
|
+
nk_f64_t const coeff_8 = +0.0587666392926673580854313;
|
|
391
|
+
nk_f64_t const coeff_7 = -0.0666573579361080525984562;
|
|
392
|
+
nk_f64_t const coeff_6 = +0.0769219538311769618355029;
|
|
393
|
+
nk_f64_t const coeff_5 = -0.090908995008245008229153;
|
|
394
|
+
nk_f64_t const coeff_4 = +0.111111105648261418443745;
|
|
395
|
+
nk_f64_t const coeff_3 = -0.14285714266771329383765;
|
|
396
|
+
nk_f64_t const coeff_2 = +0.199999999996591265594148;
|
|
397
|
+
nk_f64_t const coeff_1 = -0.333333333333311110369124;
|
|
398
|
+
|
|
399
|
+
// Quadrant adjustment
|
|
400
|
+
int quadrant = 0;
|
|
401
|
+
nk_f64_t value = input;
|
|
402
|
+
if (value < 0) value = -value, quadrant |= 2;
|
|
403
|
+
if (value > 1) value = 1.0 / value, quadrant |= 1;
|
|
404
|
+
nk_f64_t const value_squared = value * value;
|
|
405
|
+
nk_f64_t const value_cubed = value * value_squared;
|
|
406
|
+
|
|
407
|
+
// Polynomial evaluation using FMA for improved precision
|
|
408
|
+
nk_f64_t polynomial = coeff_19;
|
|
409
|
+
polynomial = nk_f64_fma_serial(polynomial, value_squared, coeff_18);
|
|
410
|
+
polynomial = nk_f64_fma_serial(polynomial, value_squared, coeff_17);
|
|
411
|
+
polynomial = nk_f64_fma_serial(polynomial, value_squared, coeff_16);
|
|
412
|
+
polynomial = nk_f64_fma_serial(polynomial, value_squared, coeff_15);
|
|
413
|
+
polynomial = nk_f64_fma_serial(polynomial, value_squared, coeff_14);
|
|
414
|
+
polynomial = nk_f64_fma_serial(polynomial, value_squared, coeff_13);
|
|
415
|
+
polynomial = nk_f64_fma_serial(polynomial, value_squared, coeff_12);
|
|
416
|
+
polynomial = nk_f64_fma_serial(polynomial, value_squared, coeff_11);
|
|
417
|
+
polynomial = nk_f64_fma_serial(polynomial, value_squared, coeff_10);
|
|
418
|
+
polynomial = nk_f64_fma_serial(polynomial, value_squared, coeff_9);
|
|
419
|
+
polynomial = nk_f64_fma_serial(polynomial, value_squared, coeff_8);
|
|
420
|
+
polynomial = nk_f64_fma_serial(polynomial, value_squared, coeff_7);
|
|
421
|
+
polynomial = nk_f64_fma_serial(polynomial, value_squared, coeff_6);
|
|
422
|
+
polynomial = nk_f64_fma_serial(polynomial, value_squared, coeff_5);
|
|
423
|
+
polynomial = nk_f64_fma_serial(polynomial, value_squared, coeff_4);
|
|
424
|
+
polynomial = nk_f64_fma_serial(polynomial, value_squared, coeff_3);
|
|
425
|
+
polynomial = nk_f64_fma_serial(polynomial, value_squared, coeff_2);
|
|
426
|
+
polynomial = nk_f64_fma_serial(polynomial, value_squared, coeff_1);
|
|
427
|
+
|
|
428
|
+
// Adjust for quadrant
|
|
429
|
+
nk_f64_t const pi_half = 1.5707963267948966; // π/2
|
|
430
|
+
nk_f64_t result = nk_f64_fma_serial(polynomial, value_cubed, value);
|
|
431
|
+
if (quadrant & 1) result = pi_half - result;
|
|
432
|
+
if (quadrant & 2) result = -result;
|
|
433
|
+
|
|
434
|
+
return result;
|
|
435
|
+
}
|
|
436
|
+
|
|
437
|
+
/**
|
|
438
|
+
* @brief Computes the arc-tangent of (y/x) with @b 0-ULP error bound.
|
|
439
|
+
* @see Based on @b `xatan2` in SLEEF library.
|
|
440
|
+
* @param y_input The input sine value.
|
|
441
|
+
* @param x_input The input cosine value.
|
|
442
|
+
* @return The arc-tangent of (y_input/x_input) in [-π/2, π/2] radians range.
|
|
443
|
+
*/
|
|
444
|
+
NK_PUBLIC nk_f64_t nk_f64_atan2(nk_f64_t const y_input, nk_f64_t const x_input) {
|
|
445
|
+
// Polynomial coefficients for atan2 approximation
|
|
446
|
+
nk_f64_t const coeff_19 = -1.88796008463073496563746e-05;
|
|
447
|
+
nk_f64_t const coeff_18 = +0.000209850076645816976906797;
|
|
448
|
+
nk_f64_t const coeff_17 = -0.00110611831486672482563471;
|
|
449
|
+
nk_f64_t const coeff_16 = +0.00370026744188713119232403;
|
|
450
|
+
nk_f64_t const coeff_15 = -0.00889896195887655491740809;
|
|
451
|
+
nk_f64_t const coeff_14 = +0.016599329773529201970117;
|
|
452
|
+
nk_f64_t const coeff_13 = -0.0254517624932312641616861;
|
|
453
|
+
nk_f64_t const coeff_12 = +0.0337852580001353069993897;
|
|
454
|
+
nk_f64_t const coeff_11 = -0.0407629191276836500001934;
|
|
455
|
+
nk_f64_t const coeff_10 = +0.0466667150077840625632675;
|
|
456
|
+
nk_f64_t const coeff_9 = -0.0523674852303482457616113;
|
|
457
|
+
nk_f64_t const coeff_8 = +0.0587666392926673580854313;
|
|
458
|
+
nk_f64_t const coeff_7 = -0.0666573579361080525984562;
|
|
459
|
+
nk_f64_t const coeff_6 = +0.0769219538311769618355029;
|
|
460
|
+
nk_f64_t const coeff_5 = -0.090908995008245008229153;
|
|
461
|
+
nk_f64_t const coeff_4 = +0.111111105648261418443745;
|
|
462
|
+
nk_f64_t const coeff_3 = -0.14285714266771329383765;
|
|
463
|
+
nk_f64_t const coeff_2 = +0.199999999996591265594148;
|
|
464
|
+
nk_f64_t const coeff_1 = -0.333333333333311110369124;
|
|
465
|
+
|
|
466
|
+
nk_fui64_t x_bits, y_bits;
|
|
467
|
+
x_bits.f = x_input, y_bits.f = y_input;
|
|
468
|
+
nk_fui64_t x_abs, y_abs;
|
|
469
|
+
y_abs.u = y_bits.u & 0x7FFFFFFFFFFFFFFFull;
|
|
470
|
+
|
|
471
|
+
// Quadrant adjustment
|
|
472
|
+
int quadrant = 0;
|
|
473
|
+
if (x_input < 0) { x_abs.f = -x_input, quadrant = -2; }
|
|
474
|
+
else { x_abs.f = x_input; }
|
|
475
|
+
// Now make sure its proper fraction, where the nominator is smaller than the denominator,
|
|
476
|
+
// otherwise swap the absolute values that we will use down the road, but keep the `x_bits` and `y_bits`
|
|
477
|
+
// as is for final qdrant re-adjustment.
|
|
478
|
+
if (y_abs.f > x_abs.f) {
|
|
479
|
+
nk_f64_t temp = x_abs.f;
|
|
480
|
+
x_abs.f = y_abs.f;
|
|
481
|
+
y_abs.f = -temp;
|
|
482
|
+
quadrant += 1;
|
|
483
|
+
}
|
|
484
|
+
|
|
485
|
+
// Argument reduction
|
|
486
|
+
nk_f64_t const ratio = y_abs.f / x_abs.f;
|
|
487
|
+
nk_f64_t const ratio_squared = ratio * ratio;
|
|
488
|
+
nk_f64_t const ratio_cubed = ratio * ratio_squared;
|
|
489
|
+
|
|
490
|
+
// Polynomial evaluation using FMA for improved precision
|
|
491
|
+
nk_f64_t polynomial = nk_f64_fma_serial(coeff_19, ratio_squared, coeff_18);
|
|
492
|
+
polynomial = nk_f64_fma_serial(polynomial, ratio_squared, coeff_17);
|
|
493
|
+
polynomial = nk_f64_fma_serial(polynomial, ratio_squared, coeff_16);
|
|
494
|
+
polynomial = nk_f64_fma_serial(polynomial, ratio_squared, coeff_15);
|
|
495
|
+
polynomial = nk_f64_fma_serial(polynomial, ratio_squared, coeff_14);
|
|
496
|
+
polynomial = nk_f64_fma_serial(polynomial, ratio_squared, coeff_13);
|
|
497
|
+
polynomial = nk_f64_fma_serial(polynomial, ratio_squared, coeff_12);
|
|
498
|
+
polynomial = nk_f64_fma_serial(polynomial, ratio_squared, coeff_11);
|
|
499
|
+
polynomial = nk_f64_fma_serial(polynomial, ratio_squared, coeff_10);
|
|
500
|
+
polynomial = nk_f64_fma_serial(polynomial, ratio_squared, coeff_9);
|
|
501
|
+
polynomial = nk_f64_fma_serial(polynomial, ratio_squared, coeff_8);
|
|
502
|
+
polynomial = nk_f64_fma_serial(polynomial, ratio_squared, coeff_7);
|
|
503
|
+
polynomial = nk_f64_fma_serial(polynomial, ratio_squared, coeff_6);
|
|
504
|
+
polynomial = nk_f64_fma_serial(polynomial, ratio_squared, coeff_5);
|
|
505
|
+
polynomial = nk_f64_fma_serial(polynomial, ratio_squared, coeff_4);
|
|
506
|
+
polynomial = nk_f64_fma_serial(polynomial, ratio_squared, coeff_3);
|
|
507
|
+
polynomial = nk_f64_fma_serial(polynomial, ratio_squared, coeff_2);
|
|
508
|
+
polynomial = nk_f64_fma_serial(polynomial, ratio_squared, coeff_1);
|
|
509
|
+
|
|
510
|
+
// Adjust for quadrant
|
|
511
|
+
nk_f64_t const pi = 3.14159265358979323846; // π
|
|
512
|
+
nk_f64_t const pi_half = 1.5707963267948966; // π/2
|
|
513
|
+
nk_f64_t const pi_quarter = 0.7853981633974483; // π/4
|
|
514
|
+
nk_u64_t const negative_zero = 0x8000000000000000ull;
|
|
515
|
+
nk_u64_t const positive_infinity = 0x7FF0000000000000ull;
|
|
516
|
+
nk_u64_t const negative_infinity = 0xFFF0000000000000ull;
|
|
517
|
+
nk_f64_t result = nk_f64_fma_serial(polynomial, ratio_cubed, ratio);
|
|
518
|
+
result = nk_f64_fma_serial((nk_f64_t)quadrant, pi_half, result);
|
|
519
|
+
|
|
520
|
+
// Special cases handling using bit reinterpretation
|
|
521
|
+
int const x_is_inf = (x_bits.u == positive_infinity) | (x_bits.u == negative_infinity);
|
|
522
|
+
int const y_is_inf = (y_bits.u == positive_infinity) | (y_bits.u == negative_infinity);
|
|
523
|
+
|
|
524
|
+
// Perform the sign multiplication and infer the right quadrant
|
|
525
|
+
nk_fui64_t result_bits;
|
|
526
|
+
result_bits.f = result;
|
|
527
|
+
// Sign transfer:
|
|
528
|
+
result_bits.u ^= x_bits.u & negative_zero;
|
|
529
|
+
// Quadrant adjustments:
|
|
530
|
+
if (x_is_inf | (x_bits.f == 0)) result_bits.f = pi_half - (x_is_inf ? (x_bits.f < 0 ? pi_half : 0) : 0);
|
|
531
|
+
if (y_is_inf) result_bits.f = pi_half - (x_is_inf ? (x_bits.f < 0 ? pi_half : pi_quarter) : 0);
|
|
532
|
+
if (y_bits.f == 0) result_bits.f = (x_bits.f < 0 ? pi : 0);
|
|
533
|
+
if (x_is_inf | y_is_inf) result_bits.u = 0x7FF8000000000000ull;
|
|
534
|
+
// Sign transfer back:
|
|
535
|
+
else { result_bits.u ^= y_bits.u & negative_zero; }
|
|
536
|
+
return result_bits.f;
|
|
537
|
+
}
|
|
538
|
+
|
|
539
|
+
/**
|
|
540
|
+
* @brief Computes an approximate tangent of the given angle in radians with @b 3-ULP error bound for [-2π, 2π].
|
|
541
|
+
* @param[in] angle_radians The input angle in radians.
|
|
542
|
+
* @return The approximate tangent of the input angle.
|
|
543
|
+
*/
|
|
544
|
+
NK_PUBLIC nk_f32_t nk_f32_tan(nk_f32_t const angle_radians) {
|
|
545
|
+
|
|
546
|
+
// Cody-Waite constants for argument reduction
|
|
547
|
+
nk_f32_t const pi_hi = 3.1415927f;
|
|
548
|
+
nk_f32_t const pi_lo = -8.742278e-8f;
|
|
549
|
+
nk_f32_t const pi_half = 1.57079632679489661923f; /// π/2
|
|
550
|
+
nk_f32_t const pi_quarter = 0.78539816339744830962f; /// π/4
|
|
551
|
+
nk_f32_t const pi_reciprocal = 0.31830988618379067154f; /// 1/π
|
|
552
|
+
|
|
553
|
+
// Polynomial coefficients for tangent approximation (minimax polynomial)
|
|
554
|
+
nk_f32_t const coeff_7 = +0.002443315461f; /// Coefficient for x⁷ term
|
|
555
|
+
nk_f32_t const coeff_5 = +0.05338123068f; /// Coefficient for x⁵ term
|
|
556
|
+
nk_f32_t const coeff_3 = +0.3333314061f; /// Coefficient for x³ term
|
|
557
|
+
|
|
558
|
+
// Compute (multiple_of_pi) = round(angle / π)
|
|
559
|
+
nk_f32_t const quotient = angle_radians * pi_reciprocal;
|
|
560
|
+
int const multiple_of_pi = (int)(quotient < 0 ? quotient - 0.5f : quotient + 0.5f);
|
|
561
|
+
|
|
562
|
+
// Cody-Waite range reduction
|
|
563
|
+
nk_f32_t angle = angle_radians - multiple_of_pi * pi_hi;
|
|
564
|
+
angle -= multiple_of_pi * pi_lo;
|
|
565
|
+
|
|
566
|
+
// If |angle| > π/4, use tan(x) = 1/tan(π/2 - x) for better accuracy
|
|
567
|
+
int reciprocal = 0;
|
|
568
|
+
if (angle > pi_quarter) {
|
|
569
|
+
angle = pi_half - angle;
|
|
570
|
+
reciprocal = 1;
|
|
571
|
+
}
|
|
572
|
+
else if (angle < -pi_quarter) {
|
|
573
|
+
angle = -pi_half - angle;
|
|
574
|
+
reciprocal = 1;
|
|
575
|
+
}
|
|
576
|
+
|
|
577
|
+
// Compute the polynomial approximation: tan(x) ≈ x + c3 × x³ + c5 × x⁵ + c7 × x⁷
|
|
578
|
+
nk_f32_t const angle_squared = angle * angle;
|
|
579
|
+
nk_f32_t const angle_cubed = angle * angle_squared;
|
|
580
|
+
|
|
581
|
+
nk_f32_t polynomial = coeff_7;
|
|
582
|
+
polynomial = polynomial * angle_squared + coeff_5;
|
|
583
|
+
polynomial = polynomial * angle_squared + coeff_3;
|
|
584
|
+
nk_f32_t result = polynomial * angle_cubed + angle;
|
|
585
|
+
|
|
586
|
+
// Apply reciprocal if we reduced from outer region
|
|
587
|
+
if (reciprocal) result = 1.0f / result;
|
|
588
|
+
return result;
|
|
589
|
+
}
|
|
590
|
+
|
|
591
|
+
/**
|
|
592
|
+
* @brief Computes the tangent of the given angle in radians with @b 0-ULP error bound in [-2π, 2π].
|
|
593
|
+
* @param[in] angle_radians The input angle in radians.
|
|
594
|
+
* @return The approximate tangent of the input angle.
|
|
595
|
+
*/
|
|
596
|
+
NK_PUBLIC nk_f64_t nk_f64_tan(nk_f64_t const angle_radians) {
|
|
597
|
+
|
|
598
|
+
// Constants for argument reduction
|
|
599
|
+
nk_f64_t const pi_high = 3.141592653589793116; /// High-digits part of π
|
|
600
|
+
nk_f64_t const pi_low = 1.2246467991473532072e-16; /// Low-digits part of π
|
|
601
|
+
nk_f64_t const pi_half = 1.5707963267948966192313216916398; /// π/2
|
|
602
|
+
nk_f64_t const pi_quarter = 0.78539816339744830961566084581988; /// π/4
|
|
603
|
+
nk_f64_t const pi_reciprocal = 0.318309886183790671537767526745028724; /// 1/π
|
|
604
|
+
|
|
605
|
+
// Polynomial coefficients for tangent approximation (minimax polynomial)
|
|
606
|
+
nk_f64_t const coeff_13 = +0.000024030521244861858; /// Coefficient for x¹³ term
|
|
607
|
+
nk_f64_t const coeff_11 = +0.00035923150434482523; /// Coefficient for x¹¹ term
|
|
608
|
+
nk_f64_t const coeff_9 = +0.0058685277932046705; /// Coefficient for x⁹ term
|
|
609
|
+
nk_f64_t const coeff_7 = +0.021869488294859542; /// Coefficient for x⁷ term
|
|
610
|
+
nk_f64_t const coeff_5 = +0.053968253972902704; /// Coefficient for x⁵ term
|
|
611
|
+
nk_f64_t const coeff_3 = +0.13333333333320124; /// Coefficient for x³ term
|
|
612
|
+
nk_f64_t const coeff_1 = +0.33333333333333331; /// Coefficient for x term
|
|
613
|
+
|
|
614
|
+
// Compute (multiple_of_pi) = round(angle / π)
|
|
615
|
+
nk_f64_t const quotient = angle_radians * pi_reciprocal;
|
|
616
|
+
int const multiple_of_pi = (int)(quotient < 0 ? quotient - 0.5 : quotient + 0.5);
|
|
617
|
+
|
|
618
|
+
// Reduce the angle using high/low precision split
|
|
619
|
+
nk_f64_t angle = angle_radians;
|
|
620
|
+
angle = angle - (multiple_of_pi * pi_high);
|
|
621
|
+
angle = angle - (multiple_of_pi * pi_low);
|
|
622
|
+
|
|
623
|
+
// If |angle| > π/4, use tan(x) = 1/tan(π/2 - x) for better accuracy
|
|
624
|
+
int reciprocal = 0;
|
|
625
|
+
if (angle > pi_quarter) {
|
|
626
|
+
angle = pi_half - angle;
|
|
627
|
+
reciprocal = 1;
|
|
628
|
+
}
|
|
629
|
+
else if (angle < -pi_quarter) {
|
|
630
|
+
angle = -pi_half - angle;
|
|
631
|
+
reciprocal = 1;
|
|
632
|
+
}
|
|
633
|
+
|
|
634
|
+
// Compute powers of angle
|
|
635
|
+
nk_f64_t const angle_squared = angle * angle;
|
|
636
|
+
nk_f64_t const angle_cubed = angle * angle_squared;
|
|
637
|
+
|
|
638
|
+
// Compute the polynomial approximation: tan(x) ≈ x × (1 + c1 × x² + c3 × x⁴ + ...)
|
|
639
|
+
nk_f64_t polynomial = coeff_13;
|
|
640
|
+
polynomial = polynomial * angle_squared + coeff_11;
|
|
641
|
+
polynomial = polynomial * angle_squared + coeff_9;
|
|
642
|
+
polynomial = polynomial * angle_squared + coeff_7;
|
|
643
|
+
polynomial = polynomial * angle_squared + coeff_5;
|
|
644
|
+
polynomial = polynomial * angle_squared + coeff_3;
|
|
645
|
+
polynomial = polynomial * angle_squared + coeff_1;
|
|
646
|
+
nk_f64_t result = polynomial * angle_cubed + angle;
|
|
647
|
+
|
|
648
|
+
// Apply reciprocal if we reduced from outer region
|
|
649
|
+
if (reciprocal) result = 1.0 / result;
|
|
650
|
+
return result;
|
|
651
|
+
}
|
|
652
|
+
|
|
653
|
+
/** @brief Writes `nk_f32_sin(ins[i])` into `outs[i]` for each of the `n` elements, in index order. */
NK_PUBLIC void nk_each_sin_f32_serial(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
    nk_size_t index = 0;
    while (index != n) {
        outs[index] = nk_f32_sin(ins[index]);
        ++index;
    }
}
|
|
656
|
+
/** @brief Writes `nk_f32_cos(ins[i])` into `outs[i]` for each of the `n` elements, in index order. */
NK_PUBLIC void nk_each_cos_f32_serial(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
    nk_size_t index = 0;
    while (index != n) {
        outs[index] = nk_f32_cos(ins[index]);
        ++index;
    }
}
|
|
659
|
+
/** @brief Writes `nk_f32_atan(ins[i])` into `outs[i]` for each of the `n` elements, in index order. */
NK_PUBLIC void nk_each_atan_f32_serial(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
    nk_size_t index = 0;
    while (index != n) {
        outs[index] = nk_f32_atan(ins[index]);
        ++index;
    }
}
|
|
662
|
+
/** @brief Writes `nk_f64_sin(ins[i])` into `outs[i]` for each of the `n` elements, in index order. */
NK_PUBLIC void nk_each_sin_f64_serial(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
    nk_size_t index = 0;
    while (index != n) {
        outs[index] = nk_f64_sin(ins[index]);
        ++index;
    }
}
|
|
665
|
+
/** @brief Writes `nk_f64_cos(ins[i])` into `outs[i]` for each of the `n` elements, in index order. */
NK_PUBLIC void nk_each_cos_f64_serial(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
    nk_size_t index = 0;
    while (index != n) {
        outs[index] = nk_f64_cos(ins[index]);
        ++index;
    }
}
|
|
668
|
+
/** @brief Writes `nk_f64_atan(ins[i])` into `outs[i]` for each of the `n` elements, in index order. */
NK_PUBLIC void nk_each_atan_f64_serial(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
    nk_size_t index = 0;
    while (index != n) {
        outs[index] = nk_f64_atan(ins[index]);
        ++index;
    }
}
|
|
671
|
+
|
|
672
|
+
/**
 *  @brief Element-wise sine over `n` half-precision values.
 *
 *  Each input is converted to `nk_f32_t` with `nk_f16_to_f32_serial`, passed through `nk_f32_sin`,
 *  and the result is converted back with `nk_f32_to_f16_serial` into the matching output slot.
 */
NK_PUBLIC void nk_each_sin_f16_serial(nk_f16_t const *ins, nk_size_t n, nk_f16_t *outs) {
    nk_size_t index = 0;
    while (index != n) {
        nk_f32_t widened;
        nk_f16_to_f32_serial(ins + index, &widened);
        nk_f32_t const sine = nk_f32_sin(widened);
        nk_f32_to_f16_serial(&sine, outs + index);
        ++index;
    }
}
|
|
680
|
+
|
|
681
|
+
/**
 *  @brief Element-wise cosine over `n` half-precision values.
 *
 *  Each input is converted to `nk_f32_t` with `nk_f16_to_f32_serial`, passed through `nk_f32_cos`,
 *  and the result is converted back with `nk_f32_to_f16_serial` into the matching output slot.
 */
NK_PUBLIC void nk_each_cos_f16_serial(nk_f16_t const *ins, nk_size_t n, nk_f16_t *outs) {
    nk_size_t index = 0;
    while (index != n) {
        nk_f32_t widened;
        nk_f16_to_f32_serial(ins + index, &widened);
        nk_f32_t const cosine = nk_f32_cos(widened);
        nk_f32_to_f16_serial(&cosine, outs + index);
        ++index;
    }
}
|
|
689
|
+
|
|
690
|
+
/**
 *  @brief Element-wise arc-tangent over `n` half-precision values.
 *
 *  Each input is converted to `nk_f32_t` with `nk_f16_to_f32_serial`, passed through `nk_f32_atan`,
 *  and the result is converted back with `nk_f32_to_f16_serial` into the matching output slot.
 */
NK_PUBLIC void nk_each_atan_f16_serial(nk_f16_t const *ins, nk_size_t n, nk_f16_t *outs) {
    nk_size_t index = 0;
    while (index != n) {
        nk_f32_t widened;
        nk_f16_to_f32_serial(ins + index, &widened);
        nk_f32_t const arc_tangent = nk_f32_atan(widened);
        nk_f32_to_f16_serial(&arc_tangent, outs + index);
        ++index;
    }
}
|
|
698
|
+
|
|
699
|
+
#if defined(__cplusplus)
|
|
700
|
+
} // extern "C"
|
|
701
|
+
#endif
|
|
702
|
+
|
|
703
|
+
#endif // NK_TRIGONOMETRY_SERIAL_H
|