numkong 7.0.0 → 7.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +239 -122
- package/binding.gyp +25 -491
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
|
@@ -59,18 +59,18 @@ extern "C" {
|
|
|
59
59
|
* Internal helpers return vector register groups for use by geospatial/rvv.h.
|
|
60
60
|
*/
|
|
61
61
|
|
|
62
|
-
NK_INTERNAL vfloat32m4_t nk_f32m4_sin_rvv_(vfloat32m4_t
|
|
62
|
+
NK_INTERNAL vfloat32m4_t nk_f32m4_sin_rvv_(vfloat32m4_t angles_f32m4, nk_size_t vl) {
|
|
63
63
|
nk_f32_t const pi = 3.14159265358979323846f;
|
|
64
64
|
nk_f32_t const pi_recip = 0.31830988618379067154f;
|
|
65
65
|
|
|
66
66
|
// Range reduce: round(angle / pi)
|
|
67
|
-
vfloat32m4_t quotients_f32m4 = __riscv_vfmul_vf_f32m4(
|
|
67
|
+
vfloat32m4_t quotients_f32m4 = __riscv_vfmul_vf_f32m4(angles_f32m4, pi_recip, vl);
|
|
68
68
|
// vfcvt_x_f rounds to nearest integer by default (RNE)
|
|
69
69
|
vint32m4_t rounded_i32m4 = __riscv_vfcvt_x_f_v_i32m4(quotients_f32m4, vl);
|
|
70
70
|
vfloat32m4_t rounded_f32m4 = __riscv_vfcvt_f_x_v_f32m4(rounded_i32m4, vl);
|
|
71
71
|
|
|
72
72
|
// reduced = angle - rounded * pi
|
|
73
|
-
vfloat32m4_t reduced_f32m4 = __riscv_vfnmsac_vf_f32m4(
|
|
73
|
+
vfloat32m4_t reduced_f32m4 = __riscv_vfnmsac_vf_f32m4(angles_f32m4, pi, rounded_f32m4, vl);
|
|
74
74
|
|
|
75
75
|
// Polynomial: sin(x) ~ x + x^3 * (c1 + x^2 * (c3 + x^2 * c5))
|
|
76
76
|
vfloat32m4_t squared_f32m4 = __riscv_vfmul_vv_f32m4(reduced_f32m4, reduced_f32m4, vl);
|
|
@@ -88,19 +88,19 @@ NK_INTERNAL vfloat32m4_t nk_f32m4_sin_rvv_(vfloat32m4_t angles, nk_size_t vl) {
|
|
|
88
88
|
return __riscv_vreinterpret_v_u32m4_f32m4(result_bits_u32m4);
|
|
89
89
|
}
|
|
90
90
|
|
|
91
|
-
NK_INTERNAL vfloat32m4_t nk_f32m4_cos_rvv_(vfloat32m4_t
|
|
91
|
+
NK_INTERNAL vfloat32m4_t nk_f32m4_cos_rvv_(vfloat32m4_t angles_f32m4, nk_size_t vl) {
|
|
92
92
|
nk_f32_t const pi = 3.14159265358979323846f;
|
|
93
93
|
nk_f32_t const pi_half = 1.57079632679489661923f;
|
|
94
94
|
nk_f32_t const pi_recip = 0.31830988618379067154f;
|
|
95
95
|
|
|
96
96
|
// Compute round((angle / pi) - 0.5)
|
|
97
|
-
vfloat32m4_t quotients_f32m4 = __riscv_vfsub_vf_f32m4(__riscv_vfmul_vf_f32m4(
|
|
97
|
+
vfloat32m4_t quotients_f32m4 = __riscv_vfsub_vf_f32m4(__riscv_vfmul_vf_f32m4(angles_f32m4, pi_recip, vl), 0.5f, vl);
|
|
98
98
|
vint32m4_t rounded_i32m4 = __riscv_vfcvt_x_f_v_i32m4(quotients_f32m4, vl);
|
|
99
99
|
vfloat32m4_t rounded_f32m4 = __riscv_vfcvt_f_x_v_f32m4(rounded_i32m4, vl);
|
|
100
100
|
|
|
101
101
|
// Reduce: angle - (rounded * pi + pi/2)
|
|
102
102
|
vfloat32m4_t offset_f32m4 = __riscv_vfmacc_vf_f32m4(__riscv_vfmv_v_f_f32m4(pi_half, vl), pi, rounded_f32m4, vl);
|
|
103
|
-
vfloat32m4_t reduced_f32m4 = __riscv_vfsub_vv_f32m4(
|
|
103
|
+
vfloat32m4_t reduced_f32m4 = __riscv_vfsub_vv_f32m4(angles_f32m4, offset_f32m4, vl);
|
|
104
104
|
|
|
105
105
|
// Polynomial: same 3-term approximation
|
|
106
106
|
vfloat32m4_t squared_f32m4 = __riscv_vfmul_vv_f32m4(reduced_f32m4, reduced_f32m4, vl);
|
|
@@ -118,7 +118,7 @@ NK_INTERNAL vfloat32m4_t nk_f32m4_cos_rvv_(vfloat32m4_t angles, nk_size_t vl) {
|
|
|
118
118
|
return result_f32m4;
|
|
119
119
|
}
|
|
120
120
|
|
|
121
|
-
NK_INTERNAL vfloat32m4_t nk_f32m4_atan_rvv_(vfloat32m4_t
|
|
121
|
+
NK_INTERNAL vfloat32m4_t nk_f32m4_atan_rvv_(vfloat32m4_t inputs_f32m4, nk_size_t vl) {
|
|
122
122
|
// 8-term polynomial coefficients for atan approximation
|
|
123
123
|
nk_f32_t const c8 = -0.333331018686294555664062f;
|
|
124
124
|
nk_f32_t const c7 = +0.199926957488059997558594f;
|
|
@@ -130,8 +130,8 @@ NK_INTERNAL vfloat32m4_t nk_f32m4_atan_rvv_(vfloat32m4_t inputs, nk_size_t vl) {
|
|
|
130
130
|
nk_f32_t const c1 = +0.00282363896258175373077393f;
|
|
131
131
|
|
|
132
132
|
// Detect negative values
|
|
133
|
-
vbool8_t negative_mask_b8 = __riscv_vmflt_vf_f32m4_b8(
|
|
134
|
-
vfloat32m4_t values_f32m4 = __riscv_vfabs_v_f32m4(
|
|
133
|
+
vbool8_t negative_mask_b8 = __riscv_vmflt_vf_f32m4_b8(inputs_f32m4, 0.0f, vl);
|
|
134
|
+
vfloat32m4_t values_f32m4 = __riscv_vfabs_v_f32m4(inputs_f32m4, vl);
|
|
135
135
|
|
|
136
136
|
// Check if values > 1 (need reciprocal)
|
|
137
137
|
vbool8_t reciprocal_mask_b8 = __riscv_vmfgt_vf_f32m4_b8(values_f32m4, 1.0f, vl);
|
|
@@ -163,7 +163,7 @@ NK_INTERNAL vfloat32m4_t nk_f32m4_atan_rvv_(vfloat32m4_t inputs, nk_size_t vl) {
|
|
|
163
163
|
return result_f32m4;
|
|
164
164
|
}
|
|
165
165
|
|
|
166
|
-
NK_INTERNAL vfloat32m4_t nk_f32m4_atan2_rvv_(vfloat32m4_t
|
|
166
|
+
NK_INTERNAL vfloat32m4_t nk_f32m4_atan2_rvv_(vfloat32m4_t ys_inputs_f32m4, vfloat32m4_t xs_inputs_f32m4, nk_size_t vl) {
|
|
167
167
|
// 8-term polynomial coefficients (same as atan)
|
|
168
168
|
nk_f32_t const c8 = -0.333331018686294555664062f;
|
|
169
169
|
nk_f32_t const c7 = +0.199926957488059997558594f;
|
|
@@ -175,9 +175,9 @@ NK_INTERNAL vfloat32m4_t nk_f32m4_atan2_rvv_(vfloat32m4_t ys_inputs, vfloat32m4_
|
|
|
175
175
|
nk_f32_t const c1 = +0.00282363896258175373077393f;
|
|
176
176
|
|
|
177
177
|
// Quadrant adjustments - take absolute values
|
|
178
|
-
vbool8_t xs_negative_mask_b8 = __riscv_vmflt_vf_f32m4_b8(
|
|
179
|
-
vfloat32m4_t xs_f32m4 = __riscv_vfabs_v_f32m4(
|
|
180
|
-
vfloat32m4_t ys_f32m4 = __riscv_vfabs_v_f32m4(
|
|
178
|
+
vbool8_t xs_negative_mask_b8 = __riscv_vmflt_vf_f32m4_b8(xs_inputs_f32m4, 0.0f, vl);
|
|
179
|
+
vfloat32m4_t xs_f32m4 = __riscv_vfabs_v_f32m4(xs_inputs_f32m4, vl);
|
|
180
|
+
vfloat32m4_t ys_f32m4 = __riscv_vfabs_v_f32m4(ys_inputs_f32m4, vl);
|
|
181
181
|
|
|
182
182
|
// Ensure proper fraction where numerator < denominator
|
|
183
183
|
vbool8_t swap_mask_b8 = __riscv_vmfgt_vv_f32m4_b8(ys_f32m4, xs_f32m4, vl);
|
|
@@ -214,22 +214,22 @@ NK_INTERNAL vfloat32m4_t nk_f32m4_atan2_rvv_(vfloat32m4_t ys_inputs, vfloat32m4_
|
|
|
214
214
|
// Adjust for quadrant: result += quadrant * pi/2
|
|
215
215
|
results_f32m4 = __riscv_vfmacc_vf_f32m4(results_f32m4, 1.5707963267948966f, quadrant_f32m4, vl);
|
|
216
216
|
|
|
217
|
-
// Transfer sign from x (XOR with sign bit of
|
|
217
|
+
// Transfer sign from x (XOR with sign bit of xs_inputs_f32m4)
|
|
218
218
|
vuint32m4_t sign_mask_u32m4 = __riscv_vreinterpret_v_f32m4_u32m4(__riscv_vfmv_v_f_f32m4(-0.0f, vl));
|
|
219
|
-
vuint32m4_t xs_sign_bits_u32m4 = __riscv_vand_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(
|
|
219
|
+
vuint32m4_t xs_sign_bits_u32m4 = __riscv_vand_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(xs_inputs_f32m4),
|
|
220
220
|
sign_mask_u32m4, vl);
|
|
221
221
|
vuint32m4_t result_bits_u32m4 = __riscv_vxor_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(results_f32m4),
|
|
222
222
|
xs_sign_bits_u32m4, vl);
|
|
223
223
|
|
|
224
|
-
// Transfer sign from y (XOR with sign bit of
|
|
225
|
-
vuint32m4_t ys_sign_bits_u32m4 = __riscv_vand_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(
|
|
224
|
+
// Transfer sign from y (XOR with sign bit of ys_inputs_f32m4)
|
|
225
|
+
vuint32m4_t ys_sign_bits_u32m4 = __riscv_vand_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(ys_inputs_f32m4),
|
|
226
226
|
sign_mask_u32m4, vl);
|
|
227
227
|
result_bits_u32m4 = __riscv_vxor_vv_u32m4(result_bits_u32m4, ys_sign_bits_u32m4, vl);
|
|
228
228
|
|
|
229
229
|
return __riscv_vreinterpret_v_u32m4_f32m4(result_bits_u32m4);
|
|
230
230
|
}
|
|
231
231
|
|
|
232
|
-
NK_INTERNAL vfloat64m4_t nk_f64m4_sin_rvv_(vfloat64m4_t
|
|
232
|
+
NK_INTERNAL vfloat64m4_t nk_f64m4_sin_rvv_(vfloat64m4_t angles_radians_f64m4, nk_size_t vl) {
|
|
233
233
|
// Constants for two-step Cody-Waite range reduction
|
|
234
234
|
nk_f64_t const pi_high = 3.141592653589793116;
|
|
235
235
|
nk_f64_t const pi_low = 1.2246467991473532072e-16;
|
|
@@ -247,13 +247,13 @@ NK_INTERNAL vfloat64m4_t nk_f64m4_sin_rvv_(vfloat64m4_t angles_radians, nk_size_
|
|
|
247
247
|
nk_f64_t const c8 = -0.166666666666666657414808;
|
|
248
248
|
|
|
249
249
|
// Compute round(angle / pi)
|
|
250
|
-
vfloat64m4_t quotients_f64m4 = __riscv_vfmul_vf_f64m4(
|
|
250
|
+
vfloat64m4_t quotients_f64m4 = __riscv_vfmul_vf_f64m4(angles_radians_f64m4, pi_recip, vl);
|
|
251
251
|
// Round to nearest: vfcvt_x_f rounds to nearest (RNE), then convert back
|
|
252
252
|
vint64m4_t rounded_i64m4 = __riscv_vfcvt_x_f_v_i64m4(quotients_f64m4, vl);
|
|
253
253
|
vfloat64m4_t rounded_f64m4 = __riscv_vfcvt_f_x_v_f64m4(rounded_i64m4, vl);
|
|
254
254
|
|
|
255
255
|
// Two-step Cody-Waite reduction: angle - rounded * pi_high - rounded * pi_low
|
|
256
|
-
vfloat64m4_t angles_f64m4 = __riscv_vfnmsac_vf_f64m4(
|
|
256
|
+
vfloat64m4_t angles_f64m4 = __riscv_vfnmsac_vf_f64m4(angles_radians_f64m4, pi_high, rounded_f64m4, vl);
|
|
257
257
|
angles_f64m4 = __riscv_vfnmsac_vf_f64m4(angles_f64m4, pi_low, rounded_f64m4, vl);
|
|
258
258
|
|
|
259
259
|
// If rounded is odd, negate the angle
|
|
@@ -289,13 +289,13 @@ NK_INTERNAL vfloat64m4_t nk_f64m4_sin_rvv_(vfloat64m4_t angles_radians, nk_size_
|
|
|
289
289
|
results_f64m4 = __riscv_vfmacc_vv_f64m4(angles_f64m4, cubed_f64m4, results_f64m4, vl);
|
|
290
290
|
|
|
291
291
|
// Handle zero input (preserve sign of zero)
|
|
292
|
-
vbool16_t non_zero_mask_b16 = __riscv_vmfne_vf_f64m4_b16(
|
|
292
|
+
vbool16_t non_zero_mask_b16 = __riscv_vmfne_vf_f64m4_b16(angles_radians_f64m4, 0.0, vl);
|
|
293
293
|
vfloat64m4_t zeros_f64m4 = __riscv_vfmv_v_f_f64m4(0.0, vl);
|
|
294
294
|
results_f64m4 = __riscv_vmerge_vvm_f64m4(zeros_f64m4, results_f64m4, non_zero_mask_b16, vl);
|
|
295
295
|
return results_f64m4;
|
|
296
296
|
}
|
|
297
297
|
|
|
298
|
-
NK_INTERNAL vfloat64m4_t nk_f64m4_cos_rvv_(vfloat64m4_t
|
|
298
|
+
NK_INTERNAL vfloat64m4_t nk_f64m4_cos_rvv_(vfloat64m4_t angles_radians_f64m4, nk_size_t vl) {
|
|
299
299
|
// Constants for two-step Cody-Waite range reduction
|
|
300
300
|
nk_f64_t const pi_high_half = 3.141592653589793116 * 0.5;
|
|
301
301
|
nk_f64_t const pi_low_half = 1.2246467991473532072e-16 * 0.5;
|
|
@@ -313,8 +313,8 @@ NK_INTERNAL vfloat64m4_t nk_f64m4_cos_rvv_(vfloat64m4_t angles_radians, nk_size_
|
|
|
313
313
|
nk_f64_t const c8 = -0.166666666666666657414808;
|
|
314
314
|
|
|
315
315
|
// Compute 2 * round(angle / pi - 0.5) + 1
|
|
316
|
-
vfloat64m4_t quotients_f64m4 = __riscv_vfsub_vf_f64m4(__riscv_vfmul_vf_f64m4(
|
|
317
|
-
vl);
|
|
316
|
+
vfloat64m4_t quotients_f64m4 = __riscv_vfsub_vf_f64m4(__riscv_vfmul_vf_f64m4(angles_radians_f64m4, pi_recip, vl),
|
|
317
|
+
0.5, vl);
|
|
318
318
|
vint64m4_t rounded_i64m4 = __riscv_vfcvt_x_f_v_i64m4(quotients_f64m4, vl);
|
|
319
319
|
vfloat64m4_t rounded_f64m4 = __riscv_vfcvt_f_x_v_f64m4(rounded_i64m4, vl);
|
|
320
320
|
// rounded_quotients = 2 * rounded + 1
|
|
@@ -322,7 +322,8 @@ NK_INTERNAL vfloat64m4_t nk_f64m4_cos_rvv_(vfloat64m4_t angles_radians, nk_size_
|
|
|
322
322
|
vl);
|
|
323
323
|
|
|
324
324
|
// Two-step Cody-Waite reduction: angle - rounded_quotients * pi_high_half - rounded_quotients * pi_low_half
|
|
325
|
-
vfloat64m4_t angles_f64m4 = __riscv_vfnmsac_vf_f64m4(
|
|
325
|
+
vfloat64m4_t angles_f64m4 = __riscv_vfnmsac_vf_f64m4(angles_radians_f64m4, pi_high_half, rounded_quotients_f64m4,
|
|
326
|
+
vl);
|
|
326
327
|
angles_f64m4 = __riscv_vfnmsac_vf_f64m4(angles_f64m4, pi_low_half, rounded_quotients_f64m4, vl);
|
|
327
328
|
|
|
328
329
|
// If (rounded_quotients & 2) == 0, negate the angle
|
|
@@ -352,7 +353,7 @@ NK_INTERNAL vfloat64m4_t nk_f64m4_cos_rvv_(vfloat64m4_t angles_radians, nk_size_
|
|
|
352
353
|
return results_f64m4;
|
|
353
354
|
}
|
|
354
355
|
|
|
355
|
-
NK_INTERNAL vfloat64m4_t nk_f64m4_atan_rvv_(vfloat64m4_t
|
|
356
|
+
NK_INTERNAL vfloat64m4_t nk_f64m4_atan_rvv_(vfloat64m4_t inputs_f64m4, nk_size_t vl) {
|
|
356
357
|
// 19-term polynomial coefficients
|
|
357
358
|
nk_f64_t const c19 = -1.88796008463073496563746e-05;
|
|
358
359
|
nk_f64_t const c18 = +0.000209850076645816976906797;
|
|
@@ -375,8 +376,8 @@ NK_INTERNAL vfloat64m4_t nk_f64m4_atan_rvv_(vfloat64m4_t inputs, nk_size_t vl) {
|
|
|
375
376
|
nk_f64_t const c1 = -0.333333333333311110369124;
|
|
376
377
|
|
|
377
378
|
// Detect negative values
|
|
378
|
-
vbool16_t negative_mask_b16 = __riscv_vmflt_vf_f64m4_b16(
|
|
379
|
-
vfloat64m4_t values_f64m4 = __riscv_vfabs_v_f64m4(
|
|
379
|
+
vbool16_t negative_mask_b16 = __riscv_vmflt_vf_f64m4_b16(inputs_f64m4, 0.0, vl);
|
|
380
|
+
vfloat64m4_t values_f64m4 = __riscv_vfabs_v_f64m4(inputs_f64m4, vl);
|
|
380
381
|
|
|
381
382
|
// Check if values > 1 (need reciprocal)
|
|
382
383
|
vbool16_t reciprocal_mask_b16 = __riscv_vmfgt_vf_f64m4_b16(values_f64m4, 1.0, vl);
|
|
@@ -419,7 +420,7 @@ NK_INTERNAL vfloat64m4_t nk_f64m4_atan_rvv_(vfloat64m4_t inputs, nk_size_t vl) {
|
|
|
419
420
|
return result_f64m4;
|
|
420
421
|
}
|
|
421
422
|
|
|
422
|
-
NK_INTERNAL vfloat64m4_t nk_f64m4_atan2_rvv_(vfloat64m4_t
|
|
423
|
+
NK_INTERNAL vfloat64m4_t nk_f64m4_atan2_rvv_(vfloat64m4_t ys_inputs_f64m4, vfloat64m4_t xs_inputs_f64m4, nk_size_t vl) {
|
|
423
424
|
// 19-term polynomial coefficients (same as atan)
|
|
424
425
|
nk_f64_t const c19 = -1.88796008463073496563746e-05;
|
|
425
426
|
nk_f64_t const c18 = +0.000209850076645816976906797;
|
|
@@ -442,9 +443,9 @@ NK_INTERNAL vfloat64m4_t nk_f64m4_atan2_rvv_(vfloat64m4_t ys_inputs, vfloat64m4_
|
|
|
442
443
|
nk_f64_t const c1 = -0.333333333333311110369124;
|
|
443
444
|
|
|
444
445
|
// Quadrant adjustments - take absolute values
|
|
445
|
-
vbool16_t xs_negative_mask_b16 = __riscv_vmflt_vf_f64m4_b16(
|
|
446
|
-
vfloat64m4_t xs_f64m4 = __riscv_vfabs_v_f64m4(
|
|
447
|
-
vfloat64m4_t ys_f64m4 = __riscv_vfabs_v_f64m4(
|
|
446
|
+
vbool16_t xs_negative_mask_b16 = __riscv_vmflt_vf_f64m4_b16(xs_inputs_f64m4, 0.0, vl);
|
|
447
|
+
vfloat64m4_t xs_f64m4 = __riscv_vfabs_v_f64m4(xs_inputs_f64m4, vl);
|
|
448
|
+
vfloat64m4_t ys_f64m4 = __riscv_vfabs_v_f64m4(ys_inputs_f64m4, vl);
|
|
448
449
|
|
|
449
450
|
// Ensure proper fraction where numerator < denominator
|
|
450
451
|
vbool16_t swap_mask_b16 = __riscv_vmfgt_vv_f64m4_b16(ys_f64m4, xs_f64m4, vl);
|
|
@@ -492,15 +493,15 @@ NK_INTERNAL vfloat64m4_t nk_f64m4_atan2_rvv_(vfloat64m4_t ys_inputs, vfloat64m4_
|
|
|
492
493
|
// Adjust for quadrant: result += quadrant * pi/2
|
|
493
494
|
results_f64m4 = __riscv_vfmacc_vf_f64m4(results_f64m4, 1.5707963267948966, quadrant_f64m4, vl);
|
|
494
495
|
|
|
495
|
-
// Transfer sign from x (XOR with sign bit of
|
|
496
|
+
// Transfer sign from x (XOR with sign bit of xs_inputs_f64m4)
|
|
496
497
|
vuint64m4_t sign_mask_u64m4 = __riscv_vreinterpret_v_f64m4_u64m4(__riscv_vfmv_v_f_f64m4(-0.0, vl));
|
|
497
|
-
vuint64m4_t xs_sign_bits_u64m4 = __riscv_vand_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(
|
|
498
|
+
vuint64m4_t xs_sign_bits_u64m4 = __riscv_vand_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(xs_inputs_f64m4),
|
|
498
499
|
sign_mask_u64m4, vl);
|
|
499
500
|
vuint64m4_t result_bits_u64m4 = __riscv_vxor_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(results_f64m4),
|
|
500
501
|
xs_sign_bits_u64m4, vl);
|
|
501
502
|
|
|
502
|
-
// Transfer sign from y (XOR with sign bit of
|
|
503
|
-
vuint64m4_t ys_sign_bits_u64m4 = __riscv_vand_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(
|
|
503
|
+
// Transfer sign from y (XOR with sign bit of ys_inputs_f64m4)
|
|
504
|
+
vuint64m4_t ys_sign_bits_u64m4 = __riscv_vand_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(ys_inputs_f64m4),
|
|
504
505
|
sign_mask_u64m4, vl);
|
|
505
506
|
result_bits_u64m4 = __riscv_vxor_vv_u64m4(result_bits_u64m4, ys_sign_bits_u64m4, vl);
|
|
506
507
|
|
|
@@ -511,15 +512,15 @@ NK_INTERNAL vfloat64m4_t nk_f64m4_atan2_rvv_(vfloat64m4_t ys_inputs, vfloat64m4_
|
|
|
511
512
|
* f16 data is loaded as m1 (16-bit), widened to f32 m2, computed, then narrowed back.
|
|
512
513
|
*/
|
|
513
514
|
|
|
514
|
-
NK_INTERNAL vfloat32m2_t nk_f32m2_sin_rvv_(vfloat32m2_t
|
|
515
|
+
NK_INTERNAL vfloat32m2_t nk_f32m2_sin_rvv_(vfloat32m2_t angles_f32m2, nk_size_t vl) {
|
|
515
516
|
nk_f32_t const pi = 3.14159265358979323846f;
|
|
516
517
|
nk_f32_t const pi_recip = 0.31830988618379067154f;
|
|
517
518
|
|
|
518
|
-
vfloat32m2_t quotients_f32m2 = __riscv_vfmul_vf_f32m2(
|
|
519
|
+
vfloat32m2_t quotients_f32m2 = __riscv_vfmul_vf_f32m2(angles_f32m2, pi_recip, vl);
|
|
519
520
|
vint32m2_t rounded_i32m2 = __riscv_vfcvt_x_f_v_i32m2(quotients_f32m2, vl);
|
|
520
521
|
vfloat32m2_t rounded_f32m2 = __riscv_vfcvt_f_x_v_f32m2(rounded_i32m2, vl);
|
|
521
522
|
|
|
522
|
-
vfloat32m2_t reduced_f32m2 = __riscv_vfnmsac_vf_f32m2(
|
|
523
|
+
vfloat32m2_t reduced_f32m2 = __riscv_vfnmsac_vf_f32m2(angles_f32m2, pi, rounded_f32m2, vl);
|
|
523
524
|
vfloat32m2_t squared_f32m2 = __riscv_vfmul_vv_f32m2(reduced_f32m2, reduced_f32m2, vl);
|
|
524
525
|
vfloat32m2_t cubed_f32m2 = __riscv_vfmul_vv_f32m2(reduced_f32m2, squared_f32m2, vl);
|
|
525
526
|
|
|
@@ -534,17 +535,17 @@ NK_INTERNAL vfloat32m2_t nk_f32m2_sin_rvv_(vfloat32m2_t angles, nk_size_t vl) {
|
|
|
534
535
|
return __riscv_vreinterpret_v_u32m2_f32m2(result_bits_u32m2);
|
|
535
536
|
}
|
|
536
537
|
|
|
537
|
-
NK_INTERNAL vfloat32m2_t nk_f32m2_cos_rvv_(vfloat32m2_t
|
|
538
|
+
NK_INTERNAL vfloat32m2_t nk_f32m2_cos_rvv_(vfloat32m2_t angles_f32m2, nk_size_t vl) {
|
|
538
539
|
nk_f32_t const pi = 3.14159265358979323846f;
|
|
539
540
|
nk_f32_t const pi_half = 1.57079632679489661923f;
|
|
540
541
|
nk_f32_t const pi_recip = 0.31830988618379067154f;
|
|
541
542
|
|
|
542
|
-
vfloat32m2_t quotients_f32m2 = __riscv_vfsub_vf_f32m2(__riscv_vfmul_vf_f32m2(
|
|
543
|
+
vfloat32m2_t quotients_f32m2 = __riscv_vfsub_vf_f32m2(__riscv_vfmul_vf_f32m2(angles_f32m2, pi_recip, vl), 0.5f, vl);
|
|
543
544
|
vint32m2_t rounded_i32m2 = __riscv_vfcvt_x_f_v_i32m2(quotients_f32m2, vl);
|
|
544
545
|
vfloat32m2_t rounded_f32m2 = __riscv_vfcvt_f_x_v_f32m2(rounded_i32m2, vl);
|
|
545
546
|
|
|
546
547
|
vfloat32m2_t offset_f32m2 = __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(pi_half, vl), pi, rounded_f32m2, vl);
|
|
547
|
-
vfloat32m2_t reduced_f32m2 = __riscv_vfsub_vv_f32m2(
|
|
548
|
+
vfloat32m2_t reduced_f32m2 = __riscv_vfsub_vv_f32m2(angles_f32m2, offset_f32m2, vl);
|
|
548
549
|
|
|
549
550
|
vfloat32m2_t squared_f32m2 = __riscv_vfmul_vv_f32m2(reduced_f32m2, reduced_f32m2, vl);
|
|
550
551
|
vfloat32m2_t cubed_f32m2 = __riscv_vfmul_vv_f32m2(reduced_f32m2, squared_f32m2, vl);
|
|
@@ -560,7 +561,7 @@ NK_INTERNAL vfloat32m2_t nk_f32m2_cos_rvv_(vfloat32m2_t angles, nk_size_t vl) {
|
|
|
560
561
|
return result_f32m2;
|
|
561
562
|
}
|
|
562
563
|
|
|
563
|
-
NK_INTERNAL vfloat32m2_t nk_f32m2_atan_rvv_(vfloat32m2_t
|
|
564
|
+
NK_INTERNAL vfloat32m2_t nk_f32m2_atan_rvv_(vfloat32m2_t inputs_f32m2, nk_size_t vl) {
|
|
564
565
|
nk_f32_t const c8 = -0.333331018686294555664062f;
|
|
565
566
|
nk_f32_t const c7 = +0.199926957488059997558594f;
|
|
566
567
|
nk_f32_t const c6 = -0.142027363181114196777344f;
|
|
@@ -570,8 +571,8 @@ NK_INTERNAL vfloat32m2_t nk_f32m2_atan_rvv_(vfloat32m2_t inputs, nk_size_t vl) {
|
|
|
570
571
|
nk_f32_t const c2 = -0.0159569028764963150024414f;
|
|
571
572
|
nk_f32_t const c1 = +0.00282363896258175373077393f;
|
|
572
573
|
|
|
573
|
-
vbool16_t negative_mask_b16 = __riscv_vmflt_vf_f32m2_b16(
|
|
574
|
-
vfloat32m2_t values_f32m2 = __riscv_vfabs_v_f32m2(
|
|
574
|
+
vbool16_t negative_mask_b16 = __riscv_vmflt_vf_f32m2_b16(inputs_f32m2, 0.0f, vl);
|
|
575
|
+
vfloat32m2_t values_f32m2 = __riscv_vfabs_v_f32m2(inputs_f32m2, vl);
|
|
575
576
|
|
|
576
577
|
vbool16_t reciprocal_mask_b16 = __riscv_vmfgt_vf_f32m2_b16(values_f32m2, 1.0f, vl);
|
|
577
578
|
vfloat32m2_t reciprocal_values_f32m2 = nk_f32m2_reciprocal_rvv_(values_f32m2, vl);
|
|
@@ -657,8 +658,8 @@ NK_PUBLIC void nk_each_sin_f16_rvv(nk_f16_t const *ins, nk_size_t n, nk_f16_t *o
|
|
|
657
658
|
vuint16m1_t f16_u16m1 = __riscv_vle16_v_u16m1((nk_u16_t const *)ins, vector_length);
|
|
658
659
|
vfloat32m2_t values_f32m2 = nk_f16m1_to_f32m2_rvv_(f16_u16m1, vector_length);
|
|
659
660
|
vfloat32m2_t results_f32m2 = nk_f32m2_sin_rvv_(values_f32m2, vector_length);
|
|
660
|
-
vuint16m1_t
|
|
661
|
-
__riscv_vse16_v_u16m1((nk_u16_t *)outs,
|
|
661
|
+
vuint16m1_t f16_results_u16m1 = nk_f32m2_to_f16m1_rvv_(results_f32m2, vector_length);
|
|
662
|
+
__riscv_vse16_v_u16m1((nk_u16_t *)outs, f16_results_u16m1, vector_length);
|
|
662
663
|
}
|
|
663
664
|
}
|
|
664
665
|
|
|
@@ -668,8 +669,8 @@ NK_PUBLIC void nk_each_cos_f16_rvv(nk_f16_t const *ins, nk_size_t n, nk_f16_t *o
|
|
|
668
669
|
vuint16m1_t f16_u16m1 = __riscv_vle16_v_u16m1((nk_u16_t const *)ins, vector_length);
|
|
669
670
|
vfloat32m2_t values_f32m2 = nk_f16m1_to_f32m2_rvv_(f16_u16m1, vector_length);
|
|
670
671
|
vfloat32m2_t results_f32m2 = nk_f32m2_cos_rvv_(values_f32m2, vector_length);
|
|
671
|
-
vuint16m1_t
|
|
672
|
-
__riscv_vse16_v_u16m1((nk_u16_t *)outs,
|
|
672
|
+
vuint16m1_t f16_results_u16m1 = nk_f32m2_to_f16m1_rvv_(results_f32m2, vector_length);
|
|
673
|
+
__riscv_vse16_v_u16m1((nk_u16_t *)outs, f16_results_u16m1, vector_length);
|
|
673
674
|
}
|
|
674
675
|
}
|
|
675
676
|
|
|
@@ -679,8 +680,8 @@ NK_PUBLIC void nk_each_atan_f16_rvv(nk_f16_t const *ins, nk_size_t n, nk_f16_t *
|
|
|
679
680
|
vuint16m1_t f16_u16m1 = __riscv_vle16_v_u16m1((nk_u16_t const *)ins, vector_length);
|
|
680
681
|
vfloat32m2_t values_f32m2 = nk_f16m1_to_f32m2_rvv_(f16_u16m1, vector_length);
|
|
681
682
|
vfloat32m2_t results_f32m2 = nk_f32m2_atan_rvv_(values_f32m2, vector_length);
|
|
682
|
-
vuint16m1_t
|
|
683
|
-
__riscv_vse16_v_u16m1((nk_u16_t *)outs,
|
|
683
|
+
vuint16m1_t f16_results_u16m1 = nk_f32m2_to_f16m1_rvv_(results_f32m2, vector_length);
|
|
684
|
+
__riscv_vse16_v_u16m1((nk_u16_t *)outs, f16_results_u16m1, vector_length);
|
|
684
685
|
}
|
|
685
686
|
}
|
|
686
687
|
|
|
@@ -27,8 +27,8 @@ extern "C" {
|
|
|
27
27
|
NK_PUBLIC nk_f32_t nk_f32_sin(nk_f32_t const angle_radians) {
|
|
28
28
|
|
|
29
29
|
// Cody-Waite constants for argument reduction (pi split into hi + lo)
|
|
30
|
-
nk_f32_t const
|
|
31
|
-
nk_f32_t const
|
|
30
|
+
nk_f32_t const pi_high = 3.1415927f;
|
|
31
|
+
nk_f32_t const pi_low = -8.742278e-8f;
|
|
32
32
|
nk_f32_t const pi_reciprocal = 0.31830988618379067154f; /// 1/π
|
|
33
33
|
|
|
34
34
|
// Degree-9 minimax coefficients: sin(x) ≈ x + c3*x³ + c5*x⁵ + c7*x⁷ + c9*x⁹
|
|
@@ -41,9 +41,9 @@ NK_PUBLIC nk_f32_t nk_f32_sin(nk_f32_t const angle_radians) {
|
|
|
41
41
|
nk_f32_t const quotient = angle_radians * pi_reciprocal;
|
|
42
42
|
int const multiple_of_pi = (int)(quotient < 0 ? quotient - 0.5f : quotient + 0.5f);
|
|
43
43
|
|
|
44
|
-
// Cody-Waite range reduction: angle = angle_radians - multiple * (
|
|
45
|
-
nk_f32_t angle = angle_radians - multiple_of_pi *
|
|
46
|
-
angle -= multiple_of_pi *
|
|
44
|
+
// Cody-Waite range reduction: angle = angle_radians - multiple * (pi_high + pi_low)
|
|
45
|
+
nk_f32_t angle = angle_radians - (nk_f32_t)multiple_of_pi * pi_high;
|
|
46
|
+
angle -= (nk_f32_t)multiple_of_pi * pi_low;
|
|
47
47
|
nk_f32_t const angle_squared = angle * angle;
|
|
48
48
|
nk_f32_t const angle_cubed = angle * angle_squared;
|
|
49
49
|
|
|
@@ -68,8 +68,8 @@ NK_PUBLIC nk_f32_t nk_f32_sin(nk_f32_t const angle_radians) {
|
|
|
68
68
|
NK_PUBLIC nk_f32_t nk_f32_cos(nk_f32_t const angle_radians) {
|
|
69
69
|
|
|
70
70
|
// Cody-Waite constants for argument reduction (pi split into hi + lo)
|
|
71
|
-
nk_f32_t const
|
|
72
|
-
nk_f32_t const
|
|
71
|
+
nk_f32_t const pi_high = 3.1415927f;
|
|
72
|
+
nk_f32_t const pi_low = -8.742278e-8f;
|
|
73
73
|
nk_f32_t const pi_half = 1.57079632679489661923f; /// π/2
|
|
74
74
|
nk_f32_t const pi_reciprocal = 0.31830988618379067154f; /// 1/π
|
|
75
75
|
|
|
@@ -84,9 +84,9 @@ NK_PUBLIC nk_f32_t nk_f32_cos(nk_f32_t const angle_radians) {
|
|
|
84
84
|
int const multiple_of_pi = (int)(quotient < 0 ? quotient - 0.5f : quotient + 0.5f);
|
|
85
85
|
|
|
86
86
|
// Cody-Waite range reduction: angle = angle_radians - (multiple * pi + pi/2)
|
|
87
|
-
nk_f32_t const offset = pi_half + multiple_of_pi *
|
|
87
|
+
nk_f32_t const offset = pi_half + (nk_f32_t)multiple_of_pi * pi_high;
|
|
88
88
|
nk_f32_t angle = angle_radians - offset;
|
|
89
|
-
angle -= multiple_of_pi *
|
|
89
|
+
angle -= (nk_f32_t)multiple_of_pi * pi_low;
|
|
90
90
|
nk_f32_t const angle_squared = angle * angle;
|
|
91
91
|
nk_f32_t const angle_cubed = angle * angle_squared;
|
|
92
92
|
|
|
@@ -544,8 +544,8 @@ NK_PUBLIC nk_f64_t nk_f64_atan2(nk_f64_t const y_input, nk_f64_t const x_input)
|
|
|
544
544
|
NK_PUBLIC nk_f32_t nk_f32_tan(nk_f32_t const angle_radians) {
|
|
545
545
|
|
|
546
546
|
// Cody-Waite constants for argument reduction
|
|
547
|
-
nk_f32_t const
|
|
548
|
-
nk_f32_t const
|
|
547
|
+
nk_f32_t const pi_high = 3.1415927f;
|
|
548
|
+
nk_f32_t const pi_low = -8.742278e-8f;
|
|
549
549
|
nk_f32_t const pi_half = 1.57079632679489661923f; /// π/2
|
|
550
550
|
nk_f32_t const pi_quarter = 0.78539816339744830962f; /// π/4
|
|
551
551
|
nk_f32_t const pi_reciprocal = 0.31830988618379067154f; /// 1/π
|
|
@@ -560,8 +560,8 @@ NK_PUBLIC nk_f32_t nk_f32_tan(nk_f32_t const angle_radians) {
|
|
|
560
560
|
int const multiple_of_pi = (int)(quotient < 0 ? quotient - 0.5f : quotient + 0.5f);
|
|
561
561
|
|
|
562
562
|
// Cody-Waite range reduction
|
|
563
|
-
nk_f32_t angle = angle_radians - multiple_of_pi *
|
|
564
|
-
angle -= multiple_of_pi *
|
|
563
|
+
nk_f32_t angle = angle_radians - (nk_f32_t)multiple_of_pi * pi_high;
|
|
564
|
+
angle -= (nk_f32_t)multiple_of_pi * pi_low;
|
|
565
565
|
|
|
566
566
|
// If |angle| > π/4, use tan(x) = 1/tan(π/2 - x) for better accuracy
|
|
567
567
|
int reciprocal = 0;
|