numkong 7.0.0 → 7.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +239 -122
- package/binding.gyp +25 -491
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
|
@@ -52,499 +52,500 @@ extern "C" {
|
|
|
52
52
|
|
|
53
53
|
NK_INTERNAL v128_t nk_f32x4_sin_v128relaxed_(v128_t const angles_radians) {
    // Approximates sin(x) lane-wise for 4 f32 lanes using Payne-style argument
    // reduction to [-pi/2, pi/2] followed by an odd polynomial (degree 7 in x,
    // i.e. x + x^3 * P(x^2)). Accuracy depends on the 3-term coefficients below;
    // intended for small/moderate |x| — TODO confirm the supported input range.
    // Constants for argument reduction
    v128_t const pi_f32x4 = wasm_f32x4_splat(3.14159265358979323846f);
    v128_t const pi_reciprocal_f32x4 = wasm_f32x4_splat(0.31830988618379067154f);
    // Minimax polynomial coefficients for sin on the reduced interval.
    v128_t const coeff_5_f32x4 = wasm_f32x4_splat(-0.0001881748176f);
    v128_t const coeff_3_f32x4 = wasm_f32x4_splat(+0.008323502727f);
    v128_t const coeff_1_f32x4 = wasm_f32x4_splat(-0.1666651368f);

    // Compute (multiples_of_pi_f32x4) = round(angle / pi_f32x4) using nearest rounding
    v128_t quotients_f32x4 = wasm_f32x4_mul(angles_radians, pi_reciprocal_f32x4);
    v128_t rounded_quotients_f32x4 = wasm_f32x4_nearest(quotients_f32x4);
    // relaxed_trunc: 1 instruction (cvttps2dq) vs 7 (with NaN/overflow fixup) on x86.
    // Safe because rounded_quotients_f32x4 are small integers from nearest(), never NaN or out of i32 range.
    v128_t multiples_of_pi_f32x4 = wasm_i32x4_relaxed_trunc_f32x4(rounded_quotients_f32x4);

    // Reduce the angle: angle - rounded_quotients_f32x4 * pi_f32x4
    // vfmsq_f32(acc, a, b) = acc - a*b -> wasm_f32x4_relaxed_nmadd(a, b, acc)
    v128_t const angles_f32x4 = wasm_f32x4_relaxed_nmadd(rounded_quotients_f32x4, pi_f32x4, angles_radians);
    v128_t const angles_squared_f32x4 = wasm_f32x4_mul(angles_f32x4, angles_f32x4);
    v128_t const angles_cubed_f32x4 = wasm_f32x4_mul(angles_f32x4, angles_squared_f32x4);

    // Compute the polynomial approximation (Horner's method in x^2)
    // vfmaq_f32(acc, a, b) = acc + a*b -> wasm_f32x4_relaxed_madd(a, b, acc)
    v128_t polynomials_f32x4 = coeff_5_f32x4;
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, angles_squared_f32x4, coeff_3_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, angles_squared_f32x4, coeff_1_f32x4);
    // sin(x) ~ x + x^3 * P(x^2)
    v128_t results_f32x4 = wasm_f32x4_relaxed_madd(angles_cubed_f32x4, polynomials_f32x4, angles_f32x4);

    // If multiples_of_pi_f32x4 is odd, flip the sign (sin(x + k*pi) = (-1)^k * sin(x))
    v128_t parity_i32x4 = wasm_v128_and(multiples_of_pi_f32x4, wasm_i32x4_splat(1));
    v128_t odd_mask_i32x4 = wasm_i32x4_eq(parity_i32x4, wasm_i32x4_splat(1));
    v128_t negated_f32x4 = wasm_f32x4_neg(results_f32x4);
    // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
    // Safe because mask is from comparison (all-ones or all-zeros per lane).
    results_f32x4 = wasm_i32x4_relaxed_laneselect(negated_f32x4, results_f32x4, odd_mask_i32x4);
    return results_f32x4;
}
|
|
90
90
|
|
|
91
91
|
NK_INTERNAL v128_t nk_f32x4_cos_v128relaxed_(v128_t const angles_radians) {
    // Approximates cos(x) lane-wise for 4 f32 lanes by evaluating the same sine
    // polynomial as nk_f32x4_sin_v128relaxed_ on the phase-shifted argument
    // (x - pi/2), with the parity-based sign flip inverted (even k negates).
    // Constants for argument reduction
    v128_t const pi_f32x4 = wasm_f32x4_splat(3.14159265358979323846f);
    v128_t const pi_half_f32x4 = wasm_f32x4_splat(1.57079632679489661923f);
    v128_t const pi_reciprocal_f32x4 = wasm_f32x4_splat(0.31830988618379067154f);
    // Minimax polynomial coefficients shared with the sine kernel.
    v128_t const coeff_5_f32x4 = wasm_f32x4_splat(-0.0001881748176f);
    v128_t const coeff_3_f32x4 = wasm_f32x4_splat(+0.008323502727f);
    v128_t const coeff_1_f32x4 = wasm_f32x4_splat(-0.1666651368f);

    // Compute round((angle / pi_f32x4) - 0.5)
    v128_t const neg_half_f32x4 = wasm_f32x4_splat(-0.5f);
    v128_t quotients_f32x4 = wasm_f32x4_relaxed_madd(angles_radians, pi_reciprocal_f32x4, neg_half_f32x4);
    v128_t rounded_quotients_f32x4 = wasm_f32x4_nearest(quotients_f32x4);
    // relaxed_trunc: 1 instruction (cvttps2dq) vs 7 (with NaN/overflow fixup) on x86.
    // Safe because rounded_quotients_f32x4 are small integers from nearest(), never NaN or out of i32 range.
    v128_t multiples_of_pi_f32x4 = wasm_i32x4_relaxed_trunc_f32x4(rounded_quotients_f32x4);

    // Reduce the angle: (angle - pi_f32x4/2) - rounded_quotients_f32x4 * pi_f32x4
    v128_t shifted_f32x4 = wasm_f32x4_sub(angles_radians, pi_half_f32x4);
    v128_t const angles_f32x4 = wasm_f32x4_relaxed_nmadd(rounded_quotients_f32x4, pi_f32x4, shifted_f32x4);
    v128_t const angles_squared_f32x4 = wasm_f32x4_mul(angles_f32x4, angles_f32x4);
    v128_t const angles_cubed_f32x4 = wasm_f32x4_mul(angles_f32x4, angles_squared_f32x4);

    // Compute the polynomial approximation (Horner's method in x^2)
    v128_t polynomials_f32x4 = coeff_5_f32x4;
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, angles_squared_f32x4, coeff_3_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, angles_squared_f32x4, coeff_1_f32x4);
    // sin(reduced) ~ x + x^3 * P(x^2); cos(angle) follows from the phase shift above.
    v128_t results_f32x4 = wasm_f32x4_relaxed_madd(angles_cubed_f32x4, polynomials_f32x4, angles_f32x4);

    // If multiples_of_pi_f32x4 is even, flip the sign
    v128_t parity_i32x4 = wasm_v128_and(multiples_of_pi_f32x4, wasm_i32x4_splat(1));
    v128_t even_mask_i32x4 = wasm_i32x4_eq(parity_i32x4, wasm_i32x4_splat(0));
    v128_t negated_f32x4 = wasm_f32x4_neg(results_f32x4);
    // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
    // Safe because mask is from comparison (all-ones or all-zeros per lane).
    results_f32x4 = wasm_i32x4_relaxed_laneselect(negated_f32x4, results_f32x4, even_mask_i32x4);
    return results_f32x4;
}
|
|
129
129
|
|
|
130
130
|
NK_INTERNAL v128_t nk_f32x4_atan_v128relaxed_(v128_t const inputs) {
    // Approximates atan(x) lane-wise for 4 f32 lanes:
    //   1. work on |x|, remembering the sign;
    //   2. if |x| > 1, evaluate on 1/|x| and use atan(x) = pi/2 - atan(1/x);
    //   3. odd polynomial x + x^3 * P(x^2) via Horner on the reduced value;
    //   4. undo the reciprocal and sign reductions with lane selects.
    // Polynomial coefficients for atan approximation (8 terms)
    v128_t const coeff_8_f32x4 = wasm_f32x4_splat(-0.333331018686294555664062f);
    v128_t const coeff_7_f32x4 = wasm_f32x4_splat(+0.199926957488059997558594f);
    v128_t const coeff_6_f32x4 = wasm_f32x4_splat(-0.142027363181114196777344f);
    v128_t const coeff_5_f32x4 = wasm_f32x4_splat(+0.106347933411598205566406f);
    v128_t const coeff_4_f32x4 = wasm_f32x4_splat(-0.0748900920152664184570312f);
    v128_t const coeff_3_f32x4 = wasm_f32x4_splat(+0.0425049886107444763183594f);
    v128_t const coeff_2_f32x4 = wasm_f32x4_splat(-0.0159569028764963150024414f);
    v128_t const coeff_1_f32x4 = wasm_f32x4_splat(+0.00282363896258175373077393f);
    v128_t const half_pi_f32x4 = wasm_f32x4_splat(1.5707963267948966f);

    // Detect negative values_f32x4 and take absolute value
    v128_t const zeros_f32x4 = wasm_f32x4_splat(0);
    v128_t negative_mask_f32x4 = wasm_f32x4_lt(inputs, zeros_f32x4);
    v128_t values_f32x4 = wasm_f32x4_abs(inputs);

    // Check if values_f32x4 > 1 (need reciprocal)
    v128_t reciprocal_mask_f32x4 = wasm_f32x4_gt(values_f32x4, wasm_f32x4_splat(1.0f));

    // No fast reciprocal in WASM — use division
    v128_t recip_f32x4 = wasm_f32x4_div(wasm_f32x4_splat(1.0f), values_f32x4);
    // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
    // Safe because mask is from comparison (all-ones or all-zeros_f32x4 per lane).
    values_f32x4 = wasm_i32x4_relaxed_laneselect(recip_f32x4, values_f32x4, reciprocal_mask_f32x4);

    // Compute powers
    v128_t const values_squared_f32x4 = wasm_f32x4_mul(values_f32x4, values_f32x4);
    v128_t const values_cubed_f32x4 = wasm_f32x4_mul(values_f32x4, values_squared_f32x4);

    // Polynomial evaluation using Horner's method
    v128_t polynomials_f32x4 = coeff_1_f32x4;
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, values_squared_f32x4, coeff_2_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, values_squared_f32x4, coeff_3_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, values_squared_f32x4, coeff_4_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, values_squared_f32x4, coeff_5_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, values_squared_f32x4, coeff_6_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, values_squared_f32x4, coeff_7_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, values_squared_f32x4, coeff_8_f32x4);

    // Compute result_f32x4: atan(x) ~ x + x^3 * P(x^2)
    v128_t result_f32x4 = wasm_f32x4_relaxed_madd(values_cubed_f32x4, polynomials_f32x4, values_f32x4);

    // Adjust for reciprocal: result_f32x4 = pi/2 - result_f32x4
    v128_t adjusted_f32x4 = wasm_f32x4_sub(half_pi_f32x4, result_f32x4);
    // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
    // Safe because mask is from comparison (all-ones or all-zeros_f32x4 per lane).
    result_f32x4 = wasm_i32x4_relaxed_laneselect(adjusted_f32x4, result_f32x4, reciprocal_mask_f32x4);

    // Adjust for negative: result_f32x4 = -result_f32x4
    v128_t negated_f32x4 = wasm_f32x4_neg(result_f32x4);
    // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
    // Safe because mask is from comparison (all-ones or all-zeros_f32x4 per lane).
    result_f32x4 = wasm_i32x4_relaxed_laneselect(negated_f32x4, result_f32x4, negative_mask_f32x4);
    return result_f32x4;
}
|
|
186
186
|
|
|
187
187
|
NK_INTERNAL v128_t nk_f32x4_atan2_v128relaxed_(v128_t const ys_inputs, v128_t const xs_inputs) {
    // Approximates atan2(y, x) lane-wise for 4 f32 lanes: reduce to a proper
    // fraction |y|/|x| (swapping operands when |y| > |x|), evaluate the shared
    // atan polynomial, then add a quadrant offset (multiples of pi/2) and
    // restore signs by XOR-ing in the sign bits of the original x and y.
    // NOTE(review): x == 0 lanes divide by zero and rely on IEEE inf/NaN
    // propagation through the blends — confirm intended behavior for (0, 0).
    // Polynomial coefficients (same as atan)
    v128_t const coeff_8_f32x4 = wasm_f32x4_splat(-0.333331018686294555664062f);
    v128_t const coeff_7_f32x4 = wasm_f32x4_splat(+0.199926957488059997558594f);
    v128_t const coeff_6_f32x4 = wasm_f32x4_splat(-0.142027363181114196777344f);
    v128_t const coeff_5_f32x4 = wasm_f32x4_splat(+0.106347933411598205566406f);
    v128_t const coeff_4_f32x4 = wasm_f32x4_splat(-0.0748900920152664184570312f);
    v128_t const coeff_3_f32x4 = wasm_f32x4_splat(+0.0425049886107444763183594f);
    v128_t const coeff_2_f32x4 = wasm_f32x4_splat(-0.0159569028764963150024414f);
    v128_t const coeff_1_f32x4 = wasm_f32x4_splat(+0.00282363896258175373077393f);
    v128_t const pi_f32x4 = wasm_f32x4_splat(3.14159265358979323846f);
    v128_t const half_pi_f32x4 = wasm_f32x4_splat(1.5707963267948966f);
    v128_t const zeros_f32x4 = wasm_f32x4_splat(0);

    // Quadrant adjustments - take absolute values
    v128_t xs_negative_mask_f32x4 = wasm_f32x4_lt(xs_inputs, zeros_f32x4);
    v128_t xs_f32x4 = wasm_f32x4_abs(xs_inputs);
    v128_t ys_f32x4 = wasm_f32x4_abs(ys_inputs);

    // Ensure proper fraction where numerator < denominator
    v128_t swap_mask_f32x4 = wasm_f32x4_gt(ys_f32x4, xs_f32x4);
    v128_t temps_f32x4 = xs_f32x4;
    // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
    // Safe because mask is from comparison (all-ones or all-zeros_f32x4 per lane).
    xs_f32x4 = wasm_i32x4_relaxed_laneselect(ys_f32x4, xs_f32x4, swap_mask_f32x4);
    // Swapped lanes use -x as the numerator so atan(-x/y) = pi/2 - atan(y/x) works out below.
    ys_f32x4 = wasm_i32x4_relaxed_laneselect(wasm_f32x4_neg(temps_f32x4), ys_f32x4, swap_mask_f32x4);

    // Division for ratio_f32x4: ratio_f32x4 = ys_f32x4 / xs_f32x4
    v128_t const ratio_f32x4 = wasm_f32x4_div(ys_f32x4, xs_f32x4);
    v128_t const ratio_squared_f32x4 = wasm_f32x4_mul(ratio_f32x4, ratio_f32x4);
    v128_t const ratio_cubed_f32x4 = wasm_f32x4_mul(ratio_f32x4, ratio_squared_f32x4);

    // Polynomial evaluation using Horner's method
    v128_t polynomials_f32x4 = coeff_1_f32x4;
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, ratio_squared_f32x4, coeff_2_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, ratio_squared_f32x4, coeff_3_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, ratio_squared_f32x4, coeff_4_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, ratio_squared_f32x4, coeff_5_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, ratio_squared_f32x4, coeff_6_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, ratio_squared_f32x4, coeff_7_f32x4);
    polynomials_f32x4 = wasm_f32x4_relaxed_madd(polynomials_f32x4, ratio_squared_f32x4, coeff_8_f32x4);

    // Compute the result: atan(r) ~ r + r^3 * P(r^2)
    v128_t results_f32x4 = wasm_f32x4_relaxed_madd(ratio_cubed_f32x4, polynomials_f32x4, ratio_f32x4);

    // Compute quadrant_f32x4 value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
    // -2 for x<0 && !swap, -1 for x<0 && swap
    v128_t quadrant_f32x4 = wasm_f32x4_splat(0.0f);
    v128_t neg_two_f32x4 = wasm_f32x4_splat(-2.0f);
    // relaxed_laneselect: 1 instruction (vblendvps) vs 3 (vpand+vpandn+vpor) on x86.
    // Safe because mask is from comparison (all-ones or all-zeros_f32x4 per lane).
    quadrant_f32x4 = wasm_i32x4_relaxed_laneselect(neg_two_f32x4, quadrant_f32x4, xs_negative_mask_f32x4);
    v128_t quadrant_incremented_f32x4 = wasm_f32x4_add(quadrant_f32x4, wasm_f32x4_splat(1.0f));
    quadrant_f32x4 = wasm_i32x4_relaxed_laneselect(quadrant_incremented_f32x4, quadrant_f32x4, swap_mask_f32x4);

    // Adjust for quadrant_f32x4: result += quadrant_f32x4 * pi_f32x4/2
    results_f32x4 = wasm_f32x4_relaxed_madd(quadrant_f32x4, half_pi_f32x4, results_f32x4);

    // Transfer sign from x and y by XOR with sign bits
    v128_t sign_mask_f32x4 = wasm_f32x4_splat(-0.0f);
    v128_t xs_sign_f32x4 = wasm_v128_and(xs_inputs, sign_mask_f32x4);
    v128_t ys_sign_f32x4 = wasm_v128_and(ys_inputs, sign_mask_f32x4);
    results_f32x4 = wasm_v128_xor(results_f32x4, xs_sign_f32x4);
    results_f32x4 = wasm_v128_xor(results_f32x4, ys_sign_f32x4);

    return results_f32x4;
}
|
|
254
254
|
|
|
255
255
|
NK_INTERNAL v128_t nk_f64x2_sin_v128relaxed_(v128_t const angles_radians) {
|
|
256
256
|
// Constants for argument reduction
|
|
257
|
-
v128_t const
|
|
258
|
-
v128_t const
|
|
259
|
-
v128_t const
|
|
257
|
+
v128_t const pi_high_f64x2 = wasm_f64x2_splat(3.141592653589793116);
|
|
258
|
+
v128_t const pi_low_f64x2 = wasm_f64x2_splat(1.2246467991473532072e-16);
|
|
259
|
+
v128_t const pi_reciprocal_f64x2 = wasm_f64x2_splat(0.31830988618379067154);
|
|
260
260
|
|
|
261
261
|
// Polynomial coefficients for sine approximation
|
|
262
|
-
v128_t const
|
|
263
|
-
v128_t const
|
|
264
|
-
v128_t const
|
|
265
|
-
v128_t const
|
|
266
|
-
v128_t const
|
|
267
|
-
v128_t const
|
|
268
|
-
v128_t const
|
|
269
|
-
v128_t const
|
|
270
|
-
v128_t const
|
|
262
|
+
v128_t const coeff_0_f64x2 = wasm_f64x2_splat(+0.00833333333333332974823815);
|
|
263
|
+
v128_t const coeff_1_f64x2 = wasm_f64x2_splat(-0.000198412698412696162806809);
|
|
264
|
+
v128_t const coeff_2_f64x2 = wasm_f64x2_splat(+2.75573192239198747630416e-06);
|
|
265
|
+
v128_t const coeff_3_f64x2 = wasm_f64x2_splat(-2.50521083763502045810755e-08);
|
|
266
|
+
v128_t const coeff_4_f64x2 = wasm_f64x2_splat(+1.60590430605664501629054e-10);
|
|
267
|
+
v128_t const coeff_5_f64x2 = wasm_f64x2_splat(-7.64712219118158833288484e-13);
|
|
268
|
+
v128_t const coeff_6_f64x2 = wasm_f64x2_splat(+2.81009972710863200091251e-15);
|
|
269
|
+
v128_t const coeff_7_f64x2 = wasm_f64x2_splat(-7.97255955009037868891952e-18);
|
|
270
|
+
v128_t const coeff_8_f64x2 = wasm_f64x2_splat(-0.166666666666666657414808);
|
|
271
271
|
|
|
272
272
|
// Compute round(angle / pi)
|
|
273
|
-
v128_t const
|
|
274
|
-
v128_t
|
|
273
|
+
v128_t const quotients_f64x2 = wasm_f64x2_mul(angles_radians, pi_reciprocal_f64x2);
|
|
274
|
+
v128_t rounded_quotients_f64x2 = wasm_f64x2_nearest(quotients_f64x2);
|
|
275
275
|
// relaxed_trunc: 1 instruction (cvttpd2dq) vs 7 (with NaN/overflow fixup) on x86.
|
|
276
|
-
// Safe because
|
|
277
|
-
v128_t
|
|
276
|
+
// Safe because rounded_quotients_f64x2 are small integers from nearest(), never NaN or out of i32 range.
|
|
277
|
+
v128_t multiples_i32_f64x2 = wasm_i32x4_relaxed_trunc_f64x2_zero(rounded_quotients_f64x2);
|
|
278
278
|
|
|
279
|
-
// Two-step Cody-Waite reduction: angle - rounded *
|
|
280
|
-
v128_t
|
|
281
|
-
|
|
282
|
-
|
|
279
|
+
// Two-step Cody-Waite reduction: angle - rounded * pi_high_f64x2 - rounded * pi_low_f64x2
|
|
280
|
+
v128_t angles_f64x2 = angles_radians;
|
|
281
|
+
angles_f64x2 = wasm_f64x2_relaxed_nmadd(rounded_quotients_f64x2, pi_high_f64x2, angles_f64x2);
|
|
282
|
+
angles_f64x2 = wasm_f64x2_relaxed_nmadd(rounded_quotients_f64x2, pi_low_f64x2, angles_f64x2);
|
|
283
283
|
|
|
284
284
|
// Check parity in i32, then widen to i64 mask for laneselect
|
|
285
|
-
v128_t
|
|
286
|
-
v128_t
|
|
285
|
+
v128_t parity_i32_i32x4 = wasm_v128_and(multiples_i32_f64x2, wasm_i32x4_splat(1));
|
|
286
|
+
v128_t odd_i32_i32x4 = wasm_i32x4_eq(parity_i32_i32x4, wasm_i32x4_splat(1));
|
|
287
287
|
// Widen: lane0 of i32 -> lanes 0-1 of i64, lane1 -> lanes 2-3
|
|
288
288
|
// Shuffle i32 lanes [0,0,1,1] to broadcast each i32 parity into both halves of each i64
|
|
289
|
-
v128_t
|
|
290
|
-
v128_t
|
|
289
|
+
v128_t odd_mask_i32x4 = wasm_i32x4_shuffle(odd_i32_i32x4, odd_i32_i32x4, 0, 0, 1, 1);
|
|
290
|
+
v128_t negated_angles_f64x2 = wasm_f64x2_neg(angles_f64x2);
|
|
291
291
|
// relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
|
|
292
292
|
// Safe because mask is lane-granular at i64 width (all-ones or all-zeros per 64-bit lane).
|
|
293
|
-
|
|
293
|
+
angles_f64x2 = wasm_i64x2_relaxed_laneselect(negated_angles_f64x2, angles_f64x2, odd_mask_i32x4);
|
|
294
294
|
|
|
295
|
-
v128_t const
|
|
296
|
-
v128_t const
|
|
297
|
-
v128_t const
|
|
298
|
-
v128_t const
|
|
295
|
+
v128_t const angles_squared_f64x2 = wasm_f64x2_mul(angles_f64x2, angles_f64x2);
|
|
296
|
+
v128_t const angles_cubed_f64x2 = wasm_f64x2_mul(angles_f64x2, angles_squared_f64x2);
|
|
297
|
+
v128_t const angles_quadratic_f64x2 = wasm_f64x2_mul(angles_squared_f64x2, angles_squared_f64x2);
|
|
298
|
+
v128_t const angles_octic_f64x2 = wasm_f64x2_mul(angles_quadratic_f64x2, angles_quadratic_f64x2);
|
|
299
299
|
|
|
300
300
|
// Compute polynomial terms using Estrin's scheme for better ILP
|
|
301
|
-
v128_t const
|
|
302
|
-
v128_t const
|
|
303
|
-
v128_t const
|
|
301
|
+
v128_t const poly_67_f64x2 = wasm_f64x2_relaxed_madd(angles_squared_f64x2, coeff_7_f64x2, coeff_6_f64x2);
|
|
302
|
+
v128_t const poly_45_f64x2 = wasm_f64x2_relaxed_madd(angles_squared_f64x2, coeff_5_f64x2, coeff_4_f64x2);
|
|
303
|
+
v128_t const poly_4567_f64x2 = wasm_f64x2_relaxed_madd(angles_quadratic_f64x2, poly_67_f64x2, poly_45_f64x2);
|
|
304
304
|
|
|
305
|
-
v128_t const
|
|
306
|
-
v128_t const
|
|
307
|
-
v128_t const
|
|
305
|
+
v128_t const poly_23_f64x2 = wasm_f64x2_relaxed_madd(angles_squared_f64x2, coeff_3_f64x2, coeff_2_f64x2);
|
|
306
|
+
v128_t const poly_01_f64x2 = wasm_f64x2_relaxed_madd(angles_squared_f64x2, coeff_1_f64x2, coeff_0_f64x2);
|
|
307
|
+
v128_t const poly_0123_f64x2 = wasm_f64x2_relaxed_madd(angles_quadratic_f64x2, poly_23_f64x2, poly_01_f64x2);
|
|
308
308
|
|
|
309
309
|
// Combine polynomial terms
|
|
310
|
-
v128_t
|
|
311
|
-
|
|
312
|
-
|
|
310
|
+
v128_t results_f64x2 = wasm_f64x2_relaxed_madd(angles_octic_f64x2, poly_4567_f64x2, poly_0123_f64x2);
|
|
311
|
+
results_f64x2 = wasm_f64x2_relaxed_madd(results_f64x2, angles_squared_f64x2, coeff_8_f64x2);
|
|
312
|
+
results_f64x2 = wasm_f64x2_relaxed_madd(results_f64x2, angles_cubed_f64x2, angles_f64x2);
|
|
313
313
|
|
|
314
314
|
// Handle zero input (preserve sign of zero)
|
|
315
|
-
v128_t const
|
|
315
|
+
v128_t const non_zero_mask_f64x2 = wasm_f64x2_eq(angles_radians, wasm_f64x2_splat(0));
|
|
316
316
|
// relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
|
|
317
317
|
// Safe because mask is from comparison (all-ones or all-zeros per lane).
|
|
318
|
-
|
|
319
|
-
return
|
|
318
|
+
results_f64x2 = wasm_i64x2_relaxed_laneselect(angles_radians, results_f64x2, non_zero_mask_f64x2);
|
|
319
|
+
return results_f64x2;
|
|
320
320
|
}
|
|
321
321
|
|
|
322
322
|
NK_INTERNAL v128_t nk_f64x2_cos_v128relaxed_(v128_t const angles_radians) {
|
|
323
323
|
// Constants for argument reduction
|
|
324
|
-
v128_t const
|
|
325
|
-
v128_t const
|
|
326
|
-
v128_t const
|
|
324
|
+
v128_t const pi_high_half_f64x2 = wasm_f64x2_splat(3.141592653589793116 * 0.5);
|
|
325
|
+
v128_t const pi_low_half_f64x2 = wasm_f64x2_splat(1.2246467991473532072e-16 * 0.5);
|
|
326
|
+
v128_t const pi_reciprocal_f64x2 = wasm_f64x2_splat(0.31830988618379067154);
|
|
327
327
|
|
|
328
328
|
// Polynomial coefficients for cosine approximation
|
|
329
|
-
v128_t const
|
|
330
|
-
v128_t const
|
|
331
|
-
v128_t const
|
|
332
|
-
v128_t const
|
|
333
|
-
v128_t const
|
|
334
|
-
v128_t const
|
|
335
|
-
v128_t const
|
|
336
|
-
v128_t const
|
|
337
|
-
v128_t const
|
|
329
|
+
v128_t const coeff_0_f64x2 = wasm_f64x2_splat(+0.00833333333333332974823815);
|
|
330
|
+
v128_t const coeff_1_f64x2 = wasm_f64x2_splat(-0.000198412698412696162806809);
|
|
331
|
+
v128_t const coeff_2_f64x2 = wasm_f64x2_splat(+2.75573192239198747630416e-06);
|
|
332
|
+
v128_t const coeff_3_f64x2 = wasm_f64x2_splat(-2.50521083763502045810755e-08);
|
|
333
|
+
v128_t const coeff_4_f64x2 = wasm_f64x2_splat(+1.60590430605664501629054e-10);
|
|
334
|
+
v128_t const coeff_5_f64x2 = wasm_f64x2_splat(-7.64712219118158833288484e-13);
|
|
335
|
+
v128_t const coeff_6_f64x2 = wasm_f64x2_splat(+2.81009972710863200091251e-15);
|
|
336
|
+
v128_t const coeff_7_f64x2 = wasm_f64x2_splat(-7.97255955009037868891952e-18);
|
|
337
|
+
v128_t const coeff_8_f64x2 = wasm_f64x2_splat(-0.166666666666666657414808);
|
|
338
338
|
|
|
339
339
|
// Compute 2 * round(angle / pi - 0.5) + 1
|
|
340
|
-
v128_t const
|
|
341
|
-
v128_t const
|
|
342
|
-
v128_t const
|
|
343
|
-
v128_t const
|
|
340
|
+
v128_t const neg_half_f64x2 = wasm_f64x2_splat(-0.5);
|
|
341
|
+
v128_t const quotients_f64x2 = wasm_f64x2_relaxed_madd(angles_radians, pi_reciprocal_f64x2, neg_half_f64x2);
|
|
342
|
+
v128_t const rounded_f64x2 = wasm_f64x2_nearest(quotients_f64x2);
|
|
343
|
+
v128_t const rounded_quotients_f64x2 = wasm_f64x2_relaxed_madd(wasm_f64x2_splat(2.0), rounded_f64x2,
|
|
344
|
+
wasm_f64x2_splat(1.0));
|
|
344
345
|
// relaxed_trunc: 1 instruction (cvttpd2dq) vs 7 (with NaN/overflow fixup) on x86.
|
|
345
|
-
// Safe because
|
|
346
|
-
v128_t
|
|
346
|
+
// Safe because rounded_quotients_f64x2 are small integers from nearest(), never NaN or out of i32 range.
|
|
347
|
+
v128_t quotients_i32_f64x2 = wasm_i32x4_relaxed_trunc_f64x2_zero(rounded_quotients_f64x2);
|
|
347
348
|
|
|
348
349
|
// Two-step Cody-Waite reduction
|
|
349
|
-
v128_t
|
|
350
|
-
|
|
351
|
-
|
|
350
|
+
v128_t angles_f64x2 = angles_radians;
|
|
351
|
+
angles_f64x2 = wasm_f64x2_relaxed_nmadd(rounded_quotients_f64x2, pi_high_half_f64x2, angles_f64x2);
|
|
352
|
+
angles_f64x2 = wasm_f64x2_relaxed_nmadd(rounded_quotients_f64x2, pi_low_half_f64x2, angles_f64x2);
|
|
352
353
|
|
|
353
354
|
// Check bit 1 in i32, then widen to i64 mask for laneselect
|
|
354
|
-
v128_t
|
|
355
|
-
v128_t
|
|
356
|
-
v128_t
|
|
357
|
-
v128_t
|
|
355
|
+
v128_t bit2_i32_i32x4 = wasm_v128_and(quotients_i32_f64x2, wasm_i32x4_splat(2));
|
|
356
|
+
v128_t flip_i32_i32x4 = wasm_i32x4_eq(bit2_i32_i32x4, wasm_i32x4_splat(0));
|
|
357
|
+
v128_t flip_mask_i32x4 = wasm_i32x4_shuffle(flip_i32_i32x4, flip_i32_i32x4, 0, 0, 1, 1);
|
|
358
|
+
v128_t negated_angles_f64x2 = wasm_f64x2_neg(angles_f64x2);
|
|
358
359
|
// relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
|
|
359
360
|
// Safe because mask is lane-granular at i64 width (all-ones or all-zeros per 64-bit lane).
|
|
360
|
-
|
|
361
|
+
angles_f64x2 = wasm_i64x2_relaxed_laneselect(negated_angles_f64x2, angles_f64x2, flip_mask_i32x4);
|
|
361
362
|
|
|
362
|
-
v128_t const
|
|
363
|
-
v128_t const
|
|
364
|
-
v128_t const
|
|
365
|
-
v128_t const
|
|
363
|
+
v128_t const angles_squared_f64x2 = wasm_f64x2_mul(angles_f64x2, angles_f64x2);
|
|
364
|
+
v128_t const angles_cubed_f64x2 = wasm_f64x2_mul(angles_f64x2, angles_squared_f64x2);
|
|
365
|
+
v128_t const angles_quadratic_f64x2 = wasm_f64x2_mul(angles_squared_f64x2, angles_squared_f64x2);
|
|
366
|
+
v128_t const angles_octic_f64x2 = wasm_f64x2_mul(angles_quadratic_f64x2, angles_quadratic_f64x2);
|
|
366
367
|
|
|
367
368
|
// Compute polynomial terms using Estrin's scheme
|
|
368
|
-
v128_t const
|
|
369
|
-
v128_t const
|
|
370
|
-
v128_t const
|
|
369
|
+
v128_t const poly_67_f64x2 = wasm_f64x2_relaxed_madd(angles_squared_f64x2, coeff_7_f64x2, coeff_6_f64x2);
|
|
370
|
+
v128_t const poly_45_f64x2 = wasm_f64x2_relaxed_madd(angles_squared_f64x2, coeff_5_f64x2, coeff_4_f64x2);
|
|
371
|
+
v128_t const poly_4567_f64x2 = wasm_f64x2_relaxed_madd(angles_quadratic_f64x2, poly_67_f64x2, poly_45_f64x2);
|
|
371
372
|
|
|
372
|
-
v128_t const
|
|
373
|
-
v128_t const
|
|
374
|
-
v128_t const
|
|
373
|
+
v128_t const poly_23_f64x2 = wasm_f64x2_relaxed_madd(angles_squared_f64x2, coeff_3_f64x2, coeff_2_f64x2);
|
|
374
|
+
v128_t const poly_01_f64x2 = wasm_f64x2_relaxed_madd(angles_squared_f64x2, coeff_1_f64x2, coeff_0_f64x2);
|
|
375
|
+
v128_t const poly_0123_f64x2 = wasm_f64x2_relaxed_madd(angles_quadratic_f64x2, poly_23_f64x2, poly_01_f64x2);
|
|
375
376
|
|
|
376
377
|
// Combine polynomial terms
|
|
377
|
-
v128_t
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
return
|
|
378
|
+
v128_t results_f64x2 = wasm_f64x2_relaxed_madd(angles_octic_f64x2, poly_4567_f64x2, poly_0123_f64x2);
|
|
379
|
+
results_f64x2 = wasm_f64x2_relaxed_madd(results_f64x2, angles_squared_f64x2, coeff_8_f64x2);
|
|
380
|
+
results_f64x2 = wasm_f64x2_relaxed_madd(results_f64x2, angles_cubed_f64x2, angles_f64x2);
|
|
381
|
+
return results_f64x2;
|
|
381
382
|
}
|
|
382
383
|
|
|
383
384
|
NK_INTERNAL v128_t nk_f64x2_atan_v128relaxed_(v128_t const inputs) {
|
|
384
385
|
// Polynomial coefficients for atan approximation (19 terms)
|
|
385
|
-
v128_t const
|
|
386
|
-
v128_t const
|
|
387
|
-
v128_t const
|
|
388
|
-
v128_t const
|
|
389
|
-
v128_t const
|
|
390
|
-
v128_t const
|
|
391
|
-
v128_t const
|
|
392
|
-
v128_t const
|
|
393
|
-
v128_t const
|
|
394
|
-
v128_t const
|
|
395
|
-
v128_t const
|
|
396
|
-
v128_t const
|
|
397
|
-
v128_t const
|
|
398
|
-
v128_t const
|
|
399
|
-
v128_t const
|
|
400
|
-
v128_t const
|
|
401
|
-
v128_t const
|
|
402
|
-
v128_t const
|
|
403
|
-
v128_t const
|
|
404
|
-
v128_t const
|
|
405
|
-
v128_t const
|
|
386
|
+
v128_t const coeff_19_f64x2 = wasm_f64x2_splat(-1.88796008463073496563746e-05);
|
|
387
|
+
v128_t const coeff_18_f64x2 = wasm_f64x2_splat(+0.000209850076645816976906797);
|
|
388
|
+
v128_t const coeff_17_f64x2 = wasm_f64x2_splat(-0.00110611831486672482563471);
|
|
389
|
+
v128_t const coeff_16_f64x2 = wasm_f64x2_splat(+0.00370026744188713119232403);
|
|
390
|
+
v128_t const coeff_15_f64x2 = wasm_f64x2_splat(-0.00889896195887655491740809);
|
|
391
|
+
v128_t const coeff_14_f64x2 = wasm_f64x2_splat(+0.016599329773529201970117);
|
|
392
|
+
v128_t const coeff_13_f64x2 = wasm_f64x2_splat(-0.0254517624932312641616861);
|
|
393
|
+
v128_t const coeff_12_f64x2 = wasm_f64x2_splat(+0.0337852580001353069993897);
|
|
394
|
+
v128_t const coeff_11_f64x2 = wasm_f64x2_splat(-0.0407629191276836500001934);
|
|
395
|
+
v128_t const coeff_10_f64x2 = wasm_f64x2_splat(+0.0466667150077840625632675);
|
|
396
|
+
v128_t const coeff_9_f64x2 = wasm_f64x2_splat(-0.0523674852303482457616113);
|
|
397
|
+
v128_t const coeff_8_f64x2 = wasm_f64x2_splat(+0.0587666392926673580854313);
|
|
398
|
+
v128_t const coeff_7_f64x2 = wasm_f64x2_splat(-0.0666573579361080525984562);
|
|
399
|
+
v128_t const coeff_6_f64x2 = wasm_f64x2_splat(+0.0769219538311769618355029);
|
|
400
|
+
v128_t const coeff_5_f64x2 = wasm_f64x2_splat(-0.090908995008245008229153);
|
|
401
|
+
v128_t const coeff_4_f64x2 = wasm_f64x2_splat(+0.111111105648261418443745);
|
|
402
|
+
v128_t const coeff_3_f64x2 = wasm_f64x2_splat(-0.14285714266771329383765);
|
|
403
|
+
v128_t const coeff_2_f64x2 = wasm_f64x2_splat(+0.199999999996591265594148);
|
|
404
|
+
v128_t const coeff_1_f64x2 = wasm_f64x2_splat(-0.333333333333311110369124);
|
|
405
|
+
v128_t const half_pi_f64x2 = wasm_f64x2_splat(1.5707963267948966);
|
|
406
|
+
v128_t const zeros_f64x2 = wasm_f64x2_splat(0);
|
|
406
407
|
|
|
407
408
|
// Detect negative and take absolute value
|
|
408
|
-
v128_t
|
|
409
|
-
v128_t
|
|
409
|
+
v128_t negative_mask_f64x2 = wasm_f64x2_lt(inputs, zeros_f64x2);
|
|
410
|
+
v128_t values_f64x2 = wasm_f64x2_abs(inputs);
|
|
410
411
|
|
|
411
|
-
// Check if
|
|
412
|
-
v128_t
|
|
413
|
-
v128_t
|
|
412
|
+
// Check if values_f64x2 > 1 (need reciprocal) - use division for f64 precision
|
|
413
|
+
v128_t reciprocal_mask_f64x2 = wasm_f64x2_gt(values_f64x2, wasm_f64x2_splat(1.0));
|
|
414
|
+
v128_t reciprocal_values_f64x2 = wasm_f64x2_div(wasm_f64x2_splat(1.0), values_f64x2);
|
|
414
415
|
// relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
|
|
415
|
-
// Safe because mask is from comparison (all-ones or all-
|
|
416
|
-
|
|
416
|
+
// Safe because mask is from comparison (all-ones or all-zeros_f64x2 per lane).
|
|
417
|
+
values_f64x2 = wasm_i64x2_relaxed_laneselect(reciprocal_values_f64x2, values_f64x2, reciprocal_mask_f64x2);
|
|
417
418
|
|
|
418
419
|
// Compute powers
|
|
419
|
-
v128_t const
|
|
420
|
-
v128_t const
|
|
420
|
+
v128_t const values_squared_f64x2 = wasm_f64x2_mul(values_f64x2, values_f64x2);
|
|
421
|
+
v128_t const values_cubed_f64x2 = wasm_f64x2_mul(values_f64x2, values_squared_f64x2);
|
|
421
422
|
|
|
422
423
|
// Polynomial evaluation using Horner's method
|
|
423
|
-
v128_t
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
// Compute
|
|
444
|
-
v128_t
|
|
445
|
-
|
|
446
|
-
// Adjust for reciprocal:
|
|
447
|
-
v128_t
|
|
424
|
+
v128_t polynomials_f64x2 = coeff_19_f64x2;
|
|
425
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_18_f64x2);
|
|
426
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_17_f64x2);
|
|
427
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_16_f64x2);
|
|
428
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_15_f64x2);
|
|
429
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_14_f64x2);
|
|
430
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_13_f64x2);
|
|
431
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_12_f64x2);
|
|
432
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_11_f64x2);
|
|
433
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_10_f64x2);
|
|
434
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_9_f64x2);
|
|
435
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_8_f64x2);
|
|
436
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_7_f64x2);
|
|
437
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_6_f64x2);
|
|
438
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_5_f64x2);
|
|
439
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_4_f64x2);
|
|
440
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_3_f64x2);
|
|
441
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_2_f64x2);
|
|
442
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, values_squared_f64x2, coeff_1_f64x2);
|
|
443
|
+
|
|
444
|
+
// Compute result_f64x2
|
|
445
|
+
v128_t result_f64x2 = wasm_f64x2_relaxed_madd(values_cubed_f64x2, polynomials_f64x2, values_f64x2);
|
|
446
|
+
|
|
447
|
+
// Adjust for reciprocal: result_f64x2 = pi/2 - result_f64x2
|
|
448
|
+
v128_t adjusted_f64x2 = wasm_f64x2_sub(half_pi_f64x2, result_f64x2);
|
|
448
449
|
// relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
|
|
449
|
-
// Safe because mask is from comparison (all-ones or all-
|
|
450
|
-
|
|
450
|
+
// Safe because mask is from comparison (all-ones or all-zeros_f64x2 per lane).
|
|
451
|
+
result_f64x2 = wasm_i64x2_relaxed_laneselect(adjusted_f64x2, result_f64x2, reciprocal_mask_f64x2);
|
|
451
452
|
|
|
452
|
-
// Adjust for negative:
|
|
453
|
-
v128_t
|
|
453
|
+
// Adjust for negative: result_f64x2 = -result_f64x2
|
|
454
|
+
v128_t negated_f64x2 = wasm_f64x2_neg(result_f64x2);
|
|
454
455
|
// relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
|
|
455
|
-
// Safe because mask is from comparison (all-ones or all-
|
|
456
|
-
|
|
457
|
-
return
|
|
456
|
+
// Safe because mask is from comparison (all-ones or all-zeros_f64x2 per lane).
|
|
457
|
+
result_f64x2 = wasm_i64x2_relaxed_laneselect(negated_f64x2, result_f64x2, negative_mask_f64x2);
|
|
458
|
+
return result_f64x2;
|
|
458
459
|
}
|
|
459
460
|
|
|
460
461
|
NK_INTERNAL v128_t nk_f64x2_atan2_v128relaxed_(v128_t const ys_inputs, v128_t const xs_inputs) {
|
|
461
462
|
// Polynomial coefficients (same as atan)
|
|
462
|
-
v128_t const
|
|
463
|
-
v128_t const
|
|
464
|
-
v128_t const
|
|
465
|
-
v128_t const
|
|
466
|
-
v128_t const
|
|
467
|
-
v128_t const
|
|
468
|
-
v128_t const
|
|
469
|
-
v128_t const
|
|
470
|
-
v128_t const
|
|
471
|
-
v128_t const
|
|
472
|
-
v128_t const
|
|
473
|
-
v128_t const
|
|
474
|
-
v128_t const
|
|
475
|
-
v128_t const
|
|
476
|
-
v128_t const
|
|
477
|
-
v128_t const
|
|
478
|
-
v128_t const
|
|
479
|
-
v128_t const
|
|
480
|
-
v128_t const
|
|
481
|
-
v128_t const
|
|
482
|
-
v128_t const
|
|
483
|
-
v128_t const
|
|
463
|
+
v128_t const coeff_19_f64x2 = wasm_f64x2_splat(-1.88796008463073496563746e-05);
|
|
464
|
+
v128_t const coeff_18_f64x2 = wasm_f64x2_splat(+0.000209850076645816976906797);
|
|
465
|
+
v128_t const coeff_17_f64x2 = wasm_f64x2_splat(-0.00110611831486672482563471);
|
|
466
|
+
v128_t const coeff_16_f64x2 = wasm_f64x2_splat(+0.00370026744188713119232403);
|
|
467
|
+
v128_t const coeff_15_f64x2 = wasm_f64x2_splat(-0.00889896195887655491740809);
|
|
468
|
+
v128_t const coeff_14_f64x2 = wasm_f64x2_splat(+0.016599329773529201970117);
|
|
469
|
+
v128_t const coeff_13_f64x2 = wasm_f64x2_splat(-0.0254517624932312641616861);
|
|
470
|
+
v128_t const coeff_12_f64x2 = wasm_f64x2_splat(+0.0337852580001353069993897);
|
|
471
|
+
v128_t const coeff_11_f64x2 = wasm_f64x2_splat(-0.0407629191276836500001934);
|
|
472
|
+
v128_t const coeff_10_f64x2 = wasm_f64x2_splat(+0.0466667150077840625632675);
|
|
473
|
+
v128_t const coeff_9_f64x2 = wasm_f64x2_splat(-0.0523674852303482457616113);
|
|
474
|
+
v128_t const coeff_8_f64x2 = wasm_f64x2_splat(+0.0587666392926673580854313);
|
|
475
|
+
v128_t const coeff_7_f64x2 = wasm_f64x2_splat(-0.0666573579361080525984562);
|
|
476
|
+
v128_t const coeff_6_f64x2 = wasm_f64x2_splat(+0.0769219538311769618355029);
|
|
477
|
+
v128_t const coeff_5_f64x2 = wasm_f64x2_splat(-0.090908995008245008229153);
|
|
478
|
+
v128_t const coeff_4_f64x2 = wasm_f64x2_splat(+0.111111105648261418443745);
|
|
479
|
+
v128_t const coeff_3_f64x2 = wasm_f64x2_splat(-0.14285714266771329383765);
|
|
480
|
+
v128_t const coeff_2_f64x2 = wasm_f64x2_splat(+0.199999999996591265594148);
|
|
481
|
+
v128_t const coeff_1_f64x2 = wasm_f64x2_splat(-0.333333333333311110369124);
|
|
482
|
+
v128_t const pi_f64x2 = wasm_f64x2_splat(3.14159265358979323846);
|
|
483
|
+
v128_t const half_pi_f64x2 = wasm_f64x2_splat(1.5707963267948966);
|
|
484
|
+
v128_t const zeros_f64x2 = wasm_f64x2_splat(0);
|
|
484
485
|
|
|
485
486
|
// Quadrant adjustments - take absolute values
|
|
486
|
-
v128_t
|
|
487
|
-
v128_t
|
|
488
|
-
v128_t
|
|
487
|
+
v128_t xs_negative_mask_f64x2 = wasm_f64x2_lt(xs_inputs, zeros_f64x2);
|
|
488
|
+
v128_t xs_f64x2 = wasm_f64x2_abs(xs_inputs);
|
|
489
|
+
v128_t ys_f64x2 = wasm_f64x2_abs(ys_inputs);
|
|
489
490
|
|
|
490
491
|
// Ensure proper fraction where numerator < denominator
|
|
491
|
-
v128_t
|
|
492
|
-
v128_t
|
|
492
|
+
v128_t swap_mask_f64x2 = wasm_f64x2_gt(ys_f64x2, xs_f64x2);
|
|
493
|
+
v128_t temps_f64x2 = xs_f64x2;
|
|
493
494
|
// relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
|
|
494
|
-
// Safe because mask is from comparison (all-ones or all-
|
|
495
|
-
|
|
496
|
-
|
|
495
|
+
// Safe because mask is from comparison (all-ones or all-zeros_f64x2 per lane).
|
|
496
|
+
xs_f64x2 = wasm_i64x2_relaxed_laneselect(ys_f64x2, xs_f64x2, swap_mask_f64x2);
|
|
497
|
+
ys_f64x2 = wasm_i64x2_relaxed_laneselect(wasm_f64x2_neg(temps_f64x2), ys_f64x2, swap_mask_f64x2);
|
|
497
498
|
|
|
498
499
|
// Division for f64 precision
|
|
499
|
-
v128_t const
|
|
500
|
-
v128_t const
|
|
501
|
-
v128_t const
|
|
500
|
+
v128_t const ratio_f64x2 = wasm_f64x2_div(ys_f64x2, xs_f64x2);
|
|
501
|
+
v128_t const ratio_squared_f64x2 = wasm_f64x2_mul(ratio_f64x2, ratio_f64x2);
|
|
502
|
+
v128_t const ratio_cubed_f64x2 = wasm_f64x2_mul(ratio_f64x2, ratio_squared_f64x2);
|
|
502
503
|
|
|
503
504
|
// Polynomial evaluation using Horner's method
|
|
504
|
-
v128_t
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
505
|
+
v128_t polynomials_f64x2 = coeff_19_f64x2;
|
|
506
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_18_f64x2);
|
|
507
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_17_f64x2);
|
|
508
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_16_f64x2);
|
|
509
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_15_f64x2);
|
|
510
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_14_f64x2);
|
|
511
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_13_f64x2);
|
|
512
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_12_f64x2);
|
|
513
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_11_f64x2);
|
|
514
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_10_f64x2);
|
|
515
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_9_f64x2);
|
|
516
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_8_f64x2);
|
|
517
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_7_f64x2);
|
|
518
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_6_f64x2);
|
|
519
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_5_f64x2);
|
|
520
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_4_f64x2);
|
|
521
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_3_f64x2);
|
|
522
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_2_f64x2);
|
|
523
|
+
polynomials_f64x2 = wasm_f64x2_relaxed_madd(polynomials_f64x2, ratio_squared_f64x2, coeff_1_f64x2);
|
|
523
524
|
|
|
524
525
|
// Compute the result
|
|
525
|
-
v128_t
|
|
526
|
+
v128_t results_f64x2 = wasm_f64x2_relaxed_madd(ratio_cubed_f64x2, polynomials_f64x2, ratio_f64x2);
|
|
526
527
|
|
|
527
|
-
// Compute
|
|
528
|
+
// Compute quadrant_f64x2 value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
|
|
528
529
|
// -2 for x<0 && !swap, -1 for x<0 && swap
|
|
529
|
-
v128_t
|
|
530
|
-
v128_t
|
|
530
|
+
v128_t quadrant_f64x2 = wasm_f64x2_splat(0.0);
|
|
531
|
+
v128_t neg_two_f64x2 = wasm_f64x2_splat(-2.0);
|
|
531
532
|
// relaxed_laneselect: 1 instruction (vblendvpd) vs 3 (vpand+vpandn+vpor) on x86.
|
|
532
|
-
// Safe because mask is from comparison (all-ones or all-
|
|
533
|
-
|
|
534
|
-
v128_t
|
|
535
|
-
|
|
533
|
+
// Safe because mask is from comparison (all-ones or all-zeros_f64x2 per lane).
|
|
534
|
+
quadrant_f64x2 = wasm_i64x2_relaxed_laneselect(neg_two_f64x2, quadrant_f64x2, xs_negative_mask_f64x2);
|
|
535
|
+
v128_t quadrant_incremented_f64x2 = wasm_f64x2_add(quadrant_f64x2, wasm_f64x2_splat(1.0));
|
|
536
|
+
quadrant_f64x2 = wasm_i64x2_relaxed_laneselect(quadrant_incremented_f64x2, quadrant_f64x2, swap_mask_f64x2);
|
|
536
537
|
|
|
537
|
-
// Adjust for
|
|
538
|
-
|
|
538
|
+
// Adjust for quadrant_f64x2: result += quadrant_f64x2 * pi_f64x2/2
|
|
539
|
+
results_f64x2 = wasm_f64x2_relaxed_madd(quadrant_f64x2, half_pi_f64x2, results_f64x2);
|
|
539
540
|
|
|
540
541
|
// Transfer sign from x and y by XOR with sign bits
|
|
541
|
-
v128_t
|
|
542
|
-
v128_t
|
|
543
|
-
v128_t
|
|
544
|
-
|
|
545
|
-
|
|
542
|
+
v128_t sign_mask_f64x2 = wasm_f64x2_splat(-0.0);
|
|
543
|
+
v128_t xs_sign_f64x2 = wasm_v128_and(xs_inputs, sign_mask_f64x2);
|
|
544
|
+
v128_t ys_sign_f64x2 = wasm_v128_and(ys_inputs, sign_mask_f64x2);
|
|
545
|
+
results_f64x2 = wasm_v128_xor(results_f64x2, xs_sign_f64x2);
|
|
546
|
+
results_f64x2 = wasm_v128_xor(results_f64x2, ys_sign_f64x2);
|
|
546
547
|
|
|
547
|
-
return
|
|
548
|
+
return results_f64x2;
|
|
548
549
|
}
|
|
549
550
|
|
|
550
551
|
/* NK_PUBLIC wrappers — same loop+tail pattern as neon.h.
|
|
@@ -555,9 +556,9 @@ NK_INTERNAL v128_t nk_f64x2_atan2_v128relaxed_(v128_t const ys_inputs, v128_t co
|
|
|
555
556
|
NK_PUBLIC void nk_each_sin_f32_v128relaxed(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
|
|
556
557
|
nk_size_t i = 0;
|
|
557
558
|
for (; i + 4 <= n; i += 4) {
|
|
558
|
-
v128_t
|
|
559
|
-
v128_t
|
|
560
|
-
wasm_v128_store(outs + i,
|
|
559
|
+
v128_t angles_f32x4 = wasm_v128_load(ins + i);
|
|
560
|
+
v128_t results_f32x4 = nk_f32x4_sin_v128relaxed_(angles_f32x4);
|
|
561
|
+
wasm_v128_store(outs + i, results_f32x4);
|
|
561
562
|
}
|
|
562
563
|
if (i < n) {
|
|
563
564
|
nk_size_t remaining = n - i;
|
|
@@ -572,9 +573,9 @@ NK_PUBLIC void nk_each_sin_f32_v128relaxed(nk_f32_t const *ins, nk_size_t n, nk_
|
|
|
572
573
|
NK_PUBLIC void nk_each_cos_f32_v128relaxed(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
|
|
573
574
|
nk_size_t i = 0;
|
|
574
575
|
for (; i + 4 <= n; i += 4) {
|
|
575
|
-
v128_t
|
|
576
|
-
v128_t
|
|
577
|
-
wasm_v128_store(outs + i,
|
|
576
|
+
v128_t angles_f32x4 = wasm_v128_load(ins + i);
|
|
577
|
+
v128_t results_f32x4 = nk_f32x4_cos_v128relaxed_(angles_f32x4);
|
|
578
|
+
wasm_v128_store(outs + i, results_f32x4);
|
|
578
579
|
}
|
|
579
580
|
if (i < n) {
|
|
580
581
|
nk_size_t remaining = n - i;
|
|
@@ -589,9 +590,9 @@ NK_PUBLIC void nk_each_cos_f32_v128relaxed(nk_f32_t const *ins, nk_size_t n, nk_
|
|
|
589
590
|
NK_PUBLIC void nk_each_atan_f32_v128relaxed(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
|
|
590
591
|
nk_size_t i = 0;
|
|
591
592
|
for (; i + 4 <= n; i += 4) {
|
|
592
|
-
v128_t
|
|
593
|
-
v128_t
|
|
594
|
-
wasm_v128_store(outs + i,
|
|
593
|
+
v128_t values_f32x4 = wasm_v128_load(ins + i);
|
|
594
|
+
v128_t results_f32x4 = nk_f32x4_atan_v128relaxed_(values_f32x4);
|
|
595
|
+
wasm_v128_store(outs + i, results_f32x4);
|
|
595
596
|
}
|
|
596
597
|
if (i < n) {
|
|
597
598
|
nk_size_t remaining = n - i;
|
|
@@ -606,9 +607,9 @@ NK_PUBLIC void nk_each_atan_f32_v128relaxed(nk_f32_t const *ins, nk_size_t n, nk
|
|
|
606
607
|
NK_PUBLIC void nk_each_sin_f64_v128relaxed(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
|
|
607
608
|
nk_size_t i = 0;
|
|
608
609
|
for (; i + 2 <= n; i += 2) {
|
|
609
|
-
v128_t
|
|
610
|
-
v128_t
|
|
611
|
-
wasm_v128_store(outs + i,
|
|
610
|
+
v128_t angles_f64x2 = wasm_v128_load(ins + i);
|
|
611
|
+
v128_t results_f64x2 = nk_f64x2_sin_v128relaxed_(angles_f64x2);
|
|
612
|
+
wasm_v128_store(outs + i, results_f64x2);
|
|
612
613
|
}
|
|
613
614
|
if (i < n) {
|
|
614
615
|
nk_size_t remaining = n - i;
|
|
@@ -623,9 +624,9 @@ NK_PUBLIC void nk_each_sin_f64_v128relaxed(nk_f64_t const *ins, nk_size_t n, nk_
|
|
|
623
624
|
NK_PUBLIC void nk_each_cos_f64_v128relaxed(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
|
|
624
625
|
nk_size_t i = 0;
|
|
625
626
|
for (; i + 2 <= n; i += 2) {
|
|
626
|
-
v128_t
|
|
627
|
-
v128_t
|
|
628
|
-
wasm_v128_store(outs + i,
|
|
627
|
+
v128_t angles_f64x2 = wasm_v128_load(ins + i);
|
|
628
|
+
v128_t results_f64x2 = nk_f64x2_cos_v128relaxed_(angles_f64x2);
|
|
629
|
+
wasm_v128_store(outs + i, results_f64x2);
|
|
629
630
|
}
|
|
630
631
|
if (i < n) {
|
|
631
632
|
nk_size_t remaining = n - i;
|
|
@@ -640,9 +641,9 @@ NK_PUBLIC void nk_each_cos_f64_v128relaxed(nk_f64_t const *ins, nk_size_t n, nk_
|
|
|
640
641
|
NK_PUBLIC void nk_each_atan_f64_v128relaxed(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
|
|
641
642
|
nk_size_t i = 0;
|
|
642
643
|
for (; i + 2 <= n; i += 2) {
|
|
643
|
-
v128_t
|
|
644
|
-
v128_t
|
|
645
|
-
wasm_v128_store(outs + i,
|
|
644
|
+
v128_t values_f64x2 = wasm_v128_load(ins + i);
|
|
645
|
+
v128_t results_f64x2 = nk_f64x2_atan_v128relaxed_(values_f64x2);
|
|
646
|
+
wasm_v128_store(outs + i, results_f64x2);
|
|
646
647
|
}
|
|
647
648
|
if (i < n) {
|
|
648
649
|
nk_size_t remaining = n - i;
|