numkong 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +495 -0
- package/binding.gyp +540 -0
- package/c/dispatch.h +512 -0
- package/c/dispatch_bf16.c +389 -0
- package/c/dispatch_bf16c.c +52 -0
- package/c/dispatch_e2m3.c +263 -0
- package/c/dispatch_e3m2.c +243 -0
- package/c/dispatch_e4m3.c +276 -0
- package/c/dispatch_e5m2.c +272 -0
- package/c/dispatch_f16.c +376 -0
- package/c/dispatch_f16c.c +58 -0
- package/c/dispatch_f32.c +378 -0
- package/c/dispatch_f32c.c +99 -0
- package/c/dispatch_f64.c +296 -0
- package/c/dispatch_f64c.c +98 -0
- package/c/dispatch_i16.c +96 -0
- package/c/dispatch_i32.c +89 -0
- package/c/dispatch_i4.c +150 -0
- package/c/dispatch_i64.c +86 -0
- package/c/dispatch_i8.c +289 -0
- package/c/dispatch_other.c +330 -0
- package/c/dispatch_u1.c +148 -0
- package/c/dispatch_u16.c +124 -0
- package/c/dispatch_u32.c +118 -0
- package/c/dispatch_u4.c +150 -0
- package/c/dispatch_u64.c +102 -0
- package/c/dispatch_u8.c +303 -0
- package/c/numkong.c +950 -0
- package/include/README.md +573 -0
- package/include/module.modulemap +129 -0
- package/include/numkong/attention/sapphireamx.h +1361 -0
- package/include/numkong/attention/sme.h +2066 -0
- package/include/numkong/attention.h +49 -0
- package/include/numkong/capabilities.h +748 -0
- package/include/numkong/cast/README.md +262 -0
- package/include/numkong/cast/haswell.h +975 -0
- package/include/numkong/cast/icelake.h +470 -0
- package/include/numkong/cast/neon.h +1192 -0
- package/include/numkong/cast/rvv.h +1021 -0
- package/include/numkong/cast/sapphire.h +262 -0
- package/include/numkong/cast/serial.h +2262 -0
- package/include/numkong/cast/skylake.h +856 -0
- package/include/numkong/cast/v128relaxed.h +180 -0
- package/include/numkong/cast.h +230 -0
- package/include/numkong/curved/README.md +223 -0
- package/include/numkong/curved/genoa.h +182 -0
- package/include/numkong/curved/haswell.h +276 -0
- package/include/numkong/curved/neon.h +205 -0
- package/include/numkong/curved/neonbfdot.h +212 -0
- package/include/numkong/curved/neonhalf.h +212 -0
- package/include/numkong/curved/rvv.h +305 -0
- package/include/numkong/curved/serial.h +207 -0
- package/include/numkong/curved/skylake.h +457 -0
- package/include/numkong/curved/smef64.h +506 -0
- package/include/numkong/curved.h +517 -0
- package/include/numkong/curved.hpp +144 -0
- package/include/numkong/dot/README.md +425 -0
- package/include/numkong/dot/alder.h +563 -0
- package/include/numkong/dot/genoa.h +315 -0
- package/include/numkong/dot/haswell.h +1688 -0
- package/include/numkong/dot/icelake.h +883 -0
- package/include/numkong/dot/neon.h +818 -0
- package/include/numkong/dot/neonbfdot.h +244 -0
- package/include/numkong/dot/neonfhm.h +360 -0
- package/include/numkong/dot/neonhalf.h +198 -0
- package/include/numkong/dot/neonsdot.h +508 -0
- package/include/numkong/dot/rvv.h +714 -0
- package/include/numkong/dot/rvvbb.h +72 -0
- package/include/numkong/dot/rvvbf16.h +123 -0
- package/include/numkong/dot/rvvhalf.h +129 -0
- package/include/numkong/dot/sapphire.h +141 -0
- package/include/numkong/dot/serial.h +838 -0
- package/include/numkong/dot/sierra.h +405 -0
- package/include/numkong/dot/skylake.h +1084 -0
- package/include/numkong/dot/sve.h +379 -0
- package/include/numkong/dot/svebfdot.h +74 -0
- package/include/numkong/dot/svehalf.h +123 -0
- package/include/numkong/dot/v128relaxed.h +1258 -0
- package/include/numkong/dot.h +1070 -0
- package/include/numkong/dot.hpp +94 -0
- package/include/numkong/dots/README.md +496 -0
- package/include/numkong/dots/alder.h +114 -0
- package/include/numkong/dots/genoa.h +94 -0
- package/include/numkong/dots/haswell.h +295 -0
- package/include/numkong/dots/icelake.h +171 -0
- package/include/numkong/dots/neon.h +120 -0
- package/include/numkong/dots/neonbfdot.h +58 -0
- package/include/numkong/dots/neonfhm.h +94 -0
- package/include/numkong/dots/neonhalf.h +57 -0
- package/include/numkong/dots/neonsdot.h +108 -0
- package/include/numkong/dots/rvv.h +2486 -0
- package/include/numkong/dots/sapphireamx.h +3973 -0
- package/include/numkong/dots/serial.h +2844 -0
- package/include/numkong/dots/sierra.h +97 -0
- package/include/numkong/dots/skylake.h +196 -0
- package/include/numkong/dots/sme.h +5372 -0
- package/include/numkong/dots/smebi32.h +461 -0
- package/include/numkong/dots/smef64.h +1318 -0
- package/include/numkong/dots/smehalf.h +47 -0
- package/include/numkong/dots/v128relaxed.h +294 -0
- package/include/numkong/dots.h +2804 -0
- package/include/numkong/dots.hpp +639 -0
- package/include/numkong/each/README.md +469 -0
- package/include/numkong/each/haswell.h +1658 -0
- package/include/numkong/each/icelake.h +272 -0
- package/include/numkong/each/neon.h +1104 -0
- package/include/numkong/each/neonbfdot.h +212 -0
- package/include/numkong/each/neonhalf.h +410 -0
- package/include/numkong/each/rvv.h +1121 -0
- package/include/numkong/each/sapphire.h +477 -0
- package/include/numkong/each/serial.h +260 -0
- package/include/numkong/each/skylake.h +1562 -0
- package/include/numkong/each.h +2146 -0
- package/include/numkong/each.hpp +434 -0
- package/include/numkong/geospatial/README.md +147 -0
- package/include/numkong/geospatial/haswell.h +593 -0
- package/include/numkong/geospatial/neon.h +571 -0
- package/include/numkong/geospatial/rvv.h +701 -0
- package/include/numkong/geospatial/serial.h +309 -0
- package/include/numkong/geospatial/skylake.h +577 -0
- package/include/numkong/geospatial/v128relaxed.h +613 -0
- package/include/numkong/geospatial.h +453 -0
- package/include/numkong/geospatial.hpp +235 -0
- package/include/numkong/matrix.hpp +336 -0
- package/include/numkong/maxsim/README.md +187 -0
- package/include/numkong/maxsim/alder.h +511 -0
- package/include/numkong/maxsim/genoa.h +115 -0
- package/include/numkong/maxsim/haswell.h +553 -0
- package/include/numkong/maxsim/icelake.h +480 -0
- package/include/numkong/maxsim/neonsdot.h +394 -0
- package/include/numkong/maxsim/sapphireamx.h +877 -0
- package/include/numkong/maxsim/serial.h +490 -0
- package/include/numkong/maxsim/sme.h +929 -0
- package/include/numkong/maxsim/v128relaxed.h +280 -0
- package/include/numkong/maxsim.h +571 -0
- package/include/numkong/maxsim.hpp +133 -0
- package/include/numkong/mesh/README.md +227 -0
- package/include/numkong/mesh/haswell.h +2235 -0
- package/include/numkong/mesh/neon.h +1329 -0
- package/include/numkong/mesh/neonbfdot.h +842 -0
- package/include/numkong/mesh/neonhalf.h +616 -0
- package/include/numkong/mesh/rvv.h +916 -0
- package/include/numkong/mesh/serial.h +742 -0
- package/include/numkong/mesh/skylake.h +1135 -0
- package/include/numkong/mesh/v128relaxed.h +1052 -0
- package/include/numkong/mesh.h +652 -0
- package/include/numkong/mesh.hpp +762 -0
- package/include/numkong/numkong.h +78 -0
- package/include/numkong/numkong.hpp +57 -0
- package/include/numkong/probability/README.md +173 -0
- package/include/numkong/probability/haswell.h +267 -0
- package/include/numkong/probability/neon.h +225 -0
- package/include/numkong/probability/rvv.h +409 -0
- package/include/numkong/probability/serial.h +169 -0
- package/include/numkong/probability/skylake.h +324 -0
- package/include/numkong/probability.h +383 -0
- package/include/numkong/probability.hpp +120 -0
- package/include/numkong/random.h +50 -0
- package/include/numkong/random.hpp +285 -0
- package/include/numkong/reduce/README.md +547 -0
- package/include/numkong/reduce/alder.h +632 -0
- package/include/numkong/reduce/genoa.h +201 -0
- package/include/numkong/reduce/haswell.h +3783 -0
- package/include/numkong/reduce/icelake.h +549 -0
- package/include/numkong/reduce/neon.h +3841 -0
- package/include/numkong/reduce/neonbfdot.h +353 -0
- package/include/numkong/reduce/neonfhm.h +665 -0
- package/include/numkong/reduce/neonhalf.h +157 -0
- package/include/numkong/reduce/neonsdot.h +357 -0
- package/include/numkong/reduce/rvv.h +3407 -0
- package/include/numkong/reduce/serial.h +757 -0
- package/include/numkong/reduce/sierra.h +338 -0
- package/include/numkong/reduce/skylake.h +3792 -0
- package/include/numkong/reduce/v128relaxed.h +2302 -0
- package/include/numkong/reduce.h +1597 -0
- package/include/numkong/reduce.hpp +633 -0
- package/include/numkong/scalar/README.md +89 -0
- package/include/numkong/scalar/haswell.h +113 -0
- package/include/numkong/scalar/neon.h +122 -0
- package/include/numkong/scalar/neonhalf.h +70 -0
- package/include/numkong/scalar/rvv.h +211 -0
- package/include/numkong/scalar/sapphire.h +63 -0
- package/include/numkong/scalar/serial.h +332 -0
- package/include/numkong/scalar/v128relaxed.h +56 -0
- package/include/numkong/scalar.h +683 -0
- package/include/numkong/set/README.md +179 -0
- package/include/numkong/set/haswell.h +334 -0
- package/include/numkong/set/icelake.h +485 -0
- package/include/numkong/set/neon.h +364 -0
- package/include/numkong/set/rvv.h +226 -0
- package/include/numkong/set/rvvbb.h +117 -0
- package/include/numkong/set/serial.h +174 -0
- package/include/numkong/set/sve.h +185 -0
- package/include/numkong/set/v128relaxed.h +240 -0
- package/include/numkong/set.h +457 -0
- package/include/numkong/set.hpp +114 -0
- package/include/numkong/sets/README.md +149 -0
- package/include/numkong/sets/haswell.h +63 -0
- package/include/numkong/sets/icelake.h +66 -0
- package/include/numkong/sets/neon.h +61 -0
- package/include/numkong/sets/serial.h +43 -0
- package/include/numkong/sets/smebi32.h +1099 -0
- package/include/numkong/sets/v128relaxed.h +58 -0
- package/include/numkong/sets.h +339 -0
- package/include/numkong/sparse/README.md +156 -0
- package/include/numkong/sparse/icelake.h +463 -0
- package/include/numkong/sparse/neon.h +288 -0
- package/include/numkong/sparse/serial.h +117 -0
- package/include/numkong/sparse/sve2.h +507 -0
- package/include/numkong/sparse/turin.h +322 -0
- package/include/numkong/sparse.h +363 -0
- package/include/numkong/sparse.hpp +113 -0
- package/include/numkong/spatial/README.md +435 -0
- package/include/numkong/spatial/alder.h +607 -0
- package/include/numkong/spatial/genoa.h +290 -0
- package/include/numkong/spatial/haswell.h +960 -0
- package/include/numkong/spatial/icelake.h +586 -0
- package/include/numkong/spatial/neon.h +773 -0
- package/include/numkong/spatial/neonbfdot.h +165 -0
- package/include/numkong/spatial/neonhalf.h +118 -0
- package/include/numkong/spatial/neonsdot.h +261 -0
- package/include/numkong/spatial/rvv.h +984 -0
- package/include/numkong/spatial/rvvbf16.h +123 -0
- package/include/numkong/spatial/rvvhalf.h +117 -0
- package/include/numkong/spatial/sapphire.h +343 -0
- package/include/numkong/spatial/serial.h +346 -0
- package/include/numkong/spatial/sierra.h +323 -0
- package/include/numkong/spatial/skylake.h +606 -0
- package/include/numkong/spatial/sve.h +224 -0
- package/include/numkong/spatial/svebfdot.h +122 -0
- package/include/numkong/spatial/svehalf.h +109 -0
- package/include/numkong/spatial/v128relaxed.h +717 -0
- package/include/numkong/spatial.h +1425 -0
- package/include/numkong/spatial.hpp +183 -0
- package/include/numkong/spatials/README.md +580 -0
- package/include/numkong/spatials/alder.h +94 -0
- package/include/numkong/spatials/genoa.h +94 -0
- package/include/numkong/spatials/haswell.h +219 -0
- package/include/numkong/spatials/icelake.h +113 -0
- package/include/numkong/spatials/neon.h +109 -0
- package/include/numkong/spatials/neonbfdot.h +60 -0
- package/include/numkong/spatials/neonfhm.h +92 -0
- package/include/numkong/spatials/neonhalf.h +58 -0
- package/include/numkong/spatials/neonsdot.h +109 -0
- package/include/numkong/spatials/rvv.h +1960 -0
- package/include/numkong/spatials/sapphireamx.h +1149 -0
- package/include/numkong/spatials/serial.h +226 -0
- package/include/numkong/spatials/sierra.h +96 -0
- package/include/numkong/spatials/skylake.h +184 -0
- package/include/numkong/spatials/sme.h +1901 -0
- package/include/numkong/spatials/smef64.h +465 -0
- package/include/numkong/spatials/v128relaxed.h +240 -0
- package/include/numkong/spatials.h +3021 -0
- package/include/numkong/spatials.hpp +508 -0
- package/include/numkong/tensor.hpp +1592 -0
- package/include/numkong/trigonometry/README.md +184 -0
- package/include/numkong/trigonometry/haswell.h +652 -0
- package/include/numkong/trigonometry/neon.h +639 -0
- package/include/numkong/trigonometry/rvv.h +699 -0
- package/include/numkong/trigonometry/serial.h +703 -0
- package/include/numkong/trigonometry/skylake.h +721 -0
- package/include/numkong/trigonometry/v128relaxed.h +666 -0
- package/include/numkong/trigonometry.h +467 -0
- package/include/numkong/trigonometry.hpp +166 -0
- package/include/numkong/types.h +1384 -0
- package/include/numkong/types.hpp +5603 -0
- package/include/numkong/vector.hpp +698 -0
- package/javascript/README.md +246 -0
- package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
- package/javascript/dist/cjs/numkong-wasm.js +617 -0
- package/javascript/dist/cjs/numkong.d.ts +343 -0
- package/javascript/dist/cjs/numkong.js +523 -0
- package/javascript/dist/cjs/package.json +3 -0
- package/javascript/dist/cjs/types.d.ts +284 -0
- package/javascript/dist/cjs/types.js +653 -0
- package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
- package/javascript/dist/esm/numkong-wasm.js +595 -0
- package/javascript/dist/esm/numkong.d.ts +343 -0
- package/javascript/dist/esm/numkong.js +452 -0
- package/javascript/dist/esm/package.json +3 -0
- package/javascript/dist/esm/types.d.ts +284 -0
- package/javascript/dist/esm/types.js +630 -0
- package/javascript/dist-package-cjs.json +3 -0
- package/javascript/dist-package-esm.json +3 -0
- package/javascript/node-gyp-build.d.ts +1 -0
- package/javascript/numkong-wasm.ts +756 -0
- package/javascript/numkong.c +689 -0
- package/javascript/numkong.ts +575 -0
- package/javascript/tsconfig-base.json +39 -0
- package/javascript/tsconfig-cjs.json +8 -0
- package/javascript/tsconfig-esm.json +8 -0
- package/javascript/types.ts +674 -0
- package/package.json +87 -0
|
@@ -0,0 +1,683 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief SIMD-accelerated Scalar Math Helpers.
|
|
3
|
+
* @file include/numkong/scalar.h
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date March 1, 2026
|
|
6
|
+
*
|
|
7
|
+
* Provides dispatchable scalar helpers: sqrt, rsqrt, fma, saturating arithmetic,
|
|
8
|
+
* and ordering. Each ISA file is header-only with
|
|
9
|
+
* `NK_PUBLIC static inline` implementations; compile-time dispatch selects the
|
|
10
|
+
* best available backend when `NK_DYNAMIC_DISPATCH` is off.
|
|
11
|
+
*
|
|
12
|
+
* For hardware architectures:
|
|
13
|
+
*
|
|
14
|
+
* - Serial: software-emulated (Quake 3 rsqrt, bit-manipulation casts)
|
|
15
|
+
* - Arm: NEON (sqrt, rsqrt, fma, saturating_add, saturating_mul), NEONHalf (f16 sqrt, rsqrt, fma)
|
|
16
|
+
* - x86: Haswell (sqrt, rsqrt, fma, saturating_add, saturating_mul), Sapphire (f16 sqrt, rsqrt, fma, order)
|
|
17
|
+
* - RISC-V: RVV (sqrt, rsqrt via vfrsqrt7 + Newton-Raphson, fma, saturating_add, saturating_mul)
|
|
18
|
+
* - WASM: V128Relaxed (sqrt)
|
|
19
|
+
*/
|
|
20
|
+
#ifndef NK_SCALAR_H
|
|
21
|
+
#define NK_SCALAR_H
|
|
22
|
+
|
|
23
|
+
#include "numkong/types.h"
|
|
24
|
+
|
|
25
|
+
#if defined(__cplusplus)
|
|
26
|
+
extern "C" {
|
|
27
|
+
#endif
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* @brief Scalar square root: `√x`.
|
|
31
|
+
*
|
|
32
|
+
* @param[in] x The input value.
|
|
33
|
+
* @return The square root of @p x.
|
|
34
|
+
*/
|
|
35
|
+
NK_DYNAMIC nk_f32_t nk_f32_sqrt(nk_f32_t x);
|
|
36
|
+
/** @copydoc nk_f32_sqrt */
|
|
37
|
+
NK_DYNAMIC nk_f64_t nk_f64_sqrt(nk_f64_t x);
|
|
38
|
+
|
|
39
|
+
/**
|
|
40
|
+
* @brief Scalar reciprocal square root: `1/√x`.
|
|
41
|
+
* @sa std::sqrt, Rust f32::sqrt (neither standard library ships a dedicated rsqrt)
|
|
42
|
+
*
|
|
43
|
+
* @param[in] x The input value.
|
|
44
|
+
* @return The reciprocal square root of @p x.
|
|
45
|
+
*/
|
|
46
|
+
NK_DYNAMIC nk_f32_t nk_f32_rsqrt(nk_f32_t x);
|
|
47
|
+
/** @copydoc nk_f32_rsqrt */
|
|
48
|
+
NK_DYNAMIC nk_f64_t nk_f64_rsqrt(nk_f64_t x);
|
|
49
|
+
|
|
50
|
+
/**
|
|
51
|
+
* @brief Scalar fused multiply-add: `a × b + c`.
|
|
52
|
+
* @sa std::fma, Rust f32::mul_add
|
|
53
|
+
*
|
|
54
|
+
* @param[in] a Multiplicand.
|
|
55
|
+
* @param[in] b Multiplier.
|
|
56
|
+
* @param[in] c Addend.
|
|
57
|
+
* @return `a * b + c` computed without intermediate rounding.
|
|
58
|
+
*/
|
|
59
|
+
NK_DYNAMIC nk_f32_t nk_f32_fma(nk_f32_t a, nk_f32_t b, nk_f32_t c);
|
|
60
|
+
/** @copydoc nk_f32_fma */
|
|
61
|
+
NK_DYNAMIC nk_f64_t nk_f64_fma(nk_f64_t a, nk_f64_t b, nk_f64_t c);
|
|
62
|
+
|
|
63
|
+
/** @copydoc nk_f32_sqrt */
|
|
64
|
+
NK_DYNAMIC nk_f16_t nk_f16_sqrt(nk_f16_t x);
|
|
65
|
+
/** @copydoc nk_f32_rsqrt */
|
|
66
|
+
NK_DYNAMIC nk_f16_t nk_f16_rsqrt(nk_f16_t x);
|
|
67
|
+
/** @copydoc nk_f32_fma */
|
|
68
|
+
NK_DYNAMIC nk_f16_t nk_f16_fma(nk_f16_t a, nk_f16_t b, nk_f16_t c);
|
|
69
|
+
|
|
70
|
+
/**
|
|
71
|
+
* @brief Saturating addition clamped to the representable range of the type.
|
|
72
|
+
*
|
|
73
|
+
* @param[in] a First operand.
|
|
74
|
+
* @param[in] b Second operand.
|
|
75
|
+
* @return `clamp(a + b, MIN, MAX)`.
|
|
76
|
+
*/
|
|
77
|
+
NK_DYNAMIC nk_u8_t nk_u8_saturating_add(nk_u8_t a, nk_u8_t b);
|
|
78
|
+
/** @copydoc nk_u8_saturating_add */
|
|
79
|
+
NK_DYNAMIC nk_i8_t nk_i8_saturating_add(nk_i8_t a, nk_i8_t b);
|
|
80
|
+
/** @copydoc nk_u8_saturating_add */
|
|
81
|
+
NK_DYNAMIC nk_u16_t nk_u16_saturating_add(nk_u16_t a, nk_u16_t b);
|
|
82
|
+
/** @copydoc nk_u8_saturating_add */
|
|
83
|
+
NK_DYNAMIC nk_i16_t nk_i16_saturating_add(nk_i16_t a, nk_i16_t b);
|
|
84
|
+
/** @copydoc nk_u8_saturating_add */
|
|
85
|
+
NK_DYNAMIC nk_u32_t nk_u32_saturating_add(nk_u32_t a, nk_u32_t b);
|
|
86
|
+
/** @copydoc nk_u8_saturating_add */
|
|
87
|
+
NK_DYNAMIC nk_i32_t nk_i32_saturating_add(nk_i32_t a, nk_i32_t b);
|
|
88
|
+
/** @copydoc nk_u8_saturating_add */
|
|
89
|
+
NK_DYNAMIC nk_u64_t nk_u64_saturating_add(nk_u64_t a, nk_u64_t b);
|
|
90
|
+
/** @copydoc nk_u8_saturating_add */
|
|
91
|
+
NK_DYNAMIC nk_i64_t nk_i64_saturating_add(nk_i64_t a, nk_i64_t b);
|
|
92
|
+
/** @copydoc nk_u8_saturating_add */
|
|
93
|
+
NK_DYNAMIC nk_i4x2_t nk_i4x2_saturating_add(nk_i4x2_t a, nk_i4x2_t b);
|
|
94
|
+
/** @copydoc nk_u8_saturating_add */
|
|
95
|
+
NK_DYNAMIC nk_u4x2_t nk_u4x2_saturating_add(nk_u4x2_t a, nk_u4x2_t b);
|
|
96
|
+
|
|
97
|
+
/**
|
|
98
|
+
* @brief Saturating multiplication clamped to the representable range of the type.
|
|
99
|
+
*
|
|
100
|
+
* @param[in] a First operand.
|
|
101
|
+
* @param[in] b Second operand.
|
|
102
|
+
* @return `clamp(a * b, MIN, MAX)`.
|
|
103
|
+
*/
|
|
104
|
+
NK_DYNAMIC nk_u8_t nk_u8_saturating_mul(nk_u8_t a, nk_u8_t b);
|
|
105
|
+
/** @copydoc nk_u8_saturating_mul */
|
|
106
|
+
NK_DYNAMIC nk_i8_t nk_i8_saturating_mul(nk_i8_t a, nk_i8_t b);
|
|
107
|
+
/** @copydoc nk_u8_saturating_mul */
|
|
108
|
+
NK_DYNAMIC nk_u16_t nk_u16_saturating_mul(nk_u16_t a, nk_u16_t b);
|
|
109
|
+
/** @copydoc nk_u8_saturating_mul */
|
|
110
|
+
NK_DYNAMIC nk_i16_t nk_i16_saturating_mul(nk_i16_t a, nk_i16_t b);
|
|
111
|
+
/** @copydoc nk_u8_saturating_mul */
|
|
112
|
+
NK_DYNAMIC nk_u32_t nk_u32_saturating_mul(nk_u32_t a, nk_u32_t b);
|
|
113
|
+
/** @copydoc nk_u8_saturating_mul */
|
|
114
|
+
NK_DYNAMIC nk_i32_t nk_i32_saturating_mul(nk_i32_t a, nk_i32_t b);
|
|
115
|
+
/** @copydoc nk_u8_saturating_mul */
|
|
116
|
+
NK_DYNAMIC nk_u64_t nk_u64_saturating_mul(nk_u64_t a, nk_u64_t b);
|
|
117
|
+
/** @copydoc nk_u8_saturating_mul */
|
|
118
|
+
NK_DYNAMIC nk_i64_t nk_i64_saturating_mul(nk_i64_t a, nk_i64_t b);
|
|
119
|
+
/** @copydoc nk_u8_saturating_mul */
|
|
120
|
+
NK_DYNAMIC nk_i4x2_t nk_i4x2_saturating_mul(nk_i4x2_t a, nk_i4x2_t b);
|
|
121
|
+
/** @copydoc nk_u8_saturating_mul */
|
|
122
|
+
NK_DYNAMIC nk_u4x2_t nk_u4x2_saturating_mul(nk_u4x2_t a, nk_u4x2_t b);
|
|
123
|
+
|
|
124
|
+
/**
|
|
125
|
+
* @brief Branchless sign-magnitude ordering for non-native floating-point scalars.
|
|
126
|
+
* @sa std::strong_order, Rust total_cmp
|
|
127
|
+
*
|
|
128
|
+
* Uses `mask = -sign; ordered = value ^ mask` — the constant offset cancels in subtraction.
|
|
129
|
+
* Returns negative if a < b, 0 if equal, positive if a > b.
|
|
130
|
+
*
|
|
131
|
+
* @param[in] a First operand.
|
|
132
|
+
* @param[in] b Second operand.
|
|
133
|
+
* @return Negative if `a < b`, zero if `a == b`, positive if `a > b`.
|
|
134
|
+
*
|
|
135
|
+
* @note NaN values are ordered at the extremes per IEEE 754 totalOrder
|
|
136
|
+
* (negative NaN < all non-NaN values < positive NaN). Callers requiring NaN-exclusion
|
|
137
|
+
* semantics must filter NaN before calling.
|
|
138
|
+
*/
|
|
139
|
+
NK_DYNAMIC int nk_f16_order(nk_f16_t a, nk_f16_t b);
|
|
140
|
+
/** @copydoc nk_f16_order */
|
|
141
|
+
NK_DYNAMIC int nk_bf16_order(nk_bf16_t a, nk_bf16_t b);
|
|
142
|
+
/** @copydoc nk_f16_order */
|
|
143
|
+
NK_DYNAMIC int nk_e4m3_order(nk_e4m3_t a, nk_e4m3_t b);
|
|
144
|
+
/** @copydoc nk_f16_order */
|
|
145
|
+
NK_DYNAMIC int nk_e5m2_order(nk_e5m2_t a, nk_e5m2_t b);
|
|
146
|
+
/** @copydoc nk_f16_order */
|
|
147
|
+
NK_DYNAMIC int nk_e2m3_order(nk_e2m3_t a, nk_e2m3_t b);
|
|
148
|
+
/** @copydoc nk_f16_order */
|
|
149
|
+
NK_DYNAMIC int nk_e3m2_order(nk_e3m2_t a, nk_e3m2_t b);
|
|
150
|
+
|
|
151
|
+
/** @copydoc nk_f32_sqrt */
|
|
152
|
+
NK_PUBLIC nk_f32_t nk_f32_sqrt_serial(nk_f32_t x);
|
|
153
|
+
/** @copydoc nk_f64_sqrt */
|
|
154
|
+
NK_PUBLIC nk_f64_t nk_f64_sqrt_serial(nk_f64_t x);
|
|
155
|
+
/** @copydoc nk_f32_rsqrt */
|
|
156
|
+
NK_PUBLIC nk_f32_t nk_f32_rsqrt_serial(nk_f32_t x);
|
|
157
|
+
/** @copydoc nk_f64_rsqrt */
|
|
158
|
+
NK_PUBLIC nk_f64_t nk_f64_rsqrt_serial(nk_f64_t x);
|
|
159
|
+
/** @copydoc nk_f32_fma */
|
|
160
|
+
NK_PUBLIC nk_f32_t nk_f32_fma_serial(nk_f32_t a, nk_f32_t b, nk_f32_t c);
|
|
161
|
+
/** @copydoc nk_f64_fma */
|
|
162
|
+
NK_PUBLIC nk_f64_t nk_f64_fma_serial(nk_f64_t a, nk_f64_t b, nk_f64_t c);
|
|
163
|
+
|
|
164
|
+
/** @copydoc nk_f16_sqrt */
|
|
165
|
+
NK_PUBLIC nk_f16_t nk_f16_sqrt_serial(nk_f16_t x);
|
|
166
|
+
/** @copydoc nk_f16_rsqrt */
|
|
167
|
+
NK_PUBLIC nk_f16_t nk_f16_rsqrt_serial(nk_f16_t x);
|
|
168
|
+
/** @copydoc nk_f16_fma */
|
|
169
|
+
NK_PUBLIC nk_f16_t nk_f16_fma_serial(nk_f16_t a, nk_f16_t b, nk_f16_t c);
|
|
170
|
+
|
|
171
|
+
/** @copydoc nk_u8_saturating_add */
|
|
172
|
+
NK_PUBLIC nk_u8_t nk_u8_saturating_add_serial(nk_u8_t a, nk_u8_t b);
|
|
173
|
+
/** @copydoc nk_u8_saturating_add */
|
|
174
|
+
NK_PUBLIC nk_i8_t nk_i8_saturating_add_serial(nk_i8_t a, nk_i8_t b);
|
|
175
|
+
/** @copydoc nk_u8_saturating_add */
|
|
176
|
+
NK_PUBLIC nk_u16_t nk_u16_saturating_add_serial(nk_u16_t a, nk_u16_t b);
|
|
177
|
+
/** @copydoc nk_u8_saturating_add */
|
|
178
|
+
NK_PUBLIC nk_i16_t nk_i16_saturating_add_serial(nk_i16_t a, nk_i16_t b);
|
|
179
|
+
/** @copydoc nk_u8_saturating_add */
|
|
180
|
+
NK_PUBLIC nk_u32_t nk_u32_saturating_add_serial(nk_u32_t a, nk_u32_t b);
|
|
181
|
+
/** @copydoc nk_u8_saturating_add */
|
|
182
|
+
NK_PUBLIC nk_i32_t nk_i32_saturating_add_serial(nk_i32_t a, nk_i32_t b);
|
|
183
|
+
/** @copydoc nk_u8_saturating_add */
|
|
184
|
+
NK_PUBLIC nk_u64_t nk_u64_saturating_add_serial(nk_u64_t a, nk_u64_t b);
|
|
185
|
+
/** @copydoc nk_u8_saturating_add */
|
|
186
|
+
NK_PUBLIC nk_i64_t nk_i64_saturating_add_serial(nk_i64_t a, nk_i64_t b);
|
|
187
|
+
/** @copydoc nk_u8_saturating_add */
|
|
188
|
+
NK_PUBLIC nk_i4x2_t nk_i4x2_saturating_add_serial(nk_i4x2_t a, nk_i4x2_t b);
|
|
189
|
+
/** @copydoc nk_u8_saturating_add */
|
|
190
|
+
NK_PUBLIC nk_u4x2_t nk_u4x2_saturating_add_serial(nk_u4x2_t a, nk_u4x2_t b);
|
|
191
|
+
|
|
192
|
+
/** @copydoc nk_u8_saturating_mul */
|
|
193
|
+
NK_PUBLIC nk_u8_t nk_u8_saturating_mul_serial(nk_u8_t a, nk_u8_t b);
|
|
194
|
+
/** @copydoc nk_u8_saturating_mul */
|
|
195
|
+
NK_PUBLIC nk_i8_t nk_i8_saturating_mul_serial(nk_i8_t a, nk_i8_t b);
|
|
196
|
+
/** @copydoc nk_u8_saturating_mul */
|
|
197
|
+
NK_PUBLIC nk_u16_t nk_u16_saturating_mul_serial(nk_u16_t a, nk_u16_t b);
|
|
198
|
+
/** @copydoc nk_u8_saturating_mul */
|
|
199
|
+
NK_PUBLIC nk_i16_t nk_i16_saturating_mul_serial(nk_i16_t a, nk_i16_t b);
|
|
200
|
+
/** @copydoc nk_u8_saturating_mul */
|
|
201
|
+
NK_PUBLIC nk_u32_t nk_u32_saturating_mul_serial(nk_u32_t a, nk_u32_t b);
|
|
202
|
+
/** @copydoc nk_u8_saturating_mul */
|
|
203
|
+
NK_PUBLIC nk_i32_t nk_i32_saturating_mul_serial(nk_i32_t a, nk_i32_t b);
|
|
204
|
+
/** @copydoc nk_u8_saturating_mul */
|
|
205
|
+
NK_PUBLIC nk_u64_t nk_u64_saturating_mul_serial(nk_u64_t a, nk_u64_t b);
|
|
206
|
+
/** @copydoc nk_u8_saturating_mul */
|
|
207
|
+
NK_PUBLIC nk_i64_t nk_i64_saturating_mul_serial(nk_i64_t a, nk_i64_t b);
|
|
208
|
+
/** @copydoc nk_u8_saturating_mul */
|
|
209
|
+
NK_PUBLIC nk_i4x2_t nk_i4x2_saturating_mul_serial(nk_i4x2_t a, nk_i4x2_t b);
|
|
210
|
+
/** @copydoc nk_u8_saturating_mul */
|
|
211
|
+
NK_PUBLIC nk_u4x2_t nk_u4x2_saturating_mul_serial(nk_u4x2_t a, nk_u4x2_t b);
|
|
212
|
+
|
|
213
|
+
/** @copydoc nk_f16_order */
|
|
214
|
+
NK_PUBLIC int nk_f16_order_serial(nk_f16_t a, nk_f16_t b);
|
|
215
|
+
/** @copydoc nk_f16_order */
|
|
216
|
+
NK_PUBLIC int nk_bf16_order_serial(nk_bf16_t a, nk_bf16_t b);
|
|
217
|
+
/** @copydoc nk_f16_order */
|
|
218
|
+
NK_PUBLIC int nk_e4m3_order_serial(nk_e4m3_t a, nk_e4m3_t b);
|
|
219
|
+
/** @copydoc nk_f16_order */
|
|
220
|
+
NK_PUBLIC int nk_e5m2_order_serial(nk_e5m2_t a, nk_e5m2_t b);
|
|
221
|
+
/** @copydoc nk_f16_order */
|
|
222
|
+
NK_PUBLIC int nk_e2m3_order_serial(nk_e2m3_t a, nk_e2m3_t b);
|
|
223
|
+
/** @copydoc nk_f16_order */
|
|
224
|
+
NK_PUBLIC int nk_e3m2_order_serial(nk_e3m2_t a, nk_e3m2_t b);
|
|
225
|
+
|
|
226
|
+
#if NK_TARGET_NEON
|
|
227
|
+
/** @copydoc nk_f32_sqrt */
|
|
228
|
+
NK_PUBLIC nk_f32_t nk_f32_sqrt_neon(nk_f32_t x);
|
|
229
|
+
/** @copydoc nk_f64_sqrt */
|
|
230
|
+
NK_PUBLIC nk_f64_t nk_f64_sqrt_neon(nk_f64_t x);
|
|
231
|
+
/** @copydoc nk_f32_rsqrt */
|
|
232
|
+
NK_PUBLIC nk_f32_t nk_f32_rsqrt_neon(nk_f32_t x);
|
|
233
|
+
/** @copydoc nk_f64_rsqrt */
|
|
234
|
+
NK_PUBLIC nk_f64_t nk_f64_rsqrt_neon(nk_f64_t x);
|
|
235
|
+
/** @copydoc nk_f32_fma */
|
|
236
|
+
NK_PUBLIC nk_f32_t nk_f32_fma_neon(nk_f32_t a, nk_f32_t b, nk_f32_t c);
|
|
237
|
+
/** @copydoc nk_f64_fma */
|
|
238
|
+
NK_PUBLIC nk_f64_t nk_f64_fma_neon(nk_f64_t a, nk_f64_t b, nk_f64_t c);
|
|
239
|
+
/** @copydoc nk_u8_saturating_add */
|
|
240
|
+
NK_PUBLIC nk_u8_t nk_u8_saturating_add_neon(nk_u8_t a, nk_u8_t b);
|
|
241
|
+
/** @copydoc nk_u8_saturating_add */
|
|
242
|
+
NK_PUBLIC nk_i8_t nk_i8_saturating_add_neon(nk_i8_t a, nk_i8_t b);
|
|
243
|
+
/** @copydoc nk_u8_saturating_add */
|
|
244
|
+
NK_PUBLIC nk_u16_t nk_u16_saturating_add_neon(nk_u16_t a, nk_u16_t b);
|
|
245
|
+
/** @copydoc nk_u8_saturating_add */
|
|
246
|
+
NK_PUBLIC nk_i16_t nk_i16_saturating_add_neon(nk_i16_t a, nk_i16_t b);
|
|
247
|
+
/** @copydoc nk_u8_saturating_add */
|
|
248
|
+
NK_PUBLIC nk_u32_t nk_u32_saturating_add_neon(nk_u32_t a, nk_u32_t b);
|
|
249
|
+
/** @copydoc nk_u8_saturating_add */
|
|
250
|
+
NK_PUBLIC nk_i32_t nk_i32_saturating_add_neon(nk_i32_t a, nk_i32_t b);
|
|
251
|
+
/** @copydoc nk_u8_saturating_add */
|
|
252
|
+
NK_PUBLIC nk_u64_t nk_u64_saturating_add_neon(nk_u64_t a, nk_u64_t b);
|
|
253
|
+
/** @copydoc nk_u8_saturating_add */
|
|
254
|
+
NK_PUBLIC nk_i64_t nk_i64_saturating_add_neon(nk_i64_t a, nk_i64_t b);
|
|
255
|
+
/** @copydoc nk_u8_saturating_mul */
|
|
256
|
+
NK_PUBLIC nk_u64_t nk_u64_saturating_mul_neon(nk_u64_t a, nk_u64_t b);
|
|
257
|
+
/** @copydoc nk_u8_saturating_mul */
|
|
258
|
+
NK_PUBLIC nk_i64_t nk_i64_saturating_mul_neon(nk_i64_t a, nk_i64_t b);
|
|
259
|
+
#endif // NK_TARGET_NEON
|
|
260
|
+
|
|
261
|
+
#if NK_TARGET_NEONHALF
|
|
262
|
+
/** @copydoc nk_f16_sqrt */
|
|
263
|
+
NK_PUBLIC nk_f16_t nk_f16_sqrt_neonhalf(nk_f16_t x);
|
|
264
|
+
/** @copydoc nk_f16_rsqrt */
|
|
265
|
+
NK_PUBLIC nk_f16_t nk_f16_rsqrt_neonhalf(nk_f16_t x);
|
|
266
|
+
/** @copydoc nk_f16_fma */
|
|
267
|
+
NK_PUBLIC nk_f16_t nk_f16_fma_neonhalf(nk_f16_t a, nk_f16_t b, nk_f16_t c);
|
|
268
|
+
#endif // NK_TARGET_NEONHALF
|
|
269
|
+
|
|
270
|
+
#if NK_TARGET_HASWELL
|
|
271
|
+
/** @copydoc nk_f32_sqrt */
|
|
272
|
+
NK_PUBLIC nk_f32_t nk_f32_sqrt_haswell(nk_f32_t x);
|
|
273
|
+
/** @copydoc nk_f64_sqrt */
|
|
274
|
+
NK_PUBLIC nk_f64_t nk_f64_sqrt_haswell(nk_f64_t x);
|
|
275
|
+
/** @copydoc nk_f32_rsqrt */
|
|
276
|
+
NK_PUBLIC nk_f32_t nk_f32_rsqrt_haswell(nk_f32_t x);
|
|
277
|
+
/** @copydoc nk_f64_rsqrt */
|
|
278
|
+
NK_PUBLIC nk_f64_t nk_f64_rsqrt_haswell(nk_f64_t x);
|
|
279
|
+
/** @copydoc nk_f32_fma */
|
|
280
|
+
NK_PUBLIC nk_f32_t nk_f32_fma_haswell(nk_f32_t a, nk_f32_t b, nk_f32_t c);
|
|
281
|
+
/** @copydoc nk_f64_fma */
|
|
282
|
+
NK_PUBLIC nk_f64_t nk_f64_fma_haswell(nk_f64_t a, nk_f64_t b, nk_f64_t c);
|
|
283
|
+
/** @copydoc nk_u8_saturating_add */
|
|
284
|
+
NK_PUBLIC nk_u8_t nk_u8_saturating_add_haswell(nk_u8_t a, nk_u8_t b);
|
|
285
|
+
/** @copydoc nk_u8_saturating_add */
|
|
286
|
+
NK_PUBLIC nk_i8_t nk_i8_saturating_add_haswell(nk_i8_t a, nk_i8_t b);
|
|
287
|
+
/** @copydoc nk_u8_saturating_add */
|
|
288
|
+
NK_PUBLIC nk_u16_t nk_u16_saturating_add_haswell(nk_u16_t a, nk_u16_t b);
|
|
289
|
+
/** @copydoc nk_u8_saturating_add */
|
|
290
|
+
NK_PUBLIC nk_i16_t nk_i16_saturating_add_haswell(nk_i16_t a, nk_i16_t b);
|
|
291
|
+
/** @copydoc nk_u8_saturating_mul */
|
|
292
|
+
NK_PUBLIC nk_u64_t nk_u64_saturating_mul_haswell(nk_u64_t a, nk_u64_t b);
|
|
293
|
+
/** @copydoc nk_u8_saturating_mul */
|
|
294
|
+
NK_PUBLIC nk_i64_t nk_i64_saturating_mul_haswell(nk_i64_t a, nk_i64_t b);
|
|
295
|
+
/** @copydoc nk_f16_sqrt */
|
|
296
|
+
NK_PUBLIC nk_f16_t nk_f16_sqrt_haswell(nk_f16_t x);
|
|
297
|
+
/** @copydoc nk_f16_rsqrt */
|
|
298
|
+
NK_PUBLIC nk_f16_t nk_f16_rsqrt_haswell(nk_f16_t x);
|
|
299
|
+
/** @copydoc nk_f16_fma */
|
|
300
|
+
NK_PUBLIC nk_f16_t nk_f16_fma_haswell(nk_f16_t a, nk_f16_t b, nk_f16_t c);
|
|
301
|
+
#endif // NK_TARGET_HASWELL
|
|
302
|
+
|
|
303
|
+
#if NK_TARGET_SAPPHIRE
|
|
304
|
+
/** @copydoc nk_f16_order */
|
|
305
|
+
NK_PUBLIC int nk_f16_order_sapphire(nk_f16_t a, nk_f16_t b);
|
|
306
|
+
/** @copydoc nk_f16_sqrt */
|
|
307
|
+
NK_PUBLIC nk_f16_t nk_f16_sqrt_sapphire(nk_f16_t x);
|
|
308
|
+
/** @copydoc nk_f16_rsqrt */
|
|
309
|
+
NK_PUBLIC nk_f16_t nk_f16_rsqrt_sapphire(nk_f16_t x);
|
|
310
|
+
/** @copydoc nk_f16_fma */
|
|
311
|
+
NK_PUBLIC nk_f16_t nk_f16_fma_sapphire(nk_f16_t a, nk_f16_t b, nk_f16_t c);
|
|
312
|
+
#endif // NK_TARGET_SAPPHIRE
|
|
313
|
+
|
|
314
|
+
// Declarations of the RVV backend's scalar helpers; visible only when
// `NK_TARGET_RVV` is enabled.
#if NK_TARGET_RVV

// Floating-point helpers.
/** @copydoc nk_f32_sqrt */
NK_PUBLIC nk_f32_t nk_f32_sqrt_rvv(nk_f32_t x);
/** @copydoc nk_f64_sqrt */
NK_PUBLIC nk_f64_t nk_f64_sqrt_rvv(nk_f64_t x);
/** @copydoc nk_f32_rsqrt */
NK_PUBLIC nk_f32_t nk_f32_rsqrt_rvv(nk_f32_t x);
/** @copydoc nk_f64_rsqrt */
NK_PUBLIC nk_f64_t nk_f64_rsqrt_rvv(nk_f64_t x);
/** @copydoc nk_f32_fma */
NK_PUBLIC nk_f32_t nk_f32_fma_rvv(nk_f32_t a, nk_f32_t b, nk_f32_t c);
/** @copydoc nk_f64_fma */
NK_PUBLIC nk_f64_t nk_f64_fma_rvv(nk_f64_t a, nk_f64_t b, nk_f64_t c);

// Saturating addition, one declaration per integer width and signedness.
/** @copydoc nk_u8_saturating_add */
NK_PUBLIC nk_u8_t nk_u8_saturating_add_rvv(nk_u8_t a, nk_u8_t b);
/** @copydoc nk_u8_saturating_add */
NK_PUBLIC nk_i8_t nk_i8_saturating_add_rvv(nk_i8_t a, nk_i8_t b);
/** @copydoc nk_u8_saturating_add */
NK_PUBLIC nk_u16_t nk_u16_saturating_add_rvv(nk_u16_t a, nk_u16_t b);
/** @copydoc nk_u8_saturating_add */
NK_PUBLIC nk_i16_t nk_i16_saturating_add_rvv(nk_i16_t a, nk_i16_t b);
/** @copydoc nk_u8_saturating_add */
NK_PUBLIC nk_u32_t nk_u32_saturating_add_rvv(nk_u32_t a, nk_u32_t b);
/** @copydoc nk_u8_saturating_add */
NK_PUBLIC nk_i32_t nk_i32_saturating_add_rvv(nk_i32_t a, nk_i32_t b);
/** @copydoc nk_u8_saturating_add */
NK_PUBLIC nk_u64_t nk_u64_saturating_add_rvv(nk_u64_t a, nk_u64_t b);
/** @copydoc nk_u8_saturating_add */
NK_PUBLIC nk_i64_t nk_i64_saturating_add_rvv(nk_i64_t a, nk_i64_t b);

// Saturating multiplication, one declaration per integer width and signedness.
/** @copydoc nk_u8_saturating_mul */
NK_PUBLIC nk_u8_t nk_u8_saturating_mul_rvv(nk_u8_t a, nk_u8_t b);
/** @copydoc nk_u8_saturating_mul */
NK_PUBLIC nk_i8_t nk_i8_saturating_mul_rvv(nk_i8_t a, nk_i8_t b);
/** @copydoc nk_u8_saturating_mul */
NK_PUBLIC nk_u16_t nk_u16_saturating_mul_rvv(nk_u16_t a, nk_u16_t b);
/** @copydoc nk_u8_saturating_mul */
NK_PUBLIC nk_i16_t nk_i16_saturating_mul_rvv(nk_i16_t a, nk_i16_t b);
/** @copydoc nk_u8_saturating_mul */
NK_PUBLIC nk_u32_t nk_u32_saturating_mul_rvv(nk_u32_t a, nk_u32_t b);
/** @copydoc nk_u8_saturating_mul */
NK_PUBLIC nk_i32_t nk_i32_saturating_mul_rvv(nk_i32_t a, nk_i32_t b);
/** @copydoc nk_u8_saturating_mul */
NK_PUBLIC nk_u64_t nk_u64_saturating_mul_rvv(nk_u64_t a, nk_u64_t b);
/** @copydoc nk_u8_saturating_mul */
NK_PUBLIC nk_i64_t nk_i64_saturating_mul_rvv(nk_i64_t a, nk_i64_t b);

#endif // NK_TARGET_RVV
|
|
360
|
+
|
|
361
|
+
// Declarations of the V128 relaxed backend's `nk_f32_t` / `nk_f64_t` scalar
// helpers; visible only when `NK_TARGET_V128RELAXED` is enabled.
#if NK_TARGET_V128RELAXED
/** @copydoc nk_f32_sqrt */
NK_PUBLIC nk_f32_t nk_f32_sqrt_v128relaxed(nk_f32_t x);
/** @copydoc nk_f64_sqrt */
NK_PUBLIC nk_f64_t nk_f64_sqrt_v128relaxed(nk_f64_t x);
/** @copydoc nk_f32_rsqrt */
NK_PUBLIC nk_f32_t nk_f32_rsqrt_v128relaxed(nk_f32_t x);
/** @copydoc nk_f64_rsqrt */
NK_PUBLIC nk_f64_t nk_f64_rsqrt_v128relaxed(nk_f64_t x);
/** @copydoc nk_f32_fma */
NK_PUBLIC nk_f32_t nk_f32_fma_v128relaxed(nk_f32_t a, nk_f32_t b, nk_f32_t c);
/** @copydoc nk_f64_fma */
NK_PUBLIC nk_f64_t nk_f64_fma_v128relaxed(nk_f64_t a, nk_f64_t b, nk_f64_t c);
#endif // NK_TARGET_V128RELAXED
|
|
375
|
+
|
|
376
|
+
#if defined(__cplusplus)
|
|
377
|
+
} // extern "C"
|
|
378
|
+
#endif
|
|
379
|
+
|
|
380
|
+
#include "numkong/scalar/serial.h" // `nk_f32_rsqrt_serial`
|
|
381
|
+
#include "numkong/scalar/neon.h" // `nk_f32_sqrt_neon`
|
|
382
|
+
#include "numkong/scalar/neonhalf.h" // `nk_f16_sqrt_neonhalf`
|
|
383
|
+
#include "numkong/scalar/haswell.h" // `nk_f32_sqrt_haswell`
|
|
384
|
+
#include "numkong/scalar/sapphire.h" // `nk_f16_order_sapphire`
|
|
385
|
+
#include "numkong/scalar/rvv.h" // `nk_f32_rsqrt_rvv`
|
|
386
|
+
#include "numkong/scalar/v128relaxed.h" // `nk_f32_sqrt_v128relaxed`
|
|
387
|
+
|
|
388
|
+
#if defined(__cplusplus)
|
|
389
|
+
extern "C" {
|
|
390
|
+
#endif
|
|
391
|
+
|
|
392
|
+
#if !NK_DYNAMIC_DISPATCH
|
|
393
|
+
|
|
394
|
+
// Static dispatcher for `nk_f32_sqrt`. The preprocessor keeps exactly one
// backend call, taking the first enabled target in the order
// Haswell, NEON, RVV, V128 relaxed; the serial kernel is the fallback.
NK_PUBLIC nk_f32_t nk_f32_sqrt(nk_f32_t x) {
    nk_f32_t out;
#if NK_TARGET_HASWELL
    out = nk_f32_sqrt_haswell(x);
#elif NK_TARGET_NEON
    out = nk_f32_sqrt_neon(x);
#elif NK_TARGET_RVV
    out = nk_f32_sqrt_rvv(x);
#elif NK_TARGET_V128RELAXED
    out = nk_f32_sqrt_v128relaxed(x);
#else
    out = nk_f32_sqrt_serial(x);
#endif
    return out;
}
|
|
407
|
+
|
|
408
|
+
// Static dispatcher for `nk_f64_sqrt`. The preprocessor keeps exactly one
// backend call, taking the first enabled target in the order
// Haswell, NEON, RVV, V128 relaxed; the serial kernel is the fallback.
NK_PUBLIC nk_f64_t nk_f64_sqrt(nk_f64_t x) {
    nk_f64_t out;
#if NK_TARGET_HASWELL
    out = nk_f64_sqrt_haswell(x);
#elif NK_TARGET_NEON
    out = nk_f64_sqrt_neon(x);
#elif NK_TARGET_RVV
    out = nk_f64_sqrt_rvv(x);
#elif NK_TARGET_V128RELAXED
    out = nk_f64_sqrt_v128relaxed(x);
#else
    out = nk_f64_sqrt_serial(x);
#endif
    return out;
}
|
|
421
|
+
|
|
422
|
+
// Static dispatcher for `nk_f32_rsqrt`. The preprocessor keeps exactly one
// backend call, taking the first enabled target in the order
// Haswell, NEON, RVV, V128 relaxed; the serial kernel is the fallback.
NK_PUBLIC nk_f32_t nk_f32_rsqrt(nk_f32_t x) {
    nk_f32_t out;
#if NK_TARGET_HASWELL
    out = nk_f32_rsqrt_haswell(x);
#elif NK_TARGET_NEON
    out = nk_f32_rsqrt_neon(x);
#elif NK_TARGET_RVV
    out = nk_f32_rsqrt_rvv(x);
#elif NK_TARGET_V128RELAXED
    out = nk_f32_rsqrt_v128relaxed(x);
#else
    out = nk_f32_rsqrt_serial(x);
#endif
    return out;
}
|
|
435
|
+
|
|
436
|
+
// Static dispatcher for `nk_f64_rsqrt`. The preprocessor keeps exactly one
// backend call, taking the first enabled target in the order
// Haswell, NEON, RVV, V128 relaxed; the serial kernel is the fallback.
NK_PUBLIC nk_f64_t nk_f64_rsqrt(nk_f64_t x) {
    nk_f64_t out;
#if NK_TARGET_HASWELL
    out = nk_f64_rsqrt_haswell(x);
#elif NK_TARGET_NEON
    out = nk_f64_rsqrt_neon(x);
#elif NK_TARGET_RVV
    out = nk_f64_rsqrt_rvv(x);
#elif NK_TARGET_V128RELAXED
    out = nk_f64_rsqrt_v128relaxed(x);
#else
    out = nk_f64_rsqrt_serial(x);
#endif
    return out;
}
|
|
449
|
+
|
|
450
|
+
// Static dispatcher for `nk_f32_fma`. All three operands are forwarded
// unchanged to the first enabled backend in the order
// Haswell, NEON, RVV, V128 relaxed; the serial kernel is the fallback.
NK_PUBLIC nk_f32_t nk_f32_fma(nk_f32_t a, nk_f32_t b, nk_f32_t c) {
    nk_f32_t out;
#if NK_TARGET_HASWELL
    out = nk_f32_fma_haswell(a, b, c);
#elif NK_TARGET_NEON
    out = nk_f32_fma_neon(a, b, c);
#elif NK_TARGET_RVV
    out = nk_f32_fma_rvv(a, b, c);
#elif NK_TARGET_V128RELAXED
    out = nk_f32_fma_v128relaxed(a, b, c);
#else
    out = nk_f32_fma_serial(a, b, c);
#endif
    return out;
}
|
|
463
|
+
|
|
464
|
+
// Static dispatcher for `nk_f64_fma`. All three operands are forwarded
// unchanged to the first enabled backend in the order
// Haswell, NEON, RVV, V128 relaxed; the serial kernel is the fallback.
NK_PUBLIC nk_f64_t nk_f64_fma(nk_f64_t a, nk_f64_t b, nk_f64_t c) {
    nk_f64_t out;
#if NK_TARGET_HASWELL
    out = nk_f64_fma_haswell(a, b, c);
#elif NK_TARGET_NEON
    out = nk_f64_fma_neon(a, b, c);
#elif NK_TARGET_RVV
    out = nk_f64_fma_rvv(a, b, c);
#elif NK_TARGET_V128RELAXED
    out = nk_f64_fma_v128relaxed(a, b, c);
#else
    out = nk_f64_fma_serial(a, b, c);
#endif
    return out;
}
|
|
477
|
+
|
|
478
|
+
// Static dispatcher for `nk_f16_sqrt`. Target priority is Sapphire, then
// NEON half, then Haswell; without any of them the serial kernel is used.
NK_PUBLIC nk_f16_t nk_f16_sqrt(nk_f16_t x) {
    nk_f16_t out;
#if NK_TARGET_SAPPHIRE
    out = nk_f16_sqrt_sapphire(x);
#elif NK_TARGET_NEONHALF
    out = nk_f16_sqrt_neonhalf(x);
#elif NK_TARGET_HASWELL
    out = nk_f16_sqrt_haswell(x);
#else
    out = nk_f16_sqrt_serial(x);
#endif
    return out;
}
|
|
489
|
+
|
|
490
|
+
// Static dispatcher for `nk_f16_rsqrt`. Target priority is Sapphire, then
// NEON half, then Haswell; without any of them the serial kernel is used.
NK_PUBLIC nk_f16_t nk_f16_rsqrt(nk_f16_t x) {
    nk_f16_t out;
#if NK_TARGET_SAPPHIRE
    out = nk_f16_rsqrt_sapphire(x);
#elif NK_TARGET_NEONHALF
    out = nk_f16_rsqrt_neonhalf(x);
#elif NK_TARGET_HASWELL
    out = nk_f16_rsqrt_haswell(x);
#else
    out = nk_f16_rsqrt_serial(x);
#endif
    return out;
}
|
|
501
|
+
|
|
502
|
+
// Static dispatcher for `nk_f16_fma`. Target priority is Sapphire, then
// NEON half, then Haswell; without any of them the serial kernel is used.
NK_PUBLIC nk_f16_t nk_f16_fma(nk_f16_t a, nk_f16_t b, nk_f16_t c) {
    nk_f16_t out;
#if NK_TARGET_SAPPHIRE
    out = nk_f16_fma_sapphire(a, b, c);
#elif NK_TARGET_NEONHALF
    out = nk_f16_fma_neonhalf(a, b, c);
#elif NK_TARGET_HASWELL
    out = nk_f16_fma_haswell(a, b, c);
#else
    out = nk_f16_fma_serial(a, b, c);
#endif
    return out;
}
|
|
513
|
+
|
|
514
|
+
// Static dispatcher for `nk_u8_saturating_add`. Target priority is Haswell,
// then NEON, then RVV; otherwise the serial kernel handles the call.
NK_PUBLIC nk_u8_t nk_u8_saturating_add(nk_u8_t a, nk_u8_t b) {
    nk_u8_t out;
#if NK_TARGET_HASWELL
    out = nk_u8_saturating_add_haswell(a, b);
#elif NK_TARGET_NEON
    out = nk_u8_saturating_add_neon(a, b);
#elif NK_TARGET_RVV
    out = nk_u8_saturating_add_rvv(a, b);
#else
    out = nk_u8_saturating_add_serial(a, b);
#endif
    return out;
}
|
|
525
|
+
// Static dispatcher for `nk_i8_saturating_add`. Target priority is Haswell,
// then NEON, then RVV; otherwise the serial kernel handles the call.
NK_PUBLIC nk_i8_t nk_i8_saturating_add(nk_i8_t a, nk_i8_t b) {
    nk_i8_t out;
#if NK_TARGET_HASWELL
    out = nk_i8_saturating_add_haswell(a, b);
#elif NK_TARGET_NEON
    out = nk_i8_saturating_add_neon(a, b);
#elif NK_TARGET_RVV
    out = nk_i8_saturating_add_rvv(a, b);
#else
    out = nk_i8_saturating_add_serial(a, b);
#endif
    return out;
}
|
|
536
|
+
// Static dispatcher for `nk_u16_saturating_add`. Target priority is Haswell,
// then NEON, then RVV; otherwise the serial kernel handles the call.
NK_PUBLIC nk_u16_t nk_u16_saturating_add(nk_u16_t a, nk_u16_t b) {
    nk_u16_t out;
#if NK_TARGET_HASWELL
    out = nk_u16_saturating_add_haswell(a, b);
#elif NK_TARGET_NEON
    out = nk_u16_saturating_add_neon(a, b);
#elif NK_TARGET_RVV
    out = nk_u16_saturating_add_rvv(a, b);
#else
    out = nk_u16_saturating_add_serial(a, b);
#endif
    return out;
}
|
|
547
|
+
// Static dispatcher for `nk_i16_saturating_add`. Target priority is Haswell,
// then NEON, then RVV; otherwise the serial kernel handles the call.
NK_PUBLIC nk_i16_t nk_i16_saturating_add(nk_i16_t a, nk_i16_t b) {
    nk_i16_t out;
#if NK_TARGET_HASWELL
    out = nk_i16_saturating_add_haswell(a, b);
#elif NK_TARGET_NEON
    out = nk_i16_saturating_add_neon(a, b);
#elif NK_TARGET_RVV
    out = nk_i16_saturating_add_rvv(a, b);
#else
    out = nk_i16_saturating_add_serial(a, b);
#endif
    return out;
}
|
|
558
|
+
// Static dispatcher for `nk_u32_saturating_add`. NEON is preferred, then RVV;
// every other build uses the serial kernel (there is no Haswell branch here).
NK_PUBLIC nk_u32_t nk_u32_saturating_add(nk_u32_t a, nk_u32_t b) {
    nk_u32_t out;
#if NK_TARGET_NEON
    out = nk_u32_saturating_add_neon(a, b);
#elif NK_TARGET_RVV
    out = nk_u32_saturating_add_rvv(a, b);
#else
    out = nk_u32_saturating_add_serial(a, b);
#endif
    return out;
}
|
|
567
|
+
// Static dispatcher for `nk_i32_saturating_add`. NEON is preferred, then RVV;
// every other build uses the serial kernel (there is no Haswell branch here).
NK_PUBLIC nk_i32_t nk_i32_saturating_add(nk_i32_t a, nk_i32_t b) {
    nk_i32_t out;
#if NK_TARGET_NEON
    out = nk_i32_saturating_add_neon(a, b);
#elif NK_TARGET_RVV
    out = nk_i32_saturating_add_rvv(a, b);
#else
    out = nk_i32_saturating_add_serial(a, b);
#endif
    return out;
}
|
|
576
|
+
// Static dispatcher for `nk_u64_saturating_add`. NEON is preferred, then RVV;
// every other build uses the serial kernel (there is no Haswell branch here).
NK_PUBLIC nk_u64_t nk_u64_saturating_add(nk_u64_t a, nk_u64_t b) {
    nk_u64_t out;
#if NK_TARGET_NEON
    out = nk_u64_saturating_add_neon(a, b);
#elif NK_TARGET_RVV
    out = nk_u64_saturating_add_rvv(a, b);
#else
    out = nk_u64_saturating_add_serial(a, b);
#endif
    return out;
}
|
|
585
|
+
// Static dispatcher for `nk_i64_saturating_add`. NEON is preferred, then RVV;
// every other build uses the serial kernel (there is no Haswell branch here).
NK_PUBLIC nk_i64_t nk_i64_saturating_add(nk_i64_t a, nk_i64_t b) {
    nk_i64_t out;
#if NK_TARGET_NEON
    out = nk_i64_saturating_add_neon(a, b);
#elif NK_TARGET_RVV
    out = nk_i64_saturating_add_rvv(a, b);
#else
    out = nk_i64_saturating_add_serial(a, b);
#endif
    return out;
}
|
|
594
|
+
// `nk_i4x2_saturating_add` has no target-specific branch in this header:
// it always forwards both operands to the serial kernel.
NK_PUBLIC nk_i4x2_t nk_i4x2_saturating_add(nk_i4x2_t a, nk_i4x2_t b) {
    return nk_i4x2_saturating_add_serial(a, b);
}
|
|
595
|
+
// `nk_u4x2_saturating_add` has no target-specific branch in this header:
// it always forwards both operands to the serial kernel.
NK_PUBLIC nk_u4x2_t nk_u4x2_saturating_add(nk_u4x2_t a, nk_u4x2_t b) {
    return nk_u4x2_saturating_add_serial(a, b);
}
|
|
596
|
+
|
|
597
|
+
// Static dispatcher for `nk_u8_saturating_mul`: the RVV kernel when
// `NK_TARGET_RVV` is enabled, the serial kernel otherwise.
NK_PUBLIC nk_u8_t nk_u8_saturating_mul(nk_u8_t a, nk_u8_t b) {
    nk_u8_t out;
#if NK_TARGET_RVV
    out = nk_u8_saturating_mul_rvv(a, b);
#else
    out = nk_u8_saturating_mul_serial(a, b);
#endif
    return out;
}
|
|
604
|
+
// Static dispatcher for `nk_i8_saturating_mul`: the RVV kernel when
// `NK_TARGET_RVV` is enabled, the serial kernel otherwise.
NK_PUBLIC nk_i8_t nk_i8_saturating_mul(nk_i8_t a, nk_i8_t b) {
    nk_i8_t out;
#if NK_TARGET_RVV
    out = nk_i8_saturating_mul_rvv(a, b);
#else
    out = nk_i8_saturating_mul_serial(a, b);
#endif
    return out;
}
|
|
611
|
+
// Static dispatcher for `nk_u16_saturating_mul`: the RVV kernel when
// `NK_TARGET_RVV` is enabled, the serial kernel otherwise.
NK_PUBLIC nk_u16_t nk_u16_saturating_mul(nk_u16_t a, nk_u16_t b) {
    nk_u16_t out;
#if NK_TARGET_RVV
    out = nk_u16_saturating_mul_rvv(a, b);
#else
    out = nk_u16_saturating_mul_serial(a, b);
#endif
    return out;
}
|
|
618
|
+
// Static dispatcher for `nk_i16_saturating_mul`: the RVV kernel when
// `NK_TARGET_RVV` is enabled, the serial kernel otherwise.
NK_PUBLIC nk_i16_t nk_i16_saturating_mul(nk_i16_t a, nk_i16_t b) {
    nk_i16_t out;
#if NK_TARGET_RVV
    out = nk_i16_saturating_mul_rvv(a, b);
#else
    out = nk_i16_saturating_mul_serial(a, b);
#endif
    return out;
}
|
|
625
|
+
// Static dispatcher for `nk_u32_saturating_mul`: the RVV kernel when
// `NK_TARGET_RVV` is enabled, the serial kernel otherwise.
NK_PUBLIC nk_u32_t nk_u32_saturating_mul(nk_u32_t a, nk_u32_t b) {
    nk_u32_t out;
#if NK_TARGET_RVV
    out = nk_u32_saturating_mul_rvv(a, b);
#else
    out = nk_u32_saturating_mul_serial(a, b);
#endif
    return out;
}
|
|
632
|
+
// Static dispatcher for `nk_i32_saturating_mul`: the RVV kernel when
// `NK_TARGET_RVV` is enabled, the serial kernel otherwise.
NK_PUBLIC nk_i32_t nk_i32_saturating_mul(nk_i32_t a, nk_i32_t b) {
    nk_i32_t out;
#if NK_TARGET_RVV
    out = nk_i32_saturating_mul_rvv(a, b);
#else
    out = nk_i32_saturating_mul_serial(a, b);
#endif
    return out;
}
|
|
639
|
+
// Static dispatcher for `nk_u64_saturating_mul`. Target priority is Haswell,
// then NEON, then RVV; otherwise the serial kernel handles the call.
NK_PUBLIC nk_u64_t nk_u64_saturating_mul(nk_u64_t a, nk_u64_t b) {
    nk_u64_t out;
#if NK_TARGET_HASWELL
    out = nk_u64_saturating_mul_haswell(a, b);
#elif NK_TARGET_NEON
    out = nk_u64_saturating_mul_neon(a, b);
#elif NK_TARGET_RVV
    out = nk_u64_saturating_mul_rvv(a, b);
#else
    out = nk_u64_saturating_mul_serial(a, b);
#endif
    return out;
}
|
|
650
|
+
// Static dispatcher for `nk_i64_saturating_mul`. Target priority is Haswell,
// then NEON, then RVV; otherwise the serial kernel handles the call.
NK_PUBLIC nk_i64_t nk_i64_saturating_mul(nk_i64_t a, nk_i64_t b) {
    nk_i64_t out;
#if NK_TARGET_HASWELL
    out = nk_i64_saturating_mul_haswell(a, b);
#elif NK_TARGET_NEON
    out = nk_i64_saturating_mul_neon(a, b);
#elif NK_TARGET_RVV
    out = nk_i64_saturating_mul_rvv(a, b);
#else
    out = nk_i64_saturating_mul_serial(a, b);
#endif
    return out;
}
|
|
661
|
+
// `nk_i4x2_saturating_mul` has no target-specific branch in this header:
// it always forwards both operands to the serial kernel.
NK_PUBLIC nk_i4x2_t nk_i4x2_saturating_mul(nk_i4x2_t a, nk_i4x2_t b) {
    return nk_i4x2_saturating_mul_serial(a, b);
}
|
|
662
|
+
// `nk_u4x2_saturating_mul` has no target-specific branch in this header:
// it always forwards both operands to the serial kernel.
NK_PUBLIC nk_u4x2_t nk_u4x2_saturating_mul(nk_u4x2_t a, nk_u4x2_t b) {
    return nk_u4x2_saturating_mul_serial(a, b);
}
|
|
663
|
+
|
|
664
|
+
// Static dispatcher for `nk_f16_order`: the Sapphire kernel when
// `NK_TARGET_SAPPHIRE` is enabled, the serial kernel otherwise.
// The backend's `int` result is returned unchanged.
NK_PUBLIC int nk_f16_order(nk_f16_t a, nk_f16_t b) {
    int ordering;
#if NK_TARGET_SAPPHIRE
    ordering = nk_f16_order_sapphire(a, b);
#else
    ordering = nk_f16_order_serial(a, b);
#endif
    return ordering;
}
|
|
671
|
+
// `nk_bf16_order` has no target-specific branch in this header: it always
// returns the serial kernel's result.
NK_PUBLIC int nk_bf16_order(nk_bf16_t a, nk_bf16_t b) {
    return nk_bf16_order_serial(a, b);
}
|
|
672
|
+
// `nk_e4m3_order` has no target-specific branch in this header: it always
// returns the serial kernel's result.
NK_PUBLIC int nk_e4m3_order(nk_e4m3_t a, nk_e4m3_t b) {
    return nk_e4m3_order_serial(a, b);
}
|
|
673
|
+
// `nk_e5m2_order` has no target-specific branch in this header: it always
// returns the serial kernel's result.
NK_PUBLIC int nk_e5m2_order(nk_e5m2_t a, nk_e5m2_t b) {
    return nk_e5m2_order_serial(a, b);
}
|
|
674
|
+
// `nk_e2m3_order` has no target-specific branch in this header: it always
// returns the serial kernel's result.
NK_PUBLIC int nk_e2m3_order(nk_e2m3_t a, nk_e2m3_t b) {
    return nk_e2m3_order_serial(a, b);
}
|
|
675
|
+
// `nk_e3m2_order` has no target-specific branch in this header: it always
// returns the serial kernel's result.
NK_PUBLIC int nk_e3m2_order(nk_e3m2_t a, nk_e3m2_t b) {
    return nk_e3m2_order_serial(a, b);
}
|
|
676
|
+
|
|
677
|
+
#endif // !NK_DYNAMIC_DISPATCH
|
|
678
|
+
|
|
679
|
+
#if defined(__cplusplus)
|
|
680
|
+
} // extern "C"
|
|
681
|
+
#endif
|
|
682
|
+
|
|
683
|
+
#endif // NK_SCALAR_H
|