numkong 7.0.0 → 7.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +197 -124
- package/binding.gyp +34 -484
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
|
@@ -36,7 +36,7 @@ NK_INTERNAL void nk_store_b256_v128relaxed_(nk_b256_vec_t const *src, void *dst)
|
|
|
36
36
|
|
|
37
37
|
/** @brief BF16 is the upper 16 bits of F32, so zero-extend to u32 and shift left by 16. */
|
|
38
38
|
NK_INTERNAL nk_b128_vec_t nk_bf16x4_to_f32x4_v128relaxed_(nk_b64_vec_t bf16_vec) {
|
|
39
|
-
v128_t bf16_u16x4_in_u64 =
|
|
39
|
+
v128_t bf16_u16x4_in_u64 = wasm_i64x2_splat(bf16_vec.u64);
|
|
40
40
|
v128_t bf16_u32x4_low = wasm_u32x4_extend_low_u16x8(bf16_u16x4_in_u64);
|
|
41
41
|
nk_b128_vec_t result;
|
|
42
42
|
result.v128 = wasm_i32x4_shl(bf16_u32x4_low, 16);
|
|
@@ -44,58 +44,38 @@ NK_INTERNAL nk_b128_vec_t nk_bf16x4_to_f32x4_v128relaxed_(nk_b64_vec_t bf16_vec)
|
|
|
44
44
|
}
|
|
45
45
|
|
|
46
46
|
/**
|
|
47
|
-
* @brief F16→F32
|
|
48
|
-
*
|
|
49
|
-
*
|
|
47
|
+
* @brief F16→F32 via Giesen's magic-number multiply trick.
|
|
48
|
+
* @see https://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
|
|
49
|
+
*
|
|
50
|
+
* Shifts the 15-bit magnitude into F32 exponent+mantissa position, then multiplies
|
|
51
|
+
* by 2^112 (magic = 0x77800000) to rebias the exponent. This single multiply also
|
|
52
|
+
* correctly normalizes F16 subnormals into F32 normals — no branching or FPU
|
|
53
|
+
* integer-to-float conversion needed. Inf/NaN (exp=31) overflows the multiply and
|
|
54
|
+
* is fixed with a comparison + blend.
|
|
50
55
|
*/
|
|
51
56
|
NK_INTERNAL nk_b128_vec_t nk_f16x4_to_f32x4_v128relaxed_(nk_b64_vec_t f16_vec) {
|
|
52
|
-
v128_t
|
|
53
|
-
v128_t
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
v128_t
|
|
57
|
-
v128_t
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
v128_t
|
|
63
|
-
v128_t
|
|
64
|
-
v128_t normal_mant_u32x4 = wasm_i32x4_shl(mant_u32x4, 13);
|
|
65
|
-
v128_t normal_bits_u32x4 = wasm_v128_or(sign_f32_u32x4, wasm_v128_or(normal_exp_u32x4, normal_mant_u32x4));
|
|
66
|
-
|
|
67
|
-
// Early exit: skip zero/denormal/inf/NaN handling when all lanes are normal
|
|
68
|
-
v128_t exp_zero_mask = wasm_i32x4_eq(exp_u32x4, wasm_i32x4_splat(0));
|
|
69
|
-
v128_t exp_max_mask = wasm_i32x4_eq(exp_u32x4, wasm_i32x4_splat(31));
|
|
70
|
-
v128_t exceptional_mask = wasm_v128_or(exp_zero_mask, exp_max_mask);
|
|
71
|
-
if (!wasm_v128_any_true(exceptional_mask)) {
|
|
72
|
-
nk_b128_vec_t result;
|
|
73
|
-
result.v128 = normal_bits_u32x4;
|
|
74
|
-
return result;
|
|
75
|
-
}
|
|
57
|
+
v128_t raw_u16x4_in_u64 = wasm_i64x2_splat(f16_vec.u64);
|
|
58
|
+
v128_t raw_u32x4 = wasm_u32x4_extend_low_u16x8(raw_u16x4_in_u64);
|
|
59
|
+
|
|
60
|
+
// Extract sign and unsigned magnitude
|
|
61
|
+
v128_t sign_u32x4 = wasm_v128_and(raw_u32x4, wasm_i32x4_splat(0x8000));
|
|
62
|
+
v128_t sign_f32_u32x4 = wasm_i32x4_shl(sign_u32x4, 16);
|
|
63
|
+
v128_t magnitude_u32x4 = wasm_v128_and(raw_u32x4, wasm_i32x4_splat(0x7FFF));
|
|
64
|
+
|
|
65
|
+
// Shift mantissa+exponent into F32 position and multiply by magic 2^112
|
|
66
|
+
v128_t shifted_u32x4 = wasm_i32x4_shl(magnitude_u32x4, 13);
|
|
67
|
+
v128_t magic_f32x4 = wasm_i32x4_splat(0x77800000);
|
|
68
|
+
v128_t rebiased_f32x4 = wasm_f32x4_mul((v128_t)shifted_u32x4, (v128_t)magic_f32x4);
|
|
76
69
|
|
|
77
|
-
//
|
|
78
|
-
|
|
79
|
-
v128_t
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
//
|
|
85
|
-
|
|
86
|
-
v128_t mant_f32x4 = wasm_f32x4_convert_u32x4(mant_u32x4);
|
|
87
|
-
v128_t denorm_normalized_f32x4 = wasm_f32x4_mul(mant_f32x4, wasm_f32x4_splat(0x1p-24f));
|
|
88
|
-
v128_t denorm_bits_u32x4 = wasm_v128_or(denorm_normalized_f32x4, sign_f32_u32x4);
|
|
89
|
-
|
|
90
|
-
v128_t mant_zero_mask = wasm_i32x4_eq(mant_u32x4, wasm_i32x4_splat(0));
|
|
91
|
-
v128_t is_zero_mask = wasm_v128_and(exp_zero_mask, mant_zero_mask);
|
|
92
|
-
v128_t is_denormal_mask = wasm_v128_andnot(exp_zero_mask, mant_zero_mask);
|
|
93
|
-
|
|
94
|
-
// Blend via relaxed_laneselect (1 instruction: vblendvps on x86, vs 3 for and/andn/or)
|
|
95
|
-
v128_t result_u32x4 = normal_bits_u32x4;
|
|
96
|
-
result_u32x4 = wasm_i32x4_relaxed_laneselect(zero_bits_u32x4, result_u32x4, is_zero_mask);
|
|
97
|
-
result_u32x4 = wasm_i32x4_relaxed_laneselect(denorm_bits_u32x4, result_u32x4, is_denormal_mask);
|
|
98
|
-
result_u32x4 = wasm_i32x4_relaxed_laneselect(inf_nan_bits_u32x4, result_u32x4, exp_max_mask);
|
|
70
|
+
// Fix inf/NaN: exp=31 after shift becomes 0x1F<<13 = 0x000F8000, ×2^112 overflows.
|
|
71
|
+
// Detect via threshold on shifted magnitude and apply direct rebias instead.
|
|
72
|
+
v128_t infnan_threshold_u32x4 = wasm_i32x4_splat(0x38800000);
|
|
73
|
+
v128_t infnan_mask_u32x4 = wasm_u32x4_ge(shifted_u32x4, infnan_threshold_u32x4);
|
|
74
|
+
v128_t direct_u32x4 = wasm_v128_or(shifted_u32x4, wasm_i32x4_splat(0x70000000));
|
|
75
|
+
v128_t result_u32x4 = wasm_i32x4_relaxed_laneselect(direct_u32x4, rebiased_f32x4, infnan_mask_u32x4);
|
|
76
|
+
|
|
77
|
+
// Apply sign
|
|
78
|
+
result_u32x4 = wasm_v128_or(result_u32x4, sign_f32_u32x4);
|
|
99
79
|
|
|
100
80
|
nk_b128_vec_t result;
|
|
101
81
|
result.v128 = result_u32x4;
|
|
@@ -103,69 +83,446 @@ NK_INTERNAL nk_b128_vec_t nk_f16x4_to_f32x4_v128relaxed_(nk_b64_vec_t f16_vec) {
|
|
|
103
83
|
}
|
|
104
84
|
|
|
105
85
|
/**
|
|
106
|
-
* @brief E4M3→F32
|
|
107
|
-
*
|
|
86
|
+
* @brief E4M3→F32 via Giesen's magic multiply (×2^120).
|
|
87
|
+
* Shift 7-bit magnitude left by 20 into f32 position, multiply by 2^120 to rebias exponent.
|
|
88
|
+
* The multiply also normalizes subnormals. NaN fixup for magnitude 0x7F only.
|
|
108
89
|
*/
|
|
109
90
|
NK_INTERNAL nk_b128_vec_t nk_e4m3x4_to_f32x4_v128relaxed_(nk_b32_vec_t e4m3_vec) {
|
|
110
|
-
v128_t
|
|
111
|
-
v128_t
|
|
112
|
-
v128_t
|
|
113
|
-
v128_t
|
|
114
|
-
v128_t
|
|
115
|
-
v128_t
|
|
116
|
-
v128_t
|
|
117
|
-
v128_t
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
wasm_i32x4_eq(mant_u32x4, wasm_i32x4_splat(7)));
|
|
122
|
-
v128_t exceptional_mask = wasm_v128_or(exp_zero_mask, is_nan_mask);
|
|
123
|
-
if (!wasm_v128_any_true(exceptional_mask)) {
|
|
124
|
-
nk_b128_vec_t result;
|
|
125
|
-
result.v128 = normal_bits_u32x4;
|
|
126
|
-
return result;
|
|
127
|
-
}
|
|
128
|
-
v128_t result_u32x4 = wasm_i32x4_relaxed_laneselect(subnorm_f32x4, normal_bits_u32x4, exp_zero_mask);
|
|
129
|
-
if (wasm_v128_any_true(is_nan_mask)) {
|
|
130
|
-
v128_t nan_bits = wasm_v128_or(sign_u32x4, wasm_i32x4_splat(0x7FC00000));
|
|
131
|
-
result_u32x4 = wasm_i32x4_relaxed_laneselect(nan_bits, result_u32x4, is_nan_mask);
|
|
132
|
-
}
|
|
133
|
-
nk_b128_vec_t result;
|
|
134
|
-
result.v128 = result_u32x4;
|
|
135
|
-
return result;
|
|
91
|
+
v128_t raw_u32x4 = wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(wasm_i32x4_splat(e4m3_vec.u32)));
|
|
92
|
+
v128_t sign_u32x4 = wasm_i32x4_shl(wasm_v128_and(raw_u32x4, wasm_i32x4_splat(0x80)), 24);
|
|
93
|
+
v128_t nonsign_u32x4 = wasm_v128_and(raw_u32x4, wasm_i32x4_splat(0x7F));
|
|
94
|
+
v128_t shifted_u32x4 = wasm_i32x4_shl(nonsign_u32x4, 20);
|
|
95
|
+
v128_t rebiased_f32x4 = wasm_f32x4_mul((v128_t)shifted_u32x4, (v128_t)wasm_i32x4_splat(0x7B800000)); // 2^120
|
|
96
|
+
v128_t is_nan_u32x4 = wasm_i32x4_eq(nonsign_u32x4, wasm_i32x4_splat(0x7F));
|
|
97
|
+
v128_t nan_u32x4 = wasm_v128_or(sign_u32x4, wasm_i32x4_splat(0x7FC00000));
|
|
98
|
+
v128_t result_u32x4 = wasm_i32x4_relaxed_laneselect(nan_u32x4, rebiased_f32x4, is_nan_u32x4);
|
|
99
|
+
nk_b128_vec_t result_vec;
|
|
100
|
+
result_vec.v128 = wasm_v128_or(result_u32x4, sign_u32x4);
|
|
101
|
+
return result_vec;
|
|
136
102
|
}
|
|
137
103
|
|
|
138
104
|
/**
|
|
139
|
-
* @brief E5M2→F32
|
|
140
|
-
*
|
|
105
|
+
* @brief E5M2→F32 via Giesen's magic multiply (×2^112).
|
|
106
|
+
* Same exponent encoding as F16 (5-bit, bias=15). Shift 7-bit magnitude left by 21,
|
|
107
|
+
* multiply by 2^112 to rebias. Inf/NaN fixup for exp=31 (nonsign > 123).
|
|
141
108
|
*/
|
|
142
109
|
NK_INTERNAL nk_b128_vec_t nk_e5m2x4_to_f32x4_v128relaxed_(nk_b32_vec_t e5m2_vec) {
|
|
143
|
-
v128_t
|
|
144
|
-
v128_t
|
|
145
|
-
v128_t
|
|
146
|
-
v128_t
|
|
147
|
-
v128_t
|
|
148
|
-
v128_t
|
|
149
|
-
v128_t
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
110
|
+
v128_t raw_u32x4 = wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(wasm_i32x4_splat(e5m2_vec.u32)));
|
|
111
|
+
v128_t sign_u32x4 = wasm_i32x4_shl(wasm_v128_and(raw_u32x4, wasm_i32x4_splat(0x80)), 24);
|
|
112
|
+
v128_t nonsign_u32x4 = wasm_v128_and(raw_u32x4, wasm_i32x4_splat(0x7F));
|
|
113
|
+
v128_t shifted_u32x4 = wasm_i32x4_shl(nonsign_u32x4, 21);
|
|
114
|
+
v128_t rebiased_f32x4 = wasm_f32x4_mul((v128_t)shifted_u32x4, (v128_t)wasm_i32x4_splat(0x77800000)); // 2^112
|
|
115
|
+
v128_t is_infnan_u32x4 = wasm_u32x4_gt(nonsign_u32x4, wasm_i32x4_splat(123));
|
|
116
|
+
v128_t result_u32x4 = wasm_v128_or(rebiased_f32x4, wasm_v128_and(is_infnan_u32x4, wasm_i32x4_splat(0x7F800000)));
|
|
117
|
+
nk_b128_vec_t result_vec;
|
|
118
|
+
result_vec.v128 = wasm_v128_or(result_u32x4, sign_u32x4);
|
|
119
|
+
return result_vec;
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
/**
|
|
123
|
+
* @brief E2M3→F32 via Giesen's magic multiply (×2^126).
|
|
124
|
+
* S EE MMM (bias=1). Shift 5-bit magnitude left by 20, multiply by 2^126 to rebias.
|
|
125
|
+
* No inf/NaN in E2M3FN format, so no fixup needed.
|
|
126
|
+
*/
|
|
127
|
+
NK_INTERNAL nk_b128_vec_t nk_e2m3x4_to_f32x4_v128relaxed_(nk_b32_vec_t e2m3_vec) {
|
|
128
|
+
v128_t raw_u32x4 = wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(wasm_i32x4_splat(e2m3_vec.u32)));
|
|
129
|
+
v128_t sign_u32x4 = wasm_i32x4_shl(wasm_v128_and(raw_u32x4, wasm_i32x4_splat(0x20)), 26);
|
|
130
|
+
v128_t nonsign_u32x4 = wasm_v128_and(raw_u32x4, wasm_i32x4_splat(0x1F));
|
|
131
|
+
v128_t shifted_u32x4 = wasm_i32x4_shl(nonsign_u32x4, 20);
|
|
132
|
+
v128_t rebiased_f32x4 = wasm_f32x4_mul((v128_t)shifted_u32x4, (v128_t)wasm_i32x4_splat(0x7E800000)); // 2^126
|
|
133
|
+
nk_b128_vec_t result_vec;
|
|
134
|
+
result_vec.v128 = wasm_v128_or(rebiased_f32x4, sign_u32x4);
|
|
135
|
+
return result_vec;
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
/**
|
|
139
|
+
* @brief E3M2→F32 via Giesen's magic multiply (×2^124).
|
|
140
|
+
* S EEE MM (bias=3). Shift 5-bit magnitude left by 21, multiply by 2^124 to rebias.
|
|
141
|
+
* No inf/NaN in E3M2FN format, so no fixup needed.
|
|
142
|
+
*/
|
|
143
|
+
NK_INTERNAL nk_b128_vec_t nk_e3m2x4_to_f32x4_v128relaxed_(nk_b32_vec_t e3m2_vec) {
|
|
144
|
+
v128_t raw_u32x4 = wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(wasm_i32x4_splat(e3m2_vec.u32)));
|
|
145
|
+
v128_t sign_u32x4 = wasm_i32x4_shl(wasm_v128_and(raw_u32x4, wasm_i32x4_splat(0x20)), 26);
|
|
146
|
+
v128_t nonsign_u32x4 = wasm_v128_and(raw_u32x4, wasm_i32x4_splat(0x1F));
|
|
147
|
+
v128_t shifted_u32x4 = wasm_i32x4_shl(nonsign_u32x4, 21);
|
|
148
|
+
v128_t rebiased_f32x4 = wasm_f32x4_mul((v128_t)shifted_u32x4, (v128_t)wasm_i32x4_splat(0x7D800000)); // 2^124
|
|
149
|
+
nk_b128_vec_t result_vec;
|
|
150
|
+
result_vec.v128 = wasm_v128_or(rebiased_f32x4, sign_u32x4);
|
|
151
|
+
return result_vec;
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
/** @brief Convert 4x i8 → f32x4 (WASM). Widen i8→i16→i32, convert to f32. */
|
|
155
|
+
NK_INTERNAL nk_b128_vec_t nk_i8x4_to_f32x4_v128relaxed_(nk_b32_vec_t in_vec) {
|
|
156
|
+
v128_t in_i8x16 = wasm_i32x4_splat(in_vec.u32);
|
|
157
|
+
v128_t in_i16x8 = wasm_i16x8_extend_low_i8x16(in_i8x16);
|
|
158
|
+
v128_t in_i32x4 = wasm_i32x4_extend_low_i16x8(in_i16x8);
|
|
159
|
+
nk_b128_vec_t result_vec;
|
|
160
|
+
result_vec.v128 = wasm_f32x4_convert_i32x4(in_i32x4);
|
|
161
|
+
return result_vec;
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
/** @brief Convert 4x u8 → f32x4 (WASM). Widen u8→u16→u32, convert to f32. */
|
|
165
|
+
NK_INTERNAL nk_b128_vec_t nk_u8x4_to_f32x4_v128relaxed_(nk_b32_vec_t in_vec) {
|
|
166
|
+
v128_t in_u8x16 = wasm_i32x4_splat(in_vec.u32);
|
|
167
|
+
v128_t in_u16x8 = wasm_u16x8_extend_low_u8x16(in_u8x16);
|
|
168
|
+
v128_t in_u32x4 = wasm_u32x4_extend_low_u16x8(in_u16x8);
|
|
169
|
+
nk_b128_vec_t result_vec;
|
|
170
|
+
result_vec.v128 = wasm_f32x4_convert_u32x4(in_u32x4);
|
|
171
|
+
return result_vec;
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
/** @brief Convert f32x4 → 4x bf16 via RNE rounding (WASM). */
|
|
175
|
+
NK_INTERNAL nk_b64_vec_t nk_f32x4_to_bf16x4_v128relaxed_(nk_b128_vec_t hub_vec) {
|
|
176
|
+
v128_t bits_u32x4 = hub_vec.v128;
|
|
177
|
+
v128_t lsb_u32x4 = wasm_v128_and(wasm_u32x4_shr(bits_u32x4, 16), wasm_i32x4_splat(1));
|
|
178
|
+
v128_t rounded_u32x4 = wasm_i32x4_add(bits_u32x4, wasm_i32x4_add(wasm_i32x4_splat(0x7FFF), lsb_u32x4));
|
|
179
|
+
v128_t bf16_u32x4 = wasm_u32x4_shr(rounded_u32x4, 16);
|
|
180
|
+
v128_t packed_u16x8 = wasm_u16x8_narrow_i32x4(bf16_u32x4, bf16_u32x4);
|
|
181
|
+
nk_b64_vec_t result_vec;
|
|
182
|
+
result_vec.u64 = (nk_u64_t)wasm_i64x2_extract_lane(packed_u16x8, 0);
|
|
183
|
+
return result_vec;
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
/**
|
|
187
|
+
* @brief F32→F16 via bit manipulation with RNE (WASM).
|
|
188
|
+
* Handles normal, subnormal, overflow (→inf), and inf/NaN cases.
|
|
189
|
+
*/
|
|
190
|
+
NK_INTERNAL nk_b64_vec_t nk_f32x4_to_f16x4_v128relaxed_(nk_b128_vec_t hub_vec) {
|
|
191
|
+
v128_t bits_u32x4 = hub_vec.v128;
|
|
192
|
+
v128_t sign_u32x4 = wasm_i32x4_shl(wasm_u32x4_shr(bits_u32x4, 31), 15);
|
|
193
|
+
v128_t f32_exp_u32x4 = wasm_v128_and(wasm_u32x4_shr(bits_u32x4, 23), wasm_i32x4_splat(0xFF));
|
|
194
|
+
v128_t f32_mant_u32x4 = wasm_v128_and(bits_u32x4, wasm_i32x4_splat(0x007FFFFF));
|
|
195
|
+
|
|
196
|
+
// Normal path: rebias exponent (127→15), RNE round mantissa 23→10 bits
|
|
197
|
+
v128_t f16_exp_i32x4 = wasm_i32x4_sub(f32_exp_u32x4, wasm_i32x4_splat(112));
|
|
198
|
+
v128_t significand_u32x4 = wasm_v128_or(f32_mant_u32x4, wasm_i32x4_splat(0x00800000));
|
|
199
|
+
v128_t lsb_u32x4 = wasm_v128_and(wasm_u32x4_shr(significand_u32x4, 13), wasm_i32x4_splat(1));
|
|
200
|
+
v128_t rounded_u32x4 = wasm_i32x4_add(significand_u32x4, wasm_i32x4_add(wasm_i32x4_splat(0x0FFF), lsb_u32x4));
|
|
201
|
+
v128_t carry_u32x4 = wasm_u32x4_shr(rounded_u32x4, 24);
|
|
202
|
+
v128_t f16_mant_u32x4 = wasm_v128_and(wasm_u32x4_shr(rounded_u32x4, 13), wasm_i32x4_splat(0x3FF));
|
|
203
|
+
v128_t carry_mask_u32x4 = wasm_i32x4_eq(carry_u32x4, wasm_i32x4_splat(1));
|
|
204
|
+
f16_mant_u32x4 = wasm_v128_andnot(f16_mant_u32x4, carry_mask_u32x4);
|
|
205
|
+
f16_exp_i32x4 = wasm_i32x4_add(f16_exp_i32x4, carry_u32x4);
|
|
206
|
+
|
|
207
|
+
// Clamp exponent and assemble normal result
|
|
208
|
+
v128_t clamped_exp_i32x4 = wasm_i32x4_max(f16_exp_i32x4, wasm_i32x4_splat(1));
|
|
209
|
+
clamped_exp_i32x4 = wasm_i32x4_min(clamped_exp_i32x4, wasm_i32x4_splat(30));
|
|
210
|
+
v128_t normal_result_u32x4 = wasm_v128_or(sign_u32x4,
|
|
211
|
+
wasm_v128_or(wasm_i32x4_shl(clamped_exp_i32x4, 10), f16_mant_u32x4));
|
|
212
|
+
|
|
213
|
+
// Overflow → infinity
|
|
214
|
+
v128_t overflow_mask_u32x4 = wasm_i32x4_gt(f16_exp_i32x4, wasm_i32x4_splat(30));
|
|
215
|
+
v128_t inf_result_u32x4 = wasm_v128_or(sign_u32x4, wasm_i32x4_splat(0x7C00));
|
|
216
|
+
normal_result_u32x4 = wasm_i32x4_relaxed_laneselect(inf_result_u32x4, normal_result_u32x4, overflow_mask_u32x4);
|
|
217
|
+
|
|
218
|
+
// Underflow → zero (exp <= 0 after rebias, ignoring subnormals for simplicity)
|
|
219
|
+
v128_t underflow_mask_u32x4 = wasm_i32x4_lt(f16_exp_i32x4, wasm_i32x4_splat(1));
|
|
220
|
+
normal_result_u32x4 = wasm_i32x4_relaxed_laneselect(sign_u32x4, normal_result_u32x4, underflow_mask_u32x4);
|
|
221
|
+
|
|
222
|
+
// Inf/NaN passthrough: f32 exp=255
|
|
223
|
+
v128_t infnan_mask_u32x4 = wasm_i32x4_eq(f32_exp_u32x4, wasm_i32x4_splat(255));
|
|
224
|
+
v128_t nan_payload_u32x4 = wasm_v128_or(wasm_u32x4_shr(f32_mant_u32x4, 13), wasm_i32x4_splat(1));
|
|
225
|
+
v128_t mant_nonzero_u32x4 = wasm_i32x4_ne(f32_mant_u32x4, wasm_i32x4_splat(0));
|
|
226
|
+
v128_t nan_result_u32x4 = wasm_v128_or(
|
|
227
|
+
sign_u32x4, wasm_v128_or(wasm_i32x4_splat(0x7C00), wasm_v128_and(nan_payload_u32x4, mant_nonzero_u32x4)));
|
|
228
|
+
normal_result_u32x4 = wasm_i32x4_relaxed_laneselect(nan_result_u32x4, normal_result_u32x4, infnan_mask_u32x4);
|
|
229
|
+
|
|
230
|
+
// F32 zero/denorm → f16 zero
|
|
231
|
+
v128_t f32_zero_mask_u32x4 = wasm_i32x4_eq(f32_exp_u32x4, wasm_i32x4_splat(0));
|
|
232
|
+
normal_result_u32x4 = wasm_i32x4_relaxed_laneselect(sign_u32x4, normal_result_u32x4, f32_zero_mask_u32x4);
|
|
233
|
+
|
|
234
|
+
// Pack 4x u32 → 4x u16
|
|
235
|
+
v128_t packed_u16x8 = wasm_u16x8_narrow_i32x4(normal_result_u32x4, normal_result_u32x4);
|
|
236
|
+
nk_b64_vec_t result_vec;
|
|
237
|
+
result_vec.u64 = (nk_u64_t)wasm_i64x2_extract_lane(packed_u16x8, 0);
|
|
238
|
+
return result_vec;
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
/** @brief Convert f32x4 → 4x e4m3 via bit manipulation with RNE (WASM).
 *  @param hub_vec Four f32 lanes in a 128-bit register (the "f32 hub" format).
 *  @return Four e4m3 bytes packed into the low 32 bits of the result union.
 *  Overflow saturates to the max finite value (e4m3 has no infinity). */
NK_INTERNAL nk_b32_vec_t nk_f32x4_to_e4m3x4_v128relaxed_(nk_b128_vec_t hub_vec) {
    v128_t bits_u32x4 = hub_vec.v128;
    v128_t sign_u32x4 = wasm_u32x4_shr(bits_u32x4, 31); // sign bit in each lane's LSB
    v128_t f32_exp_u32x4 = wasm_v128_and(wasm_u32x4_shr(bits_u32x4, 23), wasm_i32x4_splat(0xFF));

    // RNE mantissa rounding from 23 to 3 bits: add (half − 1) + lsb-of-kept-bits
    // so exact ties round to even. The implicit leading 1 is OR'ed in at bit 23.
    v128_t significand_u32x4 = wasm_v128_or(wasm_v128_and(bits_u32x4, wasm_i32x4_splat(0x007FFFFF)),
                                            wasm_i32x4_splat(0x00800000));
    v128_t lsb_u32x4 = wasm_v128_and(wasm_u32x4_shr(significand_u32x4, 20), wasm_i32x4_splat(1));
    v128_t rounded_u32x4 = wasm_i32x4_add(significand_u32x4, wasm_i32x4_add(wasm_i32x4_splat(0x0007FFFF), lsb_u32x4));
    v128_t carry_u32x4 = wasm_u32x4_shr(rounded_u32x4, 24); // 1 iff rounding overflowed past the hidden bit
    v128_t f32_mant_u32x4 = wasm_v128_and(wasm_u32x4_shr(rounded_u32x4, 20), wasm_i32x4_splat(0x07));
    v128_t carry_mask_u32x4 = wasm_i32x4_eq(carry_u32x4, wasm_i32x4_splat(1));
    f32_mant_u32x4 = wasm_v128_andnot(f32_mant_u32x4, carry_mask_u32x4); // carry ⇒ mantissa resets to 0
    // Rebias: e4m3 bias 7, f32 bias 127 ⇒ subtract 120; a rounding carry bumps the exponent.
    v128_t e4m3_exp_i32x4 = wasm_i32x4_sub(wasm_i32x4_add(f32_exp_u32x4, carry_u32x4), wasm_i32x4_splat(120));

    v128_t is_subnormal_u32x4 = wasm_i32x4_lt(e4m3_exp_i32x4, wasm_i32x4_splat(1));
    v128_t overflow_u32x4 = wasm_i32x4_gt(e4m3_exp_i32x4, wasm_i32x4_splat(15));

    // Normal path: clamp exponent to [1, 15]. At exp 15 the mantissa caps at 6,
    // because exp=15/mant=7 (0x7F) encodes NaN in e4m3; overflow saturates to the
    // max finite value. NOTE(review): f32 Inf/NaN inputs also take this path and
    // saturate to max finite — confirm non-NaN-preserving behavior is intended.
    v128_t clamped_exp_i32x4 = wasm_i32x4_max(e4m3_exp_i32x4, wasm_i32x4_splat(1));
    clamped_exp_i32x4 = wasm_i32x4_min(clamped_exp_i32x4, wasm_i32x4_splat(15));
    v128_t is_max_exp_u32x4 = wasm_i32x4_eq(clamped_exp_i32x4, wasm_i32x4_splat(15));
    v128_t max_mant_u32x4 = wasm_i32x4_relaxed_laneselect(wasm_i32x4_splat(6), wasm_i32x4_splat(7), is_max_exp_u32x4);
    v128_t normal_mant_u32x4 = wasm_i32x4_min(f32_mant_u32x4, max_mant_u32x4);
    normal_mant_u32x4 = wasm_i32x4_relaxed_laneselect(wasm_i32x4_splat(0x06), normal_mant_u32x4, overflow_u32x4);
    v128_t normal_u32x4 = wasm_v128_or(wasm_i32x4_shl(sign_u32x4, 7),
                                       wasm_v128_or(wasm_i32x4_shl(clamped_exp_i32x4, 3), normal_mant_u32x4));

    // Subnormal path: the smallest e4m3 step is 2^-9, so the mantissa is
    // round(|x| * 512). A rounded value of 8 promotes to the first normal (exp=1, mant=0).
    v128_t abs_f32x4 = wasm_v128_and(hub_vec.v128, wasm_i32x4_splat(0x7FFFFFFF));
    v128_t scaled_f32x4 = wasm_f32x4_mul((v128_t)abs_f32x4, wasm_f32x4_splat(512.0f));
    v128_t sub_mant_i32x4 = wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_nearest(scaled_f32x4));
    v128_t promotes_u32x4 = wasm_i32x4_gt(sub_mant_i32x4, wasm_i32x4_splat(7));
    sub_mant_i32x4 = wasm_i32x4_min(sub_mant_i32x4, wasm_i32x4_splat(7));
    sub_mant_i32x4 = wasm_i32x4_max(sub_mant_i32x4, wasm_i32x4_splat(0));
    v128_t subnormal_u32x4 = wasm_v128_or(wasm_i32x4_shl(sign_u32x4, 7), sub_mant_i32x4);
    v128_t first_normal_u32x4 = wasm_v128_or(wasm_i32x4_shl(sign_u32x4, 7), wasm_i32x4_splat(0x08));
    subnormal_u32x4 = wasm_i32x4_relaxed_laneselect(first_normal_u32x4, subnormal_u32x4, promotes_u32x4);

    v128_t e4m3_u32x4 = wasm_i32x4_relaxed_laneselect(subnormal_u32x4, normal_u32x4, is_subnormal_u32x4);

    // Pack 4x u32 → 4x u8; the four result bytes land in the low 32 bits
    v128_t packed_u16 = wasm_u16x8_narrow_i32x4(e4m3_u32x4, e4m3_u32x4);
    v128_t packed_u8 = wasm_u8x16_narrow_i16x8(packed_u16, packed_u16);
    nk_b32_vec_t result_vec;
    result_vec.u32 = (nk_u32_t)wasm_i32x4_extract_lane(packed_u8, 0);
    return result_vec;
}
|
|
291
|
+
|
|
292
|
+
/** @brief Convert f32x4 → 4x e5m2 via bit manipulation with RNE (WASM).
 *  @param hub_vec Four f32 lanes in a 128-bit register (the "f32 hub" format).
 *  @return Four e5m2 bytes packed into the low 32 bits of the result union.
 *  Overflow produces ±infinity (e5m2 reserves exp=31/mant=0 for Inf). */
NK_INTERNAL nk_b32_vec_t nk_f32x4_to_e5m2x4_v128relaxed_(nk_b128_vec_t hub_vec) {
    v128_t bits_u32x4 = hub_vec.v128;
    v128_t sign_u32x4 = wasm_u32x4_shr(bits_u32x4, 31); // sign bit in each lane's LSB
    v128_t f32_exp_u32x4 = wasm_v128_and(wasm_u32x4_shr(bits_u32x4, 23), wasm_i32x4_splat(0xFF));

    // RNE mantissa rounding from 23 to 2 bits: add (half − 1) + lsb-of-kept-bits
    // so ties round to even. The implicit leading 1 is OR'ed in at bit 23.
    v128_t significand_u32x4 = wasm_v128_or(wasm_v128_and(bits_u32x4, wasm_i32x4_splat(0x007FFFFF)),
                                            wasm_i32x4_splat(0x00800000));
    v128_t lsb_u32x4 = wasm_v128_and(wasm_u32x4_shr(significand_u32x4, 21), wasm_i32x4_splat(1));
    v128_t rounded_u32x4 = wasm_i32x4_add(significand_u32x4, wasm_i32x4_add(wasm_i32x4_splat(0x000FFFFF), lsb_u32x4));
    v128_t carry_u32x4 = wasm_u32x4_shr(rounded_u32x4, 24); // 1 iff rounding overflowed past the hidden bit
    v128_t f32_mant_u32x4 = wasm_v128_and(wasm_u32x4_shr(rounded_u32x4, 21), wasm_i32x4_splat(0x03));
    v128_t carry_mask_u32x4 = wasm_i32x4_eq(carry_u32x4, wasm_i32x4_splat(1));
    f32_mant_u32x4 = wasm_v128_andnot(f32_mant_u32x4, carry_mask_u32x4); // carry ⇒ mantissa resets to 0
    // Rebias: e5m2 bias 15, f32 bias 127 ⇒ subtract 112; a rounding carry bumps the exponent.
    v128_t e5m2_exp_i32x4 = wasm_i32x4_sub(wasm_i32x4_add(f32_exp_u32x4, carry_u32x4), wasm_i32x4_splat(112));

    v128_t is_subnormal_u32x4 = wasm_i32x4_lt(e5m2_exp_i32x4, wasm_i32x4_splat(1));
    v128_t overflow_u32x4 = wasm_i32x4_gt(e5m2_exp_i32x4, wasm_i32x4_splat(31));

    // Normal path: overflow → infinity (exp=31, mant=0).
    // NOTE(review): f32 NaN inputs also overflow here and become Inf rather
    // than an e5m2 NaN — confirm this is the intended behavior.
    v128_t clamped_exp_i32x4 = wasm_i32x4_max(e5m2_exp_i32x4, wasm_i32x4_splat(1));
    clamped_exp_i32x4 = wasm_i32x4_min(clamped_exp_i32x4, wasm_i32x4_splat(31));
    v128_t normal_mant_u32x4 = wasm_i32x4_relaxed_laneselect(wasm_i32x4_splat(0), f32_mant_u32x4, overflow_u32x4);
    v128_t normal_u32x4 = wasm_v128_or(wasm_i32x4_shl(sign_u32x4, 7),
                                       wasm_v128_or(wasm_i32x4_shl(clamped_exp_i32x4, 2), normal_mant_u32x4));

    // Subnormal path: the smallest e5m2 step is 2^-16, so the mantissa is
    // round(|x| * 65536). A rounded value of 4 promotes to the first normal (exp=1, mant=0).
    v128_t abs_f32x4 = wasm_v128_and(hub_vec.v128, wasm_i32x4_splat(0x7FFFFFFF));
    v128_t scaled_f32x4 = wasm_f32x4_mul((v128_t)abs_f32x4, wasm_f32x4_splat(65536.0f));
    v128_t sub_mant_i32x4 = wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_nearest(scaled_f32x4));
    v128_t promotes_u32x4 = wasm_i32x4_gt(sub_mant_i32x4, wasm_i32x4_splat(3));
    sub_mant_i32x4 = wasm_i32x4_min(sub_mant_i32x4, wasm_i32x4_splat(3));
    sub_mant_i32x4 = wasm_i32x4_max(sub_mant_i32x4, wasm_i32x4_splat(0));
    v128_t subnormal_u32x4 = wasm_v128_or(wasm_i32x4_shl(sign_u32x4, 7), sub_mant_i32x4);
    v128_t first_normal_u32x4 = wasm_v128_or(wasm_i32x4_shl(sign_u32x4, 7), wasm_i32x4_splat(0x04));
    subnormal_u32x4 = wasm_i32x4_relaxed_laneselect(first_normal_u32x4, subnormal_u32x4, promotes_u32x4);

    v128_t e5m2_u32x4 = wasm_i32x4_relaxed_laneselect(subnormal_u32x4, normal_u32x4, is_subnormal_u32x4);

    // Pack 4x u32 → 4x u8; the four result bytes land in the low 32 bits
    v128_t packed_u16 = wasm_u16x8_narrow_i32x4(e5m2_u32x4, e5m2_u32x4);
    v128_t packed_u8 = wasm_u8x16_narrow_i16x8(packed_u16, packed_u16);
    nk_b32_vec_t result_vec;
    result_vec.u32 = (nk_u32_t)wasm_i32x4_extract_lane(packed_u8, 0);
    return result_vec;
}
|
|
338
|
+
|
|
339
|
+
/** @brief Convert f32x4 → 4x e2m3 via bit manipulation with RNE (WASM).
 *  @param hub_vec Four f32 lanes in a 128-bit register (the "f32 hub" format).
 *  @return Four 6-bit e2m3 values (one per byte) in the low 32 bits of the union.
 *  e2m3 has no Inf/NaN encodings; overflow saturates to the max finite value. */
NK_INTERNAL nk_b32_vec_t nk_f32x4_to_e2m3x4_v128relaxed_(nk_b128_vec_t hub_vec) {
    v128_t bits_u32x4 = hub_vec.v128;
    v128_t sign_u32x4 = wasm_u32x4_shr(bits_u32x4, 31); // sign bit in each lane's LSB
    v128_t f32_exp_u32x4 = wasm_v128_and(wasm_u32x4_shr(bits_u32x4, 23), wasm_i32x4_splat(0xFF));

    // RNE mantissa rounding from 23 to 3 bits: add (half − 1) + lsb-of-kept-bits.
    v128_t significand_u32x4 = wasm_v128_or(wasm_v128_and(bits_u32x4, wasm_i32x4_splat(0x007FFFFF)),
                                            wasm_i32x4_splat(0x00800000));
    v128_t lsb_u32x4 = wasm_v128_and(wasm_u32x4_shr(significand_u32x4, 20), wasm_i32x4_splat(1));
    v128_t rounded_u32x4 = wasm_i32x4_add(significand_u32x4, wasm_i32x4_add(wasm_i32x4_splat(0x0007FFFF), lsb_u32x4));
    v128_t carry_u32x4 = wasm_u32x4_shr(rounded_u32x4, 24); // 1 iff rounding overflowed past the hidden bit
    v128_t f32_mant_u32x4 = wasm_v128_and(wasm_u32x4_shr(rounded_u32x4, 20), wasm_i32x4_splat(0x07));
    v128_t carry_mask_u32x4 = wasm_i32x4_eq(carry_u32x4, wasm_i32x4_splat(1));
    f32_mant_u32x4 = wasm_v128_andnot(f32_mant_u32x4, carry_mask_u32x4); // carry ⇒ mantissa resets to 0
    // Rebias: e2m3 bias 1, f32 bias 127 ⇒ subtract 126; a rounding carry bumps the exponent.
    v128_t e2m3_exp_i32x4 = wasm_i32x4_sub(wasm_i32x4_add(f32_exp_u32x4, carry_u32x4), wasm_i32x4_splat(126));

    v128_t is_subnormal_u32x4 = wasm_i32x4_lt(e2m3_exp_i32x4, wasm_i32x4_splat(1));
    v128_t overflow_u32x4 = wasm_i32x4_gt(e2m3_exp_i32x4, wasm_i32x4_splat(3));

    // Normal path: clamp exponent to [1, 3]; on overflow force mantissa 7 so the
    // result saturates to max finite. Sign sits at bit 5 of the 6-bit encoding.
    v128_t clamped_exp_i32x4 = wasm_i32x4_max(e2m3_exp_i32x4, wasm_i32x4_splat(1));
    clamped_exp_i32x4 = wasm_i32x4_min(clamped_exp_i32x4, wasm_i32x4_splat(3));
    v128_t normal_mant_u32x4 = wasm_i32x4_relaxed_laneselect(wasm_i32x4_splat(0x07), f32_mant_u32x4, overflow_u32x4);
    v128_t normal_u32x4 = wasm_v128_or(wasm_i32x4_shl(sign_u32x4, 5),
                                       wasm_v128_or(wasm_i32x4_shl(clamped_exp_i32x4, 3), normal_mant_u32x4));

    // Subnormal path: smallest e2m3 step is 2^-3, so the mantissa is round(|x| * 8).
    // A rounded value of 8 promotes to the first normal (exp=1, mant=0).
    v128_t abs_f32x4 = wasm_v128_and(hub_vec.v128, wasm_i32x4_splat(0x7FFFFFFF));
    v128_t scaled_f32x4 = wasm_f32x4_mul((v128_t)abs_f32x4, wasm_f32x4_splat(8.0f));
    v128_t sub_mant_i32x4 = wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_nearest(scaled_f32x4));
    v128_t promotes_u32x4 = wasm_i32x4_gt(sub_mant_i32x4, wasm_i32x4_splat(7));
    sub_mant_i32x4 = wasm_i32x4_min(sub_mant_i32x4, wasm_i32x4_splat(7));
    sub_mant_i32x4 = wasm_i32x4_max(sub_mant_i32x4, wasm_i32x4_splat(0));
    v128_t subnormal_u32x4 = wasm_v128_or(wasm_i32x4_shl(sign_u32x4, 5), sub_mant_i32x4);
    v128_t first_normal_u32x4 = wasm_v128_or(wasm_i32x4_shl(sign_u32x4, 5), wasm_i32x4_splat(0x08));
    subnormal_u32x4 = wasm_i32x4_relaxed_laneselect(first_normal_u32x4, subnormal_u32x4, promotes_u32x4);

    v128_t e2m3_u32x4 = wasm_i32x4_relaxed_laneselect(subnormal_u32x4, normal_u32x4, is_subnormal_u32x4);

    // Pack 4x u32 → 4x u8; the four result bytes land in the low 32 bits
    v128_t packed_u16 = wasm_u16x8_narrow_i32x4(e2m3_u32x4, e2m3_u32x4);
    v128_t packed_u8 = wasm_u8x16_narrow_i16x8(packed_u16, packed_u16);
    nk_b32_vec_t result_vec;
    result_vec.u32 = (nk_u32_t)wasm_i32x4_extract_lane(packed_u8, 0);
    return result_vec;
}
|
|
382
|
+
|
|
383
|
+
/** @brief Convert f32x4 → 4x e3m2 via bit manipulation with RNE (WASM).
 *  @param hub_vec Four f32 lanes in a 128-bit register (the "f32 hub" format).
 *  @return Four 6-bit e3m2 values (one per byte) in the low 32 bits of the union.
 *  e3m2 has no Inf/NaN encodings; overflow saturates to the max finite value. */
NK_INTERNAL nk_b32_vec_t nk_f32x4_to_e3m2x4_v128relaxed_(nk_b128_vec_t hub_vec) {
    v128_t bits_u32x4 = hub_vec.v128;
    v128_t sign_u32x4 = wasm_u32x4_shr(bits_u32x4, 31); // sign bit in each lane's LSB
    v128_t f32_exp_u32x4 = wasm_v128_and(wasm_u32x4_shr(bits_u32x4, 23), wasm_i32x4_splat(0xFF));

    // RNE mantissa rounding from 23 to 2 bits: add (half − 1) + lsb-of-kept-bits.
    v128_t significand_u32x4 = wasm_v128_or(wasm_v128_and(bits_u32x4, wasm_i32x4_splat(0x007FFFFF)),
                                            wasm_i32x4_splat(0x00800000));
    v128_t lsb_u32x4 = wasm_v128_and(wasm_u32x4_shr(significand_u32x4, 21), wasm_i32x4_splat(1));
    v128_t rounded_u32x4 = wasm_i32x4_add(significand_u32x4, wasm_i32x4_add(wasm_i32x4_splat(0x000FFFFF), lsb_u32x4));
    v128_t carry_u32x4 = wasm_u32x4_shr(rounded_u32x4, 24); // 1 iff rounding overflowed past the hidden bit
    v128_t f32_mant_u32x4 = wasm_v128_and(wasm_u32x4_shr(rounded_u32x4, 21), wasm_i32x4_splat(0x03));
    v128_t carry_mask_u32x4 = wasm_i32x4_eq(carry_u32x4, wasm_i32x4_splat(1));
    f32_mant_u32x4 = wasm_v128_andnot(f32_mant_u32x4, carry_mask_u32x4); // carry ⇒ mantissa resets to 0
    // Rebias: e3m2 bias 3, f32 bias 127 ⇒ subtract 124; a rounding carry bumps the exponent.
    v128_t e3m2_exp_i32x4 = wasm_i32x4_sub(wasm_i32x4_add(f32_exp_u32x4, carry_u32x4), wasm_i32x4_splat(124));

    v128_t is_subnormal_u32x4 = wasm_i32x4_lt(e3m2_exp_i32x4, wasm_i32x4_splat(1));
    v128_t overflow_u32x4 = wasm_i32x4_gt(e3m2_exp_i32x4, wasm_i32x4_splat(7));

    // Normal path: clamp exponent to [1, 7]; on overflow force mantissa 3 so the
    // result saturates to max finite. Sign sits at bit 5 of the 6-bit encoding.
    v128_t clamped_exp_i32x4 = wasm_i32x4_max(e3m2_exp_i32x4, wasm_i32x4_splat(1));
    clamped_exp_i32x4 = wasm_i32x4_min(clamped_exp_i32x4, wasm_i32x4_splat(7));
    v128_t normal_mant_u32x4 = wasm_i32x4_relaxed_laneselect(wasm_i32x4_splat(0x03), f32_mant_u32x4, overflow_u32x4);
    v128_t normal_u32x4 = wasm_v128_or(wasm_i32x4_shl(sign_u32x4, 5),
                                       wasm_v128_or(wasm_i32x4_shl(clamped_exp_i32x4, 2), normal_mant_u32x4));

    // Subnormal path: smallest e3m2 step is 2^-4, so the mantissa is round(|x| * 16).
    // A rounded value of 4 promotes to the first normal (exp=1, mant=0).
    v128_t abs_f32x4 = wasm_v128_and(hub_vec.v128, wasm_i32x4_splat(0x7FFFFFFF));
    v128_t scaled_f32x4 = wasm_f32x4_mul((v128_t)abs_f32x4, wasm_f32x4_splat(16.0f));
    v128_t sub_mant_i32x4 = wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_nearest(scaled_f32x4));
    v128_t promotes_u32x4 = wasm_i32x4_gt(sub_mant_i32x4, wasm_i32x4_splat(3));
    sub_mant_i32x4 = wasm_i32x4_min(sub_mant_i32x4, wasm_i32x4_splat(3));
    sub_mant_i32x4 = wasm_i32x4_max(sub_mant_i32x4, wasm_i32x4_splat(0));
    v128_t subnormal_u32x4 = wasm_v128_or(wasm_i32x4_shl(sign_u32x4, 5), sub_mant_i32x4);
    v128_t first_normal_u32x4 = wasm_v128_or(wasm_i32x4_shl(sign_u32x4, 5), wasm_i32x4_splat(0x04));
    subnormal_u32x4 = wasm_i32x4_relaxed_laneselect(first_normal_u32x4, subnormal_u32x4, promotes_u32x4);

    v128_t e3m2_u32x4 = wasm_i32x4_relaxed_laneselect(subnormal_u32x4, normal_u32x4, is_subnormal_u32x4);

    // Pack 4x u32 → 4x u8; the four result bytes land in the low 32 bits
    v128_t packed_u16 = wasm_u16x8_narrow_i32x4(e3m2_u32x4, e3m2_u32x4);
    v128_t packed_u8 = wasm_u8x16_narrow_i16x8(packed_u16, packed_u16);
    nk_b32_vec_t result_vec;
    result_vec.u32 = (nk_u32_t)wasm_i32x4_extract_lane(packed_u8, 0);
    return result_vec;
}
|
|
426
|
+
|
|
427
|
+
/** @brief Convert f32x4 → 4x i8 with saturation (WASM). */
|
|
428
|
+
NK_INTERNAL nk_b32_vec_t nk_f32x4_to_i8x4_v128relaxed_(nk_b128_vec_t hub_vec) {
|
|
429
|
+
v128_t clamped_f32x4 = wasm_f32x4_min(wasm_f32x4_max(hub_vec.v128, wasm_f32x4_splat(-128.0f)),
|
|
430
|
+
wasm_f32x4_splat(127.0f));
|
|
431
|
+
v128_t result_i32x4 = wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_nearest(clamped_f32x4));
|
|
432
|
+
v128_t result_i16x8 = wasm_i16x8_narrow_i32x4(result_i32x4, result_i32x4);
|
|
433
|
+
v128_t result_i8x16 = wasm_i8x16_narrow_i16x8(result_i16x8, result_i16x8);
|
|
434
|
+
nk_b32_vec_t result_vec;
|
|
435
|
+
result_vec.u32 = (nk_u32_t)wasm_i32x4_extract_lane(result_i8x16, 0);
|
|
436
|
+
return result_vec;
|
|
437
|
+
}
|
|
438
|
+
|
|
439
|
+
/** @brief Convert f32x4 → 4x u8 with saturation (WASM). */
|
|
440
|
+
NK_INTERNAL nk_b32_vec_t nk_f32x4_to_u8x4_v128relaxed_(nk_b128_vec_t hub_vec) {
|
|
441
|
+
v128_t clamped_f32x4 = wasm_f32x4_min(wasm_f32x4_max(hub_vec.v128, wasm_f32x4_splat(0.0f)),
|
|
442
|
+
wasm_f32x4_splat(255.0f));
|
|
443
|
+
v128_t result_u32x4 = wasm_u32x4_trunc_sat_f32x4(wasm_f32x4_nearest(clamped_f32x4));
|
|
444
|
+
v128_t result_u16x8 = wasm_u16x8_narrow_i32x4(result_u32x4, result_u32x4);
|
|
445
|
+
v128_t result_u8x16 = wasm_u8x16_narrow_i16x8(result_u16x8, result_u16x8);
|
|
446
|
+
nk_b32_vec_t result_vec;
|
|
447
|
+
result_vec.u32 = (nk_u32_t)wasm_i32x4_extract_lane(result_u8x16, 0);
|
|
448
|
+
return result_vec;
|
|
449
|
+
}
|
|
450
|
+
|
|
451
|
+
/** @brief Cast `n` elements from `from_type` to `to_type` using WASM relaxed-SIMD,
 *  routing every conversion through an f32x4 "hub": upcast 4 inputs to f32x4,
 *  then downcast to the target type. Unsupported type pairs and the final
 *  sub-4-element tail fall back to `nk_cast_serial`. */
NK_PUBLIC void nk_cast_v128relaxed(void const *from, nk_dtype_t from_type, nk_size_t n, void *to, nk_dtype_t to_type) {
    // Same-type fast path: plain byte copy, rounding sub-byte bit totals up to whole bytes
    if (from_type == to_type) {
        nk_size_t size_bits = nk_dtype_bits(from_type);
        if (size_bits > 0) nk_copy_bytes_(to, from, nk_size_divide_round_up_(n * size_bits, 8));
        return;
    }

    // Validate supported types: only the 1-, 2-, and 4-byte formats listed here
    // have SIMD up/downcast kernels; anything else is delegated to the serial path
    int from_ok = (from_type == nk_f32_k || from_type == nk_f16_k || from_type == nk_bf16_k || from_type == nk_e4m3_k ||
                   from_type == nk_e5m2_k || from_type == nk_e2m3_k || from_type == nk_e3m2_k || from_type == nk_i8_k ||
                   from_type == nk_u8_k);
    int to_ok = (to_type == nk_f32_k || to_type == nk_f16_k || to_type == nk_bf16_k || to_type == nk_e4m3_k ||
                 to_type == nk_e5m2_k || to_type == nk_e2m3_k || to_type == nk_e3m2_k || to_type == nk_i8_k ||
                 to_type == nk_u8_k);

    if (!from_ok || !to_ok) {
        nk_cast_serial(from, from_type, n, to, to_type);
        return;
    }

    // F32 hub: 4 elements per iteration
    nk_size_t batches = n / 4;
    nk_size_t tail = n % 4;
    nk_size_t from_step = 4 * nk_dtype_bits(from_type) / 8; // bytes consumed per batch
    nk_size_t to_step = 4 * nk_dtype_bits(to_type) / 8;     // bytes produced per batch
    nk_u8_t const *from_ptr = (nk_u8_t const *)from;
    nk_u8_t *to_ptr = (nk_u8_t *)to;

    for (nk_size_t idx = 0; idx < batches; ++idx, from_ptr += from_step, to_ptr += to_step) {
        nk_b128_vec_t hub_vec;

        // Upcast to f32x4 hub using size-appropriate loads:
        // 16 B/batch = f32 passthrough, 8 B = 16-bit types, 4 B = 8-bit types
        if (from_step == 16) { hub_vec.v128 = wasm_v128_load(from_ptr); }
        else if (from_step == 8) {
            nk_b64_vec_t raw64_vec;
            raw64_vec.u64 = (nk_u64_t)wasm_i64x2_extract_lane(wasm_v128_load64_zero(from_ptr), 0);
            switch (from_type) {
            case nk_f16_k: hub_vec = nk_f16x4_to_f32x4_v128relaxed_(raw64_vec); break;
            case nk_bf16_k: hub_vec = nk_bf16x4_to_f32x4_v128relaxed_(raw64_vec); break;
            default: break; // unreachable: only f16/bf16 are 2-byte types in the validated set
            }
        }
        else if (from_step == 4) {
            nk_b32_vec_t raw32_vec;
            raw32_vec.u32 = (nk_u32_t)wasm_i32x4_extract_lane(wasm_v128_load32_zero(from_ptr), 0);
            switch (from_type) {
            case nk_e4m3_k: hub_vec = nk_e4m3x4_to_f32x4_v128relaxed_(raw32_vec); break;
            case nk_e5m2_k: hub_vec = nk_e5m2x4_to_f32x4_v128relaxed_(raw32_vec); break;
            case nk_e2m3_k: hub_vec = nk_e2m3x4_to_f32x4_v128relaxed_(raw32_vec); break;
            case nk_e3m2_k: hub_vec = nk_e3m2x4_to_f32x4_v128relaxed_(raw32_vec); break;
            case nk_i8_k: hub_vec = nk_i8x4_to_f32x4_v128relaxed_(raw32_vec); break;
            case nk_u8_k: hub_vec = nk_u8x4_to_f32x4_v128relaxed_(raw32_vec); break;
            default: break; // unreachable: validated above
            }
        }
        else hub_vec.v128 = wasm_f32x4_splat(0); // unreachable: every validated type is 1/2/4 bytes

        // Downcast from f32x4 hub and store using half-register stores
        switch (to_type) {
        case nk_f32_k: wasm_v128_store(to_ptr, hub_vec.v128); break;
        case nk_f16_k: *(nk_u64_t *)to_ptr = nk_f32x4_to_f16x4_v128relaxed_(hub_vec).u64; break;
        case nk_bf16_k: *(nk_u64_t *)to_ptr = nk_f32x4_to_bf16x4_v128relaxed_(hub_vec).u64; break;
        case nk_e4m3_k: *(nk_u32_t *)to_ptr = nk_f32x4_to_e4m3x4_v128relaxed_(hub_vec).u32; break;
        case nk_e5m2_k: *(nk_u32_t *)to_ptr = nk_f32x4_to_e5m2x4_v128relaxed_(hub_vec).u32; break;
        case nk_e2m3_k: *(nk_u32_t *)to_ptr = nk_f32x4_to_e2m3x4_v128relaxed_(hub_vec).u32; break;
        case nk_e3m2_k: *(nk_u32_t *)to_ptr = nk_f32x4_to_e3m2x4_v128relaxed_(hub_vec).u32; break;
        case nk_i8_k: *(nk_u32_t *)to_ptr = nk_f32x4_to_i8x4_v128relaxed_(hub_vec).u32; break;
        case nk_u8_k: *(nk_u32_t *)to_ptr = nk_f32x4_to_u8x4_v128relaxed_(hub_vec).u32; break;
        default: break; // unreachable: validated above
        }
    }

    // Handle tail elements with serial fallback (pointers already advanced past the batches)
    if (tail) nk_cast_serial(from_ptr, from_type, tail, to_ptr, to_type);
}
|
|
170
527
|
|
|
171
528
|
#if defined(__clang__)
|