numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315)
  1. package/README.md +197 -124
  2. package/binding.gyp +34 -484
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -36,7 +36,7 @@ NK_INTERNAL void nk_store_b256_v128relaxed_(nk_b256_vec_t const *src, void *dst)
36
36
 
37
37
  /** @brief BF16 is the upper 16 bits of F32, so zero-extend to u32 and shift left by 16. */
38
38
  NK_INTERNAL nk_b128_vec_t nk_bf16x4_to_f32x4_v128relaxed_(nk_b64_vec_t bf16_vec) {
39
- v128_t bf16_u16x4_in_u64 = wasm_v128_load64_zero(&bf16_vec.u64);
39
+ v128_t bf16_u16x4_in_u64 = wasm_i64x2_splat(bf16_vec.u64);
40
40
  v128_t bf16_u32x4_low = wasm_u32x4_extend_low_u16x8(bf16_u16x4_in_u64);
41
41
  nk_b128_vec_t result;
42
42
  result.v128 = wasm_i32x4_shl(bf16_u32x4_low, 16);
@@ -44,58 +44,38 @@ NK_INTERNAL nk_b128_vec_t nk_bf16x4_to_f32x4_v128relaxed_(nk_b64_vec_t bf16_vec)
44
44
  }
45
45
 
46
46
  /**
47
- * @brief F16→F32: extract sign/exp/mantissa, rebias exponent (F16 bias=15, F32 bias=127, delta=112),
48
- * widen mantissa from 10 to 23 bits. Early-exit when all lanes are normal (exp in [1,30]),
49
- * skipping the expensive f32x4.convert_u32x4 needed for denormal FPU-based normalization.
47
+ * @brief F16→F32 via Giesen's magic-number multiply trick.
48
+ * @see https://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
49
+ *
50
+ * Shifts the 15-bit magnitude into F32 exponent+mantissa position, then multiplies
51
+ * by 2^112 (magic = 0x77800000) to rebias the exponent. This single multiply also
52
+ * correctly normalizes F16 subnormals into F32 normals — no branching or FPU
53
+ * integer-to-float conversion needed. Inf/NaN (exp=31) overflows the multiply and
54
+ * is fixed with a comparison + blend.
50
55
  */
51
56
  NK_INTERNAL nk_b128_vec_t nk_f16x4_to_f32x4_v128relaxed_(nk_b64_vec_t f16_vec) {
52
- v128_t f16_u16x4_in_u64 = wasm_v128_load64_zero(&f16_vec.u64);
53
- v128_t f16_u32x4 = wasm_u32x4_extend_low_u16x8(f16_u16x4_in_u64);
54
-
55
- v128_t sign_u32x4 = wasm_v128_and(f16_u32x4, wasm_i32x4_splat(0x8000)); // bit 15
56
- v128_t exp_u32x4 = wasm_v128_and(wasm_u32x4_shr(f16_u32x4, 10), wasm_i32x4_splat(0x1F)); // bits 14-10
57
- v128_t mant_u32x4 = wasm_v128_and(f16_u32x4, wasm_i32x4_splat(0x03FF)); // bits 9-0
58
-
59
- v128_t sign_f32_u32x4 = wasm_i32x4_shl(sign_u32x4, 16); // shift sign to F32 bit 31
60
-
61
- // Normal path: rebias exponent, widen mantissa
62
- v128_t exp_rebiased_u32x4 = wasm_i32x4_add(exp_u32x4, wasm_i32x4_splat(112));
63
- v128_t normal_exp_u32x4 = wasm_i32x4_shl(exp_rebiased_u32x4, 23);
64
- v128_t normal_mant_u32x4 = wasm_i32x4_shl(mant_u32x4, 13);
65
- v128_t normal_bits_u32x4 = wasm_v128_or(sign_f32_u32x4, wasm_v128_or(normal_exp_u32x4, normal_mant_u32x4));
66
-
67
- // Early exit: skip zero/denormal/inf/NaN handling when all lanes are normal
68
- v128_t exp_zero_mask = wasm_i32x4_eq(exp_u32x4, wasm_i32x4_splat(0));
69
- v128_t exp_max_mask = wasm_i32x4_eq(exp_u32x4, wasm_i32x4_splat(31));
70
- v128_t exceptional_mask = wasm_v128_or(exp_zero_mask, exp_max_mask);
71
- if (!wasm_v128_any_true(exceptional_mask)) {
72
- nk_b128_vec_t result;
73
- result.v128 = normal_bits_u32x4;
74
- return result;
75
- }
57
+ v128_t raw_u16x4_in_u64 = wasm_i64x2_splat(f16_vec.u64);
58
+ v128_t raw_u32x4 = wasm_u32x4_extend_low_u16x8(raw_u16x4_in_u64);
59
+
60
+ // Extract sign and unsigned magnitude
61
+ v128_t sign_u32x4 = wasm_v128_and(raw_u32x4, wasm_i32x4_splat(0x8000));
62
+ v128_t sign_f32_u32x4 = wasm_i32x4_shl(sign_u32x4, 16);
63
+ v128_t magnitude_u32x4 = wasm_v128_and(raw_u32x4, wasm_i32x4_splat(0x7FFF));
64
+
65
+ // Shift mantissa+exponent into F32 position and multiply by magic 2^112
66
+ v128_t shifted_u32x4 = wasm_i32x4_shl(magnitude_u32x4, 13);
67
+ v128_t magic_f32x4 = wasm_i32x4_splat(0x77800000);
68
+ v128_t rebiased_f32x4 = wasm_f32x4_mul((v128_t)shifted_u32x4, (v128_t)magic_f32x4);
76
69
 
77
- // Slow path: handle zero (exp=0, mant=0), denormal (exp=0, mant!=0), inf/NaN (exp=31)
78
- v128_t zero_bits_u32x4 = sign_f32_u32x4;
79
- v128_t inf_nan_bits_u32x4 = wasm_v128_or(
80
- sign_f32_u32x4, wasm_v128_or(wasm_i32x4_splat(0x7F800000), wasm_i32x4_shl(mant_u32x4, 13)));
81
-
82
- // Denormals: convert mantissa to f32 and multiply by 2^-24, letting the FPU normalize.
83
- // This avoids a manual CLZ+shift loop. The f32x4.convert_u32x4 legalizes to a
84
- // multi-instruction sequence on x86 (no native u32→f32 until AVX-512), which is why
85
- // the early exit above is so valuable.
86
- v128_t mant_f32x4 = wasm_f32x4_convert_u32x4(mant_u32x4);
87
- v128_t denorm_normalized_f32x4 = wasm_f32x4_mul(mant_f32x4, wasm_f32x4_splat(0x1p-24f));
88
- v128_t denorm_bits_u32x4 = wasm_v128_or(denorm_normalized_f32x4, sign_f32_u32x4);
89
-
90
- v128_t mant_zero_mask = wasm_i32x4_eq(mant_u32x4, wasm_i32x4_splat(0));
91
- v128_t is_zero_mask = wasm_v128_and(exp_zero_mask, mant_zero_mask);
92
- v128_t is_denormal_mask = wasm_v128_andnot(exp_zero_mask, mant_zero_mask);
93
-
94
- // Blend via relaxed_laneselect (1 instruction: vblendvps on x86, vs 3 for and/andn/or)
95
- v128_t result_u32x4 = normal_bits_u32x4;
96
- result_u32x4 = wasm_i32x4_relaxed_laneselect(zero_bits_u32x4, result_u32x4, is_zero_mask);
97
- result_u32x4 = wasm_i32x4_relaxed_laneselect(denorm_bits_u32x4, result_u32x4, is_denormal_mask);
98
- result_u32x4 = wasm_i32x4_relaxed_laneselect(inf_nan_bits_u32x4, result_u32x4, exp_max_mask);
70
+ // Fix inf/NaN: exp=31 after shift becomes 0x1F<<13 = 0x000F8000, ×2^112 overflows.
71
+ // Detect via threshold on shifted magnitude and apply direct rebias instead.
72
+ v128_t infnan_threshold_u32x4 = wasm_i32x4_splat(0x38800000);
73
+ v128_t infnan_mask_u32x4 = wasm_u32x4_ge(shifted_u32x4, infnan_threshold_u32x4);
74
+ v128_t direct_u32x4 = wasm_v128_or(shifted_u32x4, wasm_i32x4_splat(0x70000000));
75
+ v128_t result_u32x4 = wasm_i32x4_relaxed_laneselect(direct_u32x4, rebiased_f32x4, infnan_mask_u32x4);
76
+
77
+ // Apply sign
78
+ result_u32x4 = wasm_v128_or(result_u32x4, sign_f32_u32x4);
99
79
 
100
80
  nk_b128_vec_t result;
101
81
  result.v128 = result_u32x4;
@@ -103,69 +83,446 @@ NK_INTERNAL nk_b128_vec_t nk_f16x4_to_f32x4_v128relaxed_(nk_b64_vec_t f16_vec) {
103
83
  }
104
84
 
105
85
  /**
106
- * @brief E4M3→F32: 4-bit exponent (bias=7→127, delta=120), 3-bit mantissa (shift by 20).
107
- * Subnormal via FPU: mant * (1/512) = mant * 2^-9. NaN only at exp=15,mant=7.
86
+ * @brief E4M3→F32 via Giesen's magic multiply (×2^120).
87
+ * Shift 7-bit magnitude left by 20 into f32 position, multiply by 2^120 to rebias exponent.
88
+ * The multiply also normalizes subnormals. NaN fixup for magnitude 0x7F only.
108
89
  */
109
90
  NK_INTERNAL nk_b128_vec_t nk_e4m3x4_to_f32x4_v128relaxed_(nk_b32_vec_t e4m3_vec) {
110
- v128_t e4m3_u32x4 = wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(wasm_v128_load32_zero(&e4m3_vec.u32)));
111
- v128_t exp_u32x4 = wasm_v128_and(wasm_u32x4_shr(e4m3_u32x4, 3), wasm_i32x4_splat(0x0F));
112
- v128_t mant_u32x4 = wasm_v128_and(e4m3_u32x4, wasm_i32x4_splat(0x07));
113
- v128_t sign_u32x4 = wasm_i32x4_shl(wasm_u32x4_shr(e4m3_u32x4, 7), 31);
114
- v128_t f32_exp_u32x4 = wasm_i32x4_shl(wasm_i32x4_add(exp_u32x4, wasm_i32x4_splat(120)), 23);
115
- v128_t f32_mant_u32x4 = wasm_i32x4_shl(mant_u32x4, 20);
116
- v128_t normal_bits_u32x4 = wasm_v128_or(sign_u32x4, wasm_v128_or(f32_exp_u32x4, f32_mant_u32x4));
117
- v128_t subnorm_abs_f32x4 = wasm_f32x4_mul(wasm_f32x4_convert_u32x4(mant_u32x4), wasm_f32x4_splat(1.0f / 512.0f));
118
- v128_t subnorm_f32x4 = wasm_v128_or(subnorm_abs_f32x4, sign_u32x4);
119
- v128_t exp_zero_mask = wasm_i32x4_eq(exp_u32x4, wasm_i32x4_splat(0));
120
- v128_t is_nan_mask = wasm_v128_and(wasm_i32x4_eq(exp_u32x4, wasm_i32x4_splat(15)),
121
- wasm_i32x4_eq(mant_u32x4, wasm_i32x4_splat(7)));
122
- v128_t exceptional_mask = wasm_v128_or(exp_zero_mask, is_nan_mask);
123
- if (!wasm_v128_any_true(exceptional_mask)) {
124
- nk_b128_vec_t result;
125
- result.v128 = normal_bits_u32x4;
126
- return result;
127
- }
128
- v128_t result_u32x4 = wasm_i32x4_relaxed_laneselect(subnorm_f32x4, normal_bits_u32x4, exp_zero_mask);
129
- if (wasm_v128_any_true(is_nan_mask)) {
130
- v128_t nan_bits = wasm_v128_or(sign_u32x4, wasm_i32x4_splat(0x7FC00000));
131
- result_u32x4 = wasm_i32x4_relaxed_laneselect(nan_bits, result_u32x4, is_nan_mask);
132
- }
133
- nk_b128_vec_t result;
134
- result.v128 = result_u32x4;
135
- return result;
91
+ v128_t raw_u32x4 = wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(wasm_i32x4_splat(e4m3_vec.u32)));
92
+ v128_t sign_u32x4 = wasm_i32x4_shl(wasm_v128_and(raw_u32x4, wasm_i32x4_splat(0x80)), 24);
93
+ v128_t nonsign_u32x4 = wasm_v128_and(raw_u32x4, wasm_i32x4_splat(0x7F));
94
+ v128_t shifted_u32x4 = wasm_i32x4_shl(nonsign_u32x4, 20);
95
+ v128_t rebiased_f32x4 = wasm_f32x4_mul((v128_t)shifted_u32x4, (v128_t)wasm_i32x4_splat(0x7B800000)); // 2^120
96
+ v128_t is_nan_u32x4 = wasm_i32x4_eq(nonsign_u32x4, wasm_i32x4_splat(0x7F));
97
+ v128_t nan_u32x4 = wasm_v128_or(sign_u32x4, wasm_i32x4_splat(0x7FC00000));
98
+ v128_t result_u32x4 = wasm_i32x4_relaxed_laneselect(nan_u32x4, rebiased_f32x4, is_nan_u32x4);
99
+ nk_b128_vec_t result_vec;
100
+ result_vec.v128 = wasm_v128_or(result_u32x4, sign_u32x4);
101
+ return result_vec;
136
102
  }
137
103
 
138
104
  /**
139
- * @brief E5M2→F32: same exponent encoding as F16 (5-bit, bias=15, delta=112), 2-bit mantissa (shift by 21).
140
- * Subnormal via FPU: mant * (1/65536) = mant * 2^-16. Inf at exp=31,mant=0; NaN otherwise.
105
+ * @brief E5M2→F32 via Giesen's magic multiply (×2^112).
106
+ * Same exponent encoding as F16 (5-bit, bias=15). Shift 7-bit magnitude left by 21,
107
+ * multiply by 2^112 to rebias. Inf/NaN fixup for exp=31 (nonsign > 123).
141
108
  */
142
109
  NK_INTERNAL nk_b128_vec_t nk_e5m2x4_to_f32x4_v128relaxed_(nk_b32_vec_t e5m2_vec) {
143
- v128_t e5m2_u32x4 = wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(wasm_v128_load32_zero(&e5m2_vec.u32)));
144
- v128_t exp_u32x4 = wasm_v128_and(wasm_u32x4_shr(e5m2_u32x4, 2), wasm_i32x4_splat(0x1F));
145
- v128_t mant_u32x4 = wasm_v128_and(e5m2_u32x4, wasm_i32x4_splat(0x03));
146
- v128_t sign_u32x4 = wasm_i32x4_shl(wasm_u32x4_shr(e5m2_u32x4, 7), 31);
147
- v128_t f32_exp_u32x4 = wasm_i32x4_shl(wasm_i32x4_add(exp_u32x4, wasm_i32x4_splat(112)), 23);
148
- v128_t f32_mant_u32x4 = wasm_i32x4_shl(mant_u32x4, 21);
149
- v128_t normal_bits_u32x4 = wasm_v128_or(sign_u32x4, wasm_v128_or(f32_exp_u32x4, f32_mant_u32x4));
150
- v128_t subnorm_abs_f32x4 = wasm_f32x4_mul(wasm_f32x4_convert_u32x4(mant_u32x4), wasm_f32x4_splat(1.0f / 65536.0f));
151
- v128_t subnorm_f32x4 = wasm_v128_or(subnorm_abs_f32x4, sign_u32x4);
152
- v128_t exp_zero_mask = wasm_i32x4_eq(exp_u32x4, wasm_i32x4_splat(0));
153
- v128_t exp_max_mask = wasm_i32x4_eq(exp_u32x4, wasm_i32x4_splat(31));
154
- v128_t exceptional_mask = wasm_v128_or(exp_zero_mask, exp_max_mask);
155
- if (!wasm_v128_any_true(exceptional_mask)) {
156
- nk_b128_vec_t result;
157
- result.v128 = normal_bits_u32x4;
158
- return result;
110
+ v128_t raw_u32x4 = wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(wasm_i32x4_splat(e5m2_vec.u32)));
111
+ v128_t sign_u32x4 = wasm_i32x4_shl(wasm_v128_and(raw_u32x4, wasm_i32x4_splat(0x80)), 24);
112
+ v128_t nonsign_u32x4 = wasm_v128_and(raw_u32x4, wasm_i32x4_splat(0x7F));
113
+ v128_t shifted_u32x4 = wasm_i32x4_shl(nonsign_u32x4, 21);
114
+ v128_t rebiased_f32x4 = wasm_f32x4_mul((v128_t)shifted_u32x4, (v128_t)wasm_i32x4_splat(0x77800000)); // 2^112
115
+ v128_t is_infnan_u32x4 = wasm_u32x4_gt(nonsign_u32x4, wasm_i32x4_splat(123));
116
+ v128_t result_u32x4 = wasm_v128_or(rebiased_f32x4, wasm_v128_and(is_infnan_u32x4, wasm_i32x4_splat(0x7F800000)));
117
+ nk_b128_vec_t result_vec;
118
+ result_vec.v128 = wasm_v128_or(result_u32x4, sign_u32x4);
119
+ return result_vec;
120
+ }
121
+
122
+ /**
123
+ * @brief E2M3→F32 via Giesen's magic multiply (×2^126).
124
+ * S EE MMM (bias=1). Shift 5-bit magnitude left by 20, multiply by 2^126 to rebias.
125
+ * No inf/NaN in E2M3FN format, so no fixup needed.
126
+ */
127
+ NK_INTERNAL nk_b128_vec_t nk_e2m3x4_to_f32x4_v128relaxed_(nk_b32_vec_t e2m3_vec) {
128
+ v128_t raw_u32x4 = wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(wasm_i32x4_splat(e2m3_vec.u32)));
129
+ v128_t sign_u32x4 = wasm_i32x4_shl(wasm_v128_and(raw_u32x4, wasm_i32x4_splat(0x20)), 26);
130
+ v128_t nonsign_u32x4 = wasm_v128_and(raw_u32x4, wasm_i32x4_splat(0x1F));
131
+ v128_t shifted_u32x4 = wasm_i32x4_shl(nonsign_u32x4, 20);
132
+ v128_t rebiased_f32x4 = wasm_f32x4_mul((v128_t)shifted_u32x4, (v128_t)wasm_i32x4_splat(0x7E800000)); // 2^126
133
+ nk_b128_vec_t result_vec;
134
+ result_vec.v128 = wasm_v128_or(rebiased_f32x4, sign_u32x4);
135
+ return result_vec;
136
+ }
137
+
138
+ /**
139
+ * @brief E3M2→F32 via Giesen's magic multiply (×2^124).
140
+ * S EEE MM (bias=3). Shift 5-bit magnitude left by 21, multiply by 2^124 to rebias.
141
+ * No inf/NaN in E3M2FN format, so no fixup needed.
142
+ */
143
+ NK_INTERNAL nk_b128_vec_t nk_e3m2x4_to_f32x4_v128relaxed_(nk_b32_vec_t e3m2_vec) {
144
+ v128_t raw_u32x4 = wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(wasm_i32x4_splat(e3m2_vec.u32)));
145
+ v128_t sign_u32x4 = wasm_i32x4_shl(wasm_v128_and(raw_u32x4, wasm_i32x4_splat(0x20)), 26);
146
+ v128_t nonsign_u32x4 = wasm_v128_and(raw_u32x4, wasm_i32x4_splat(0x1F));
147
+ v128_t shifted_u32x4 = wasm_i32x4_shl(nonsign_u32x4, 21);
148
+ v128_t rebiased_f32x4 = wasm_f32x4_mul((v128_t)shifted_u32x4, (v128_t)wasm_i32x4_splat(0x7D800000)); // 2^124
149
+ nk_b128_vec_t result_vec;
150
+ result_vec.v128 = wasm_v128_or(rebiased_f32x4, sign_u32x4);
151
+ return result_vec;
152
+ }
153
+
154
+ /** @brief Convert 4x i8 → f32x4 (WASM). Widen i8→i16→i32, convert to f32. */
155
+ NK_INTERNAL nk_b128_vec_t nk_i8x4_to_f32x4_v128relaxed_(nk_b32_vec_t in_vec) {
156
+ v128_t in_i8x16 = wasm_i32x4_splat(in_vec.u32);
157
+ v128_t in_i16x8 = wasm_i16x8_extend_low_i8x16(in_i8x16);
158
+ v128_t in_i32x4 = wasm_i32x4_extend_low_i16x8(in_i16x8);
159
+ nk_b128_vec_t result_vec;
160
+ result_vec.v128 = wasm_f32x4_convert_i32x4(in_i32x4);
161
+ return result_vec;
162
+ }
163
+
164
+ /** @brief Convert 4x u8 → f32x4 (WASM). Widen u8→u16→u32, convert to f32. */
165
+ NK_INTERNAL nk_b128_vec_t nk_u8x4_to_f32x4_v128relaxed_(nk_b32_vec_t in_vec) {
166
+ v128_t in_u8x16 = wasm_i32x4_splat(in_vec.u32);
167
+ v128_t in_u16x8 = wasm_u16x8_extend_low_u8x16(in_u8x16);
168
+ v128_t in_u32x4 = wasm_u32x4_extend_low_u16x8(in_u16x8);
169
+ nk_b128_vec_t result_vec;
170
+ result_vec.v128 = wasm_f32x4_convert_u32x4(in_u32x4);
171
+ return result_vec;
172
+ }
173
+
174
+ /** @brief Convert f32x4 → 4x bf16 via RNE rounding (WASM). */
175
+ NK_INTERNAL nk_b64_vec_t nk_f32x4_to_bf16x4_v128relaxed_(nk_b128_vec_t hub_vec) {
176
+ v128_t bits_u32x4 = hub_vec.v128;
177
+ v128_t lsb_u32x4 = wasm_v128_and(wasm_u32x4_shr(bits_u32x4, 16), wasm_i32x4_splat(1));
178
+ v128_t rounded_u32x4 = wasm_i32x4_add(bits_u32x4, wasm_i32x4_add(wasm_i32x4_splat(0x7FFF), lsb_u32x4));
179
+ v128_t bf16_u32x4 = wasm_u32x4_shr(rounded_u32x4, 16);
180
+ v128_t packed_u16x8 = wasm_u16x8_narrow_i32x4(bf16_u32x4, bf16_u32x4);
181
+ nk_b64_vec_t result_vec;
182
+ result_vec.u64 = (nk_u64_t)wasm_i64x2_extract_lane(packed_u16x8, 0);
183
+ return result_vec;
184
+ }
185
+
186
/**
 * @brief F32→F16 via bit manipulation with RNE (WASM).
 * Handles normal, overflow (→inf), and inf/NaN cases; values below the f16
 * normal range are flushed to signed zero (f16 subnormals are not produced).
 */
NK_INTERNAL nk_b64_vec_t nk_f32x4_to_f16x4_v128relaxed_(nk_b128_vec_t hub_vec) {
    v128_t bits_u32x4 = hub_vec.v128;
    // Move the f32 sign bit (31) straight into the f16 sign position (15).
    v128_t sign_u32x4 = wasm_i32x4_shl(wasm_u32x4_shr(bits_u32x4, 31), 15);
    v128_t f32_exp_u32x4 = wasm_v128_and(wasm_u32x4_shr(bits_u32x4, 23), wasm_i32x4_splat(0xFF));
    v128_t f32_mant_u32x4 = wasm_v128_and(bits_u32x4, wasm_i32x4_splat(0x007FFFFF));

    // Normal path: rebias exponent (127→15, i.e. subtract 112), RNE round mantissa 23→10 bits
    v128_t f16_exp_i32x4 = wasm_i32x4_sub(f32_exp_u32x4, wasm_i32x4_splat(112));
    // Restore the implicit leading 1 so a rounding carry is detectable in bit 24.
    v128_t significand_u32x4 = wasm_v128_or(f32_mant_u32x4, wasm_i32x4_splat(0x00800000));
    v128_t lsb_u32x4 = wasm_v128_and(wasm_u32x4_shr(significand_u32x4, 13), wasm_i32x4_splat(1));
    // Round-to-nearest-even: add 0x0FFF plus the LSB of the kept part, then drop 13 bits.
    v128_t rounded_u32x4 = wasm_i32x4_add(significand_u32x4, wasm_i32x4_add(wasm_i32x4_splat(0x0FFF), lsb_u32x4));
    v128_t carry_u32x4 = wasm_u32x4_shr(rounded_u32x4, 24);
    v128_t f16_mant_u32x4 = wasm_v128_and(wasm_u32x4_shr(rounded_u32x4, 13), wasm_i32x4_splat(0x3FF));
    v128_t carry_mask_u32x4 = wasm_i32x4_eq(carry_u32x4, wasm_i32x4_splat(1));
    // On mantissa overflow: mantissa becomes 0 (andnot) and the exponent is bumped by one.
    f16_mant_u32x4 = wasm_v128_andnot(f16_mant_u32x4, carry_mask_u32x4);
    f16_exp_i32x4 = wasm_i32x4_add(f16_exp_i32x4, carry_u32x4);

    // Clamp exponent and assemble normal result
    v128_t clamped_exp_i32x4 = wasm_i32x4_max(f16_exp_i32x4, wasm_i32x4_splat(1));
    clamped_exp_i32x4 = wasm_i32x4_min(clamped_exp_i32x4, wasm_i32x4_splat(30));
    v128_t normal_result_u32x4 = wasm_v128_or(sign_u32x4,
                                              wasm_v128_or(wasm_i32x4_shl(clamped_exp_i32x4, 10), f16_mant_u32x4));

    // Overflow → infinity
    v128_t overflow_mask_u32x4 = wasm_i32x4_gt(f16_exp_i32x4, wasm_i32x4_splat(30));
    v128_t inf_result_u32x4 = wasm_v128_or(sign_u32x4, wasm_i32x4_splat(0x7C00));
    normal_result_u32x4 = wasm_i32x4_relaxed_laneselect(inf_result_u32x4, normal_result_u32x4, overflow_mask_u32x4);

    // Underflow → zero (exp <= 0 after rebias, ignoring subnormals for simplicity)
    v128_t underflow_mask_u32x4 = wasm_i32x4_lt(f16_exp_i32x4, wasm_i32x4_splat(1));
    normal_result_u32x4 = wasm_i32x4_relaxed_laneselect(sign_u32x4, normal_result_u32x4, underflow_mask_u32x4);

    // Inf/NaN passthrough: f32 exp=255
    v128_t infnan_mask_u32x4 = wasm_i32x4_eq(f32_exp_u32x4, wasm_i32x4_splat(255));
    // Force bit 0 so a narrowed NaN payload can never collapse to the inf pattern.
    v128_t nan_payload_u32x4 = wasm_v128_or(wasm_u32x4_shr(f32_mant_u32x4, 13), wasm_i32x4_splat(1));
    // All-ones for NaN lanes (keeps payload), zero for inf lanes (mantissa stays 0).
    v128_t mant_nonzero_u32x4 = wasm_i32x4_ne(f32_mant_u32x4, wasm_i32x4_splat(0));
    v128_t nan_result_u32x4 = wasm_v128_or(
        sign_u32x4, wasm_v128_or(wasm_i32x4_splat(0x7C00), wasm_v128_and(nan_payload_u32x4, mant_nonzero_u32x4)));
    normal_result_u32x4 = wasm_i32x4_relaxed_laneselect(nan_result_u32x4, normal_result_u32x4, infnan_mask_u32x4);

    // F32 zero/denorm → f16 zero
    v128_t f32_zero_mask_u32x4 = wasm_i32x4_eq(f32_exp_u32x4, wasm_i32x4_splat(0));
    normal_result_u32x4 = wasm_i32x4_relaxed_laneselect(sign_u32x4, normal_result_u32x4, f32_zero_mask_u32x4);

    // Pack 4x u32 → 4x u16
    v128_t packed_u16x8 = wasm_u16x8_narrow_i32x4(normal_result_u32x4, normal_result_u32x4);
    nk_b64_vec_t result_vec;
    result_vec.u64 = (nk_u64_t)wasm_i64x2_extract_lane(packed_u16x8, 0);
    return result_vec;
}
240
+
241
/** @brief Convert f32x4 → 4x e4m3 via bit manipulation with RNE (WASM).
 *  Overflow saturates to the max finite value 0x7E (exp=15, mant=6), since
 *  S.1111.111 (0x7F) is the e4m3 NaN encoding and there is no infinity.
 *  NOTE(review): f32 inf/NaN inputs also take the overflow path and saturate
 *  to the max finite value — confirm against nk_cast_serial's e4m3 semantics. */
NK_INTERNAL nk_b32_vec_t nk_f32x4_to_e4m3x4_v128relaxed_(nk_b128_vec_t hub_vec) {
    v128_t bits_u32x4 = hub_vec.v128;
    v128_t sign_u32x4 = wasm_u32x4_shr(bits_u32x4, 31);
    v128_t f32_exp_u32x4 = wasm_v128_and(wasm_u32x4_shr(bits_u32x4, 23), wasm_i32x4_splat(0xFF));

    // RNE mantissa rounding from 23 to 3 bits
    v128_t significand_u32x4 = wasm_v128_or(wasm_v128_and(bits_u32x4, wasm_i32x4_splat(0x007FFFFF)),
                                            wasm_i32x4_splat(0x00800000));
    v128_t lsb_u32x4 = wasm_v128_and(wasm_u32x4_shr(significand_u32x4, 20), wasm_i32x4_splat(1));
    v128_t rounded_u32x4 = wasm_i32x4_add(significand_u32x4, wasm_i32x4_add(wasm_i32x4_splat(0x0007FFFF), lsb_u32x4));
    v128_t carry_u32x4 = wasm_u32x4_shr(rounded_u32x4, 24);
    v128_t f32_mant_u32x4 = wasm_v128_and(wasm_u32x4_shr(rounded_u32x4, 20), wasm_i32x4_splat(0x07));
    v128_t carry_mask_u32x4 = wasm_i32x4_eq(carry_u32x4, wasm_i32x4_splat(1));
    f32_mant_u32x4 = wasm_v128_andnot(f32_mant_u32x4, carry_mask_u32x4);
    // Rebias: f32 bias 127 → e4m3 bias 7, i.e. subtract 120 (carry bumps the exponent).
    v128_t e4m3_exp_i32x4 = wasm_i32x4_sub(wasm_i32x4_add(f32_exp_u32x4, carry_u32x4), wasm_i32x4_splat(120));

    v128_t is_subnormal_u32x4 = wasm_i32x4_lt(e4m3_exp_i32x4, wasm_i32x4_splat(1));
    v128_t overflow_u32x4 = wasm_i32x4_gt(e4m3_exp_i32x4, wasm_i32x4_splat(15));

    // Normal path
    v128_t clamped_exp_i32x4 = wasm_i32x4_max(e4m3_exp_i32x4, wasm_i32x4_splat(1));
    clamped_exp_i32x4 = wasm_i32x4_min(clamped_exp_i32x4, wasm_i32x4_splat(15));
    // At exp=15 the mantissa is capped at 6: exp=15/mant=7 would be the NaN code.
    v128_t is_max_exp_u32x4 = wasm_i32x4_eq(clamped_exp_i32x4, wasm_i32x4_splat(15));
    v128_t max_mant_u32x4 = wasm_i32x4_relaxed_laneselect(wasm_i32x4_splat(6), wasm_i32x4_splat(7), is_max_exp_u32x4);
    v128_t normal_mant_u32x4 = wasm_i32x4_min(f32_mant_u32x4, max_mant_u32x4);
    normal_mant_u32x4 = wasm_i32x4_relaxed_laneselect(wasm_i32x4_splat(0x06), normal_mant_u32x4, overflow_u32x4);
    v128_t normal_u32x4 = wasm_v128_or(wasm_i32x4_shl(sign_u32x4, 7),
                                       wasm_v128_or(wasm_i32x4_shl(clamped_exp_i32x4, 3), normal_mant_u32x4));

    // Subnormal path: scale by 2^9 so the subnormal step (2^-9) lands on integers.
    v128_t abs_f32x4 = wasm_v128_and(hub_vec.v128, wasm_i32x4_splat(0x7FFFFFFF));
    v128_t scaled_f32x4 = wasm_f32x4_mul((v128_t)abs_f32x4, wasm_f32x4_splat(512.0f));
    v128_t sub_mant_i32x4 = wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_nearest(scaled_f32x4));
    // A rounded subnormal mantissa of 8 promotes to the first normal (exp=1, mant=0).
    v128_t promotes_u32x4 = wasm_i32x4_gt(sub_mant_i32x4, wasm_i32x4_splat(7));
    sub_mant_i32x4 = wasm_i32x4_min(sub_mant_i32x4, wasm_i32x4_splat(7));
    sub_mant_i32x4 = wasm_i32x4_max(sub_mant_i32x4, wasm_i32x4_splat(0));
    v128_t subnormal_u32x4 = wasm_v128_or(wasm_i32x4_shl(sign_u32x4, 7), sub_mant_i32x4);
    v128_t first_normal_u32x4 = wasm_v128_or(wasm_i32x4_shl(sign_u32x4, 7), wasm_i32x4_splat(0x08));
    subnormal_u32x4 = wasm_i32x4_relaxed_laneselect(first_normal_u32x4, subnormal_u32x4, promotes_u32x4);

    v128_t e4m3_u32x4 = wasm_i32x4_relaxed_laneselect(subnormal_u32x4, normal_u32x4, is_subnormal_u32x4);

    // Pack 4x u32 → 4x u8
    v128_t packed_u16 = wasm_u16x8_narrow_i32x4(e4m3_u32x4, e4m3_u32x4);
    v128_t packed_u8 = wasm_u8x16_narrow_i16x8(packed_u16, packed_u16);
    nk_b32_vec_t result_vec;
    result_vec.u32 = (nk_u32_t)wasm_i32x4_extract_lane(packed_u8, 0);
    return result_vec;
}
291
+
292
/** @brief Convert f32x4 → 4x e5m2 via bit manipulation with RNE (WASM).
 *  Overflow produces infinity (exp=31, mant=0), which e5m2 does encode.
 *  NOTE(review): f32 NaN inputs also take the overflow path and become
 *  infinity rather than an e5m2 NaN — confirm against nk_cast_serial. */
NK_INTERNAL nk_b32_vec_t nk_f32x4_to_e5m2x4_v128relaxed_(nk_b128_vec_t hub_vec) {
    v128_t bits_u32x4 = hub_vec.v128;
    v128_t sign_u32x4 = wasm_u32x4_shr(bits_u32x4, 31);
    v128_t f32_exp_u32x4 = wasm_v128_and(wasm_u32x4_shr(bits_u32x4, 23), wasm_i32x4_splat(0xFF));

    // RNE mantissa rounding from 23 to 2 bits
    v128_t significand_u32x4 = wasm_v128_or(wasm_v128_and(bits_u32x4, wasm_i32x4_splat(0x007FFFFF)),
                                            wasm_i32x4_splat(0x00800000));
    v128_t lsb_u32x4 = wasm_v128_and(wasm_u32x4_shr(significand_u32x4, 21), wasm_i32x4_splat(1));
    v128_t rounded_u32x4 = wasm_i32x4_add(significand_u32x4, wasm_i32x4_add(wasm_i32x4_splat(0x000FFFFF), lsb_u32x4));
    v128_t carry_u32x4 = wasm_u32x4_shr(rounded_u32x4, 24);
    v128_t f32_mant_u32x4 = wasm_v128_and(wasm_u32x4_shr(rounded_u32x4, 21), wasm_i32x4_splat(0x03));
    v128_t carry_mask_u32x4 = wasm_i32x4_eq(carry_u32x4, wasm_i32x4_splat(1));
    f32_mant_u32x4 = wasm_v128_andnot(f32_mant_u32x4, carry_mask_u32x4);
    // Rebias: f32 bias 127 → e5m2 bias 15, i.e. subtract 112 (carry bumps the exponent).
    v128_t e5m2_exp_i32x4 = wasm_i32x4_sub(wasm_i32x4_add(f32_exp_u32x4, carry_u32x4), wasm_i32x4_splat(112));

    v128_t is_subnormal_u32x4 = wasm_i32x4_lt(e5m2_exp_i32x4, wasm_i32x4_splat(1));
    v128_t overflow_u32x4 = wasm_i32x4_gt(e5m2_exp_i32x4, wasm_i32x4_splat(31));

    // Normal path: overflow → infinity (exp=31, mant=0)
    v128_t clamped_exp_i32x4 = wasm_i32x4_max(e5m2_exp_i32x4, wasm_i32x4_splat(1));
    clamped_exp_i32x4 = wasm_i32x4_min(clamped_exp_i32x4, wasm_i32x4_splat(31));
    v128_t normal_mant_u32x4 = wasm_i32x4_relaxed_laneselect(wasm_i32x4_splat(0), f32_mant_u32x4, overflow_u32x4);
    v128_t normal_u32x4 = wasm_v128_or(wasm_i32x4_shl(sign_u32x4, 7),
                                       wasm_v128_or(wasm_i32x4_shl(clamped_exp_i32x4, 2), normal_mant_u32x4));

    // Subnormal path: scale by 2^16 so the subnormal step (2^-16) lands on integers.
    v128_t abs_f32x4 = wasm_v128_and(hub_vec.v128, wasm_i32x4_splat(0x7FFFFFFF));
    v128_t scaled_f32x4 = wasm_f32x4_mul((v128_t)abs_f32x4, wasm_f32x4_splat(65536.0f));
    v128_t sub_mant_i32x4 = wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_nearest(scaled_f32x4));
    // A rounded subnormal mantissa of 4 promotes to the first normal (exp=1, mant=0).
    v128_t promotes_u32x4 = wasm_i32x4_gt(sub_mant_i32x4, wasm_i32x4_splat(3));
    sub_mant_i32x4 = wasm_i32x4_min(sub_mant_i32x4, wasm_i32x4_splat(3));
    sub_mant_i32x4 = wasm_i32x4_max(sub_mant_i32x4, wasm_i32x4_splat(0));
    v128_t subnormal_u32x4 = wasm_v128_or(wasm_i32x4_shl(sign_u32x4, 7), sub_mant_i32x4);
    v128_t first_normal_u32x4 = wasm_v128_or(wasm_i32x4_shl(sign_u32x4, 7), wasm_i32x4_splat(0x04));
    subnormal_u32x4 = wasm_i32x4_relaxed_laneselect(first_normal_u32x4, subnormal_u32x4, promotes_u32x4);

    v128_t e5m2_u32x4 = wasm_i32x4_relaxed_laneselect(subnormal_u32x4, normal_u32x4, is_subnormal_u32x4);

    // Pack 4x u32 → 4x u8
    v128_t packed_u16 = wasm_u16x8_narrow_i32x4(e5m2_u32x4, e5m2_u32x4);
    v128_t packed_u8 = wasm_u8x16_narrow_i16x8(packed_u16, packed_u16);
    nk_b32_vec_t result_vec;
    result_vec.u32 = (nk_u32_t)wasm_i32x4_extract_lane(packed_u8, 0);
    return result_vec;
}
338
+
339
/** @brief Convert f32x4 → 4x e2m3 (6-bit, 1+2+3, bias 1) via bit manipulation with RNE (WASM).
 *  Overflow saturates to the max finite value (exp=3, mant=7); e2m3 has no inf/NaN codes. */
NK_INTERNAL nk_b32_vec_t nk_f32x4_to_e2m3x4_v128relaxed_(nk_b128_vec_t hub_vec) {
    v128_t bits_u32x4 = hub_vec.v128;
    v128_t sign_u32x4 = wasm_u32x4_shr(bits_u32x4, 31);
    v128_t f32_exp_u32x4 = wasm_v128_and(wasm_u32x4_shr(bits_u32x4, 23), wasm_i32x4_splat(0xFF));

    // RNE mantissa rounding from 23 to 3 bits (same scheme as the e4m3 kernel).
    v128_t significand_u32x4 = wasm_v128_or(wasm_v128_and(bits_u32x4, wasm_i32x4_splat(0x007FFFFF)),
                                            wasm_i32x4_splat(0x00800000));
    v128_t lsb_u32x4 = wasm_v128_and(wasm_u32x4_shr(significand_u32x4, 20), wasm_i32x4_splat(1));
    v128_t rounded_u32x4 = wasm_i32x4_add(significand_u32x4, wasm_i32x4_add(wasm_i32x4_splat(0x0007FFFF), lsb_u32x4));
    v128_t carry_u32x4 = wasm_u32x4_shr(rounded_u32x4, 24);
    v128_t f32_mant_u32x4 = wasm_v128_and(wasm_u32x4_shr(rounded_u32x4, 20), wasm_i32x4_splat(0x07));
    v128_t carry_mask_u32x4 = wasm_i32x4_eq(carry_u32x4, wasm_i32x4_splat(1));
    f32_mant_u32x4 = wasm_v128_andnot(f32_mant_u32x4, carry_mask_u32x4);
    // Rebias: f32 bias 127 → e2m3 bias 1, i.e. subtract 126 (carry bumps the exponent).
    v128_t e2m3_exp_i32x4 = wasm_i32x4_sub(wasm_i32x4_add(f32_exp_u32x4, carry_u32x4), wasm_i32x4_splat(126));

    v128_t is_subnormal_u32x4 = wasm_i32x4_lt(e2m3_exp_i32x4, wasm_i32x4_splat(1));
    v128_t overflow_u32x4 = wasm_i32x4_gt(e2m3_exp_i32x4, wasm_i32x4_splat(3));

    // Normal path: sign lives in bit 5 of the 6-bit encoding.
    v128_t clamped_exp_i32x4 = wasm_i32x4_max(e2m3_exp_i32x4, wasm_i32x4_splat(1));
    clamped_exp_i32x4 = wasm_i32x4_min(clamped_exp_i32x4, wasm_i32x4_splat(3));
    v128_t normal_mant_u32x4 = wasm_i32x4_relaxed_laneselect(wasm_i32x4_splat(0x07), f32_mant_u32x4, overflow_u32x4);
    v128_t normal_u32x4 = wasm_v128_or(wasm_i32x4_shl(sign_u32x4, 5),
                                       wasm_v128_or(wasm_i32x4_shl(clamped_exp_i32x4, 3), normal_mant_u32x4));

    // Subnormal path: scale by 2^3 so the subnormal step (2^-3) lands on integers.
    v128_t abs_f32x4 = wasm_v128_and(hub_vec.v128, wasm_i32x4_splat(0x7FFFFFFF));
    v128_t scaled_f32x4 = wasm_f32x4_mul((v128_t)abs_f32x4, wasm_f32x4_splat(8.0f));
    v128_t sub_mant_i32x4 = wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_nearest(scaled_f32x4));
    // A rounded subnormal mantissa of 8 promotes to the first normal (exp=1, mant=0).
    v128_t promotes_u32x4 = wasm_i32x4_gt(sub_mant_i32x4, wasm_i32x4_splat(7));
    sub_mant_i32x4 = wasm_i32x4_min(sub_mant_i32x4, wasm_i32x4_splat(7));
    sub_mant_i32x4 = wasm_i32x4_max(sub_mant_i32x4, wasm_i32x4_splat(0));
    v128_t subnormal_u32x4 = wasm_v128_or(wasm_i32x4_shl(sign_u32x4, 5), sub_mant_i32x4);
    v128_t first_normal_u32x4 = wasm_v128_or(wasm_i32x4_shl(sign_u32x4, 5), wasm_i32x4_splat(0x08));
    subnormal_u32x4 = wasm_i32x4_relaxed_laneselect(first_normal_u32x4, subnormal_u32x4, promotes_u32x4);

    v128_t e2m3_u32x4 = wasm_i32x4_relaxed_laneselect(subnormal_u32x4, normal_u32x4, is_subnormal_u32x4);

    // Pack 4x u32 → 4x u8
    v128_t packed_u16 = wasm_u16x8_narrow_i32x4(e2m3_u32x4, e2m3_u32x4);
    v128_t packed_u8 = wasm_u8x16_narrow_i16x8(packed_u16, packed_u16);
    nk_b32_vec_t result_vec;
    result_vec.u32 = (nk_u32_t)wasm_i32x4_extract_lane(packed_u8, 0);
    return result_vec;
}
382
+
383
/** @brief Convert f32x4 → 4x e3m2 (6-bit, 1+3+2, bias 3) via bit manipulation with RNE (WASM).
 *  Overflow saturates to the max finite value (exp=7, mant=3); e3m2 has no inf/NaN codes. */
NK_INTERNAL nk_b32_vec_t nk_f32x4_to_e3m2x4_v128relaxed_(nk_b128_vec_t hub_vec) {
    v128_t bits_u32x4 = hub_vec.v128;
    v128_t sign_u32x4 = wasm_u32x4_shr(bits_u32x4, 31);
    v128_t f32_exp_u32x4 = wasm_v128_and(wasm_u32x4_shr(bits_u32x4, 23), wasm_i32x4_splat(0xFF));

    // RNE mantissa rounding from 23 to 2 bits (same scheme as the e5m2 kernel).
    v128_t significand_u32x4 = wasm_v128_or(wasm_v128_and(bits_u32x4, wasm_i32x4_splat(0x007FFFFF)),
                                            wasm_i32x4_splat(0x00800000));
    v128_t lsb_u32x4 = wasm_v128_and(wasm_u32x4_shr(significand_u32x4, 21), wasm_i32x4_splat(1));
    v128_t rounded_u32x4 = wasm_i32x4_add(significand_u32x4, wasm_i32x4_add(wasm_i32x4_splat(0x000FFFFF), lsb_u32x4));
    v128_t carry_u32x4 = wasm_u32x4_shr(rounded_u32x4, 24);
    v128_t f32_mant_u32x4 = wasm_v128_and(wasm_u32x4_shr(rounded_u32x4, 21), wasm_i32x4_splat(0x03));
    v128_t carry_mask_u32x4 = wasm_i32x4_eq(carry_u32x4, wasm_i32x4_splat(1));
    f32_mant_u32x4 = wasm_v128_andnot(f32_mant_u32x4, carry_mask_u32x4);
    // Rebias: f32 bias 127 → e3m2 bias 3, i.e. subtract 124 (carry bumps the exponent).
    v128_t e3m2_exp_i32x4 = wasm_i32x4_sub(wasm_i32x4_add(f32_exp_u32x4, carry_u32x4), wasm_i32x4_splat(124));

    v128_t is_subnormal_u32x4 = wasm_i32x4_lt(e3m2_exp_i32x4, wasm_i32x4_splat(1));
    v128_t overflow_u32x4 = wasm_i32x4_gt(e3m2_exp_i32x4, wasm_i32x4_splat(7));

    // Normal path: sign lives in bit 5 of the 6-bit encoding.
    v128_t clamped_exp_i32x4 = wasm_i32x4_max(e3m2_exp_i32x4, wasm_i32x4_splat(1));
    clamped_exp_i32x4 = wasm_i32x4_min(clamped_exp_i32x4, wasm_i32x4_splat(7));
    v128_t normal_mant_u32x4 = wasm_i32x4_relaxed_laneselect(wasm_i32x4_splat(0x03), f32_mant_u32x4, overflow_u32x4);
    v128_t normal_u32x4 = wasm_v128_or(wasm_i32x4_shl(sign_u32x4, 5),
                                       wasm_v128_or(wasm_i32x4_shl(clamped_exp_i32x4, 2), normal_mant_u32x4));

    // Subnormal path: scale by 2^4 so the subnormal step (2^-4) lands on integers.
    v128_t abs_f32x4 = wasm_v128_and(hub_vec.v128, wasm_i32x4_splat(0x7FFFFFFF));
    v128_t scaled_f32x4 = wasm_f32x4_mul((v128_t)abs_f32x4, wasm_f32x4_splat(16.0f));
    v128_t sub_mant_i32x4 = wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_nearest(scaled_f32x4));
    // A rounded subnormal mantissa of 4 promotes to the first normal (exp=1, mant=0).
    v128_t promotes_u32x4 = wasm_i32x4_gt(sub_mant_i32x4, wasm_i32x4_splat(3));
    sub_mant_i32x4 = wasm_i32x4_min(sub_mant_i32x4, wasm_i32x4_splat(3));
    sub_mant_i32x4 = wasm_i32x4_max(sub_mant_i32x4, wasm_i32x4_splat(0));
    v128_t subnormal_u32x4 = wasm_v128_or(wasm_i32x4_shl(sign_u32x4, 5), sub_mant_i32x4);
    v128_t first_normal_u32x4 = wasm_v128_or(wasm_i32x4_shl(sign_u32x4, 5), wasm_i32x4_splat(0x04));
    subnormal_u32x4 = wasm_i32x4_relaxed_laneselect(first_normal_u32x4, subnormal_u32x4, promotes_u32x4);

    v128_t e3m2_u32x4 = wasm_i32x4_relaxed_laneselect(subnormal_u32x4, normal_u32x4, is_subnormal_u32x4);

    // Pack 4x u32 → 4x u8
    v128_t packed_u16 = wasm_u16x8_narrow_i32x4(e3m2_u32x4, e3m2_u32x4);
    v128_t packed_u8 = wasm_u8x16_narrow_i16x8(packed_u16, packed_u16);
    nk_b32_vec_t result_vec;
    result_vec.u32 = (nk_u32_t)wasm_i32x4_extract_lane(packed_u8, 0);
    return result_vec;
}
426
+
427
+ /** @brief Convert f32x4 → 4x i8 with saturation (WASM). */
428
+ NK_INTERNAL nk_b32_vec_t nk_f32x4_to_i8x4_v128relaxed_(nk_b128_vec_t hub_vec) {
429
+ v128_t clamped_f32x4 = wasm_f32x4_min(wasm_f32x4_max(hub_vec.v128, wasm_f32x4_splat(-128.0f)),
430
+ wasm_f32x4_splat(127.0f));
431
+ v128_t result_i32x4 = wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_nearest(clamped_f32x4));
432
+ v128_t result_i16x8 = wasm_i16x8_narrow_i32x4(result_i32x4, result_i32x4);
433
+ v128_t result_i8x16 = wasm_i8x16_narrow_i16x8(result_i16x8, result_i16x8);
434
+ nk_b32_vec_t result_vec;
435
+ result_vec.u32 = (nk_u32_t)wasm_i32x4_extract_lane(result_i8x16, 0);
436
+ return result_vec;
437
+ }
438
+
439
+ /** @brief Convert f32x4 → 4x u8 with saturation (WASM). */
440
+ NK_INTERNAL nk_b32_vec_t nk_f32x4_to_u8x4_v128relaxed_(nk_b128_vec_t hub_vec) {
441
+ v128_t clamped_f32x4 = wasm_f32x4_min(wasm_f32x4_max(hub_vec.v128, wasm_f32x4_splat(0.0f)),
442
+ wasm_f32x4_splat(255.0f));
443
+ v128_t result_u32x4 = wasm_u32x4_trunc_sat_f32x4(wasm_f32x4_nearest(clamped_f32x4));
444
+ v128_t result_u16x8 = wasm_u16x8_narrow_i32x4(result_u32x4, result_u32x4);
445
+ v128_t result_u8x16 = wasm_u8x16_narrow_i16x8(result_u16x8, result_u16x8);
446
+ nk_b32_vec_t result_vec;
447
+ result_vec.u32 = (nk_u32_t)wasm_i32x4_extract_lane(result_u8x16, 0);
448
+ return result_vec;
449
+ }
450
+
451
/**
 * @brief Cast @p n elements from @p from_type to @p to_type using an f32x4 SIMD hub (WASM relaxed SIMD).
 *
 * Every supported conversion is routed through f32: 4 source elements are
 * upcast to an f32x4 register, then downcast to the destination type. Pairs
 * involving unsupported types, and the tail of fewer than 4 elements, are
 * delegated to nk_cast_serial.
 */
NK_PUBLIC void nk_cast_v128relaxed(void const *from, nk_dtype_t from_type, nk_size_t n, void *to, nk_dtype_t to_type) {
    // Same-type fast path: a raw byte copy, rounded up for sub-byte element widths.
    if (from_type == to_type) {
        nk_size_t size_bits = nk_dtype_bits(from_type);
        if (size_bits > 0) nk_copy_bytes_(to, from, nk_size_divide_round_up_(n * size_bits, 8));
        return;
    }

    // Validate supported types
    int from_ok = (from_type == nk_f32_k || from_type == nk_f16_k || from_type == nk_bf16_k || from_type == nk_e4m3_k ||
                   from_type == nk_e5m2_k || from_type == nk_e2m3_k || from_type == nk_e3m2_k || from_type == nk_i8_k ||
                   from_type == nk_u8_k);
    int to_ok = (to_type == nk_f32_k || to_type == nk_f16_k || to_type == nk_bf16_k || to_type == nk_e4m3_k ||
                 to_type == nk_e5m2_k || to_type == nk_e2m3_k || to_type == nk_e3m2_k || to_type == nk_i8_k ||
                 to_type == nk_u8_k);

    if (!from_ok || !to_ok) {
        nk_cast_serial(from, from_type, n, to, to_type);
        return;
    }

    // F32 hub: 4 elements per iteration
    nk_size_t batches = n / 4;
    nk_size_t tail = n % 4;
    // NOTE(review): the load/store dispatch below assumes steps of 16/8/4 bytes,
    // i.e. nk_dtype_bits yields 32/16/8 for the accepted types — in particular
    // that the 6-bit e2m3/e3m2 types report a byte-padded width of 8; confirm.
    nk_size_t from_step = 4 * nk_dtype_bits(from_type) / 8;
    nk_size_t to_step = 4 * nk_dtype_bits(to_type) / 8;
    nk_u8_t const *from_ptr = (nk_u8_t const *)from;
    nk_u8_t *to_ptr = (nk_u8_t *)to;

    for (nk_size_t idx = 0; idx < batches; ++idx, from_ptr += from_step, to_ptr += to_step) {
        nk_b128_vec_t hub_vec;

        // Upcast to f32x4 hub using size-appropriate loads
        if (from_step == 16) { hub_vec.v128 = wasm_v128_load(from_ptr); }
        else if (from_step == 8) {
            // 16-bit sources: load 4x u16 into the low half of a register.
            nk_b64_vec_t raw64_vec;
            raw64_vec.u64 = (nk_u64_t)wasm_i64x2_extract_lane(wasm_v128_load64_zero(from_ptr), 0);
            switch (from_type) {
            case nk_f16_k: hub_vec = nk_f16x4_to_f32x4_v128relaxed_(raw64_vec); break;
            case nk_bf16_k: hub_vec = nk_bf16x4_to_f32x4_v128relaxed_(raw64_vec); break;
            default: break;
            }
        }
        else if (from_step == 4) {
            // 8-bit sources: load 4x u8 into the low word of a register.
            nk_b32_vec_t raw32_vec;
            raw32_vec.u32 = (nk_u32_t)wasm_i32x4_extract_lane(wasm_v128_load32_zero(from_ptr), 0);
            switch (from_type) {
            case nk_e4m3_k: hub_vec = nk_e4m3x4_to_f32x4_v128relaxed_(raw32_vec); break;
            case nk_e5m2_k: hub_vec = nk_e5m2x4_to_f32x4_v128relaxed_(raw32_vec); break;
            case nk_e2m3_k: hub_vec = nk_e2m3x4_to_f32x4_v128relaxed_(raw32_vec); break;
            case nk_e3m2_k: hub_vec = nk_e3m2x4_to_f32x4_v128relaxed_(raw32_vec); break;
            case nk_i8_k: hub_vec = nk_i8x4_to_f32x4_v128relaxed_(raw32_vec); break;
            case nk_u8_k: hub_vec = nk_u8x4_to_f32x4_v128relaxed_(raw32_vec); break;
            default: break;
            }
        }
        else hub_vec.v128 = wasm_f32x4_splat(0); // defensive: unreachable for validated types

        // Downcast from f32x4 hub and store using half-register stores.
        // The narrow pointer stores rely on the target tolerating unaligned
        // access (WASM memory instructions permit any alignment).
        switch (to_type) {
        case nk_f32_k: wasm_v128_store(to_ptr, hub_vec.v128); break;
        case nk_f16_k: *(nk_u64_t *)to_ptr = nk_f32x4_to_f16x4_v128relaxed_(hub_vec).u64; break;
        case nk_bf16_k: *(nk_u64_t *)to_ptr = nk_f32x4_to_bf16x4_v128relaxed_(hub_vec).u64; break;
        case nk_e4m3_k: *(nk_u32_t *)to_ptr = nk_f32x4_to_e4m3x4_v128relaxed_(hub_vec).u32; break;
        case nk_e5m2_k: *(nk_u32_t *)to_ptr = nk_f32x4_to_e5m2x4_v128relaxed_(hub_vec).u32; break;
        case nk_e2m3_k: *(nk_u32_t *)to_ptr = nk_f32x4_to_e2m3x4_v128relaxed_(hub_vec).u32; break;
        case nk_e3m2_k: *(nk_u32_t *)to_ptr = nk_f32x4_to_e3m2x4_v128relaxed_(hub_vec).u32; break;
        case nk_i8_k: *(nk_u32_t *)to_ptr = nk_f32x4_to_i8x4_v128relaxed_(hub_vec).u32; break;
        case nk_u8_k: *(nk_u32_t *)to_ptr = nk_f32x4_to_u8x4_v128relaxed_(hub_vec).u32; break;
        default: break;
        }
    }

    // Handle tail elements with serial fallback
    if (tail) nk_cast_serial(from_ptr, from_type, tail, to_ptr, to_type);
}
170
527
 
171
528
  #if defined(__clang__)