numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315)
  1. package/README.md +197 -124
  2. package/binding.gyp +34 -484
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -13,14 +13,32 @@
13
13
  extern "C" {
14
14
  #endif
15
15
 
16
- #pragma region - Type Punned Loads and Stores
16
+ #pragma region Type Punned Loads and Stores
17
17
 
18
18
  /** @brief Type-agnostic 32-bit full load (scalar). */
19
19
  NK_INTERNAL void nk_load_b32_serial_(void const *src, nk_b32_vec_t *dst) { dst->u32 = *(nk_u32_t const *)src; }
20
20
 
21
+ /** @brief Type-agnostic 64-bit full load. */
22
+ NK_INTERNAL void nk_load_b64_serial_(void const *src, nk_b64_vec_t *dst) { dst->u64 = *(nk_u64_t const *)src; }
23
+
24
+ /** @brief Type-agnostic 128-bit full load. */
25
+ NK_INTERNAL void nk_load_b128_serial_(void const *src, nk_b128_vec_t *dst) {
26
+ nk_u64_t const *s = (nk_u64_t const *)src;
27
+ dst->u64s[0] = s[0], dst->u64s[1] = s[1];
28
+ }
29
+
30
+ /** @brief Type-agnostic 256-bit full load. */
31
+ NK_INTERNAL void nk_load_b256_serial_(void const *src, nk_b256_vec_t *dst) {
32
+ nk_u64_t const *s = (nk_u64_t const *)src;
33
+ dst->u64s[0] = s[0], dst->u64s[1] = s[1], dst->u64s[2] = s[2], dst->u64s[3] = s[3];
34
+ }
35
+
21
36
  /** @brief Type-agnostic 32-bit full store (scalar). */
22
37
  NK_INTERNAL void nk_store_b32_serial_(nk_b32_vec_t const *src, void *dst) { *(nk_u32_t *)dst = src->u32; }
23
38
 
39
+ /** @brief Type-agnostic 64-bit full store (scalar). */
40
+ NK_INTERNAL void nk_store_b64_serial_(nk_b64_vec_t const *src, void *dst) { *(nk_u64_t *)dst = src->u64; }
41
+
24
42
  /** @brief Type-agnostic 128-bit store (serial, word-by-word). */
25
43
  NK_INTERNAL void nk_store_b128_serial_(nk_b128_vec_t const *src, void *dst) {
26
44
  nk_u64_t *d = (nk_u64_t *)dst;
@@ -37,164 +55,681 @@ NK_INTERNAL void nk_store_b256_serial_(nk_b256_vec_t const *src, void *dst) {
37
55
  d[3] = src->u64s[3];
38
56
  }
39
57
 
40
- #pragma endregion - Type Punned Loads and Stores
41
-
42
- /**
43
- * @brief Expands an `f16` (IEEE-754 16-bit) to a `float`.
44
- *
45
- * Handles all IEEE-754 edge cases:
46
- *
47
- * Input F16 Hex F32 Hex Description
48
- * +0 0x0000 0x00000000 Positive zero
49
- * -0 0x8000 0x80000000 Negative zero
50
- * +inf 0x7C00 0x7F800000 Positive infinity
51
- * -inf 0xFC00 0xFF800000 Negative infinity
52
- * NaN 0x7E00 0x7FC00000 Quiet NaN (payload preserved)
53
- * Min normal 0x0400 0x38800000 2⁻¹⁴
54
- * Max normal 0x7BFF 0x477FE000 65504
55
- * Min denorm 0x0001 0x33800000 2⁻²⁴
56
- * Max denorm 0x03FF 0x387FC000 2⁻¹⁴ - 2⁻²⁴
57
- *
58
- * https://stackoverflow.com/a/60047308
59
- * https://gist.github.com/milhidaka/95863906fe828198f47991c813dbe233
60
- * https://github.com/OpenCyphal/libcanard/blob/636795f4bc395f56af8d2c61d3757b5e762bb9e5/canard.c#L811-L834
61
- */
62
- NK_PUBLIC void nk_f16_to_f32_serial(nk_f16_t const *src, nk_f32_t *dest) {
63
- #if NK_NATIVE_F16
64
- *dest = (nk_f32_t)(*src);
65
- #else
66
- unsigned short x;
67
- nk_copy_bytes_(&x, src, 2);
68
-
69
- unsigned int sign = (x >> 15) & 1;
70
- unsigned int exponent = (x >> 10) & 0x1F;
71
- unsigned int mantissa = x & 0x03FF;
72
-
73
- nk_fui32_t conv;
74
-
75
- if (exponent == 0) {
76
- if (mantissa == 0) {
77
- // Zero (preserve sign)
78
- conv.u = sign << 31;
79
- }
80
- else {
81
- // Denormal: value = mantissa × 2⁻²⁴
82
- // Use FPU normalization, then subtract 24 from exponent
83
- nk_fui32_t temp;
84
- temp.f = (float)mantissa;
85
- conv.u = (sign << 31) | (temp.u - 0x0C000000);
86
- }
87
- }
88
- else if (exponent == 31) {
89
- // Infinity (mantissa=0) or NaN (mantissa!=0)
90
- conv.u = (sign << 31) | 0x7F800000 | (mantissa << 13);
91
- }
92
- else {
93
- // Normal: rebias exponent (127-15=112), shift mantissa
94
- conv.u = (sign << 31) | ((exponent + 112) << 23) | (mantissa << 13);
58
+ /** @brief Type-agnostic partial load for 64-bit elements (4 elements max) into 256-bit vector. */
59
+ NK_INTERNAL void nk_partial_load_b64x4_serial_(void const *src, nk_b256_vec_t *dst, nk_size_t n) {
60
+ nk_u64_t const *s = (nk_u64_t const *)src;
61
+ dst->u64s[0] = 0, dst->u64s[1] = 0, dst->u64s[2] = 0, dst->u64s[3] = 0;
62
+ switch (n) {
63
+ default:
64
+ case 4: dst->u64s[3] = s[3]; // fallthrough
65
+ case 3: dst->u64s[2] = s[2]; // fallthrough
66
+ case 2: dst->u64s[1] = s[1]; // fallthrough
67
+ case 1: dst->u64s[0] = s[0]; // fallthrough
68
+ case 0: break;
95
69
  }
96
-
97
- *dest = conv.f;
98
- #endif
99
70
  }
100
71
 
101
- /**
102
- * @brief Compresses a `float` to an `f16` (IEEE-754 16-bit).
103
- *
104
- * Handles all IEEE-754 edge cases with round-to-nearest:
105
- *
106
- * Input F32 Hex F16 Hex Description
107
- * +0 0x00000000 0x0000 Positive zero
108
- * -0 0x80000000 0x8000 Negative zero
109
- * +inf 0x7F800000 0x7C00 Positive infinity
110
- * -inf 0xFF800000 0xFC00 Negative infinity
111
- * NaN 0x7FC00000 0x7E00 Quiet NaN (payload truncated)
112
- * 1.0 0x3F800000 0x3C00 Normal number
113
- * 65504 0x477FE000 0x7BFF Max f16 normal
114
- * 65520+ >0x477FE000 0x7C00 Overflow → infinity
115
- * 2⁻¹⁴ 0x38800000 0x0400 Min f16 normal
116
- * 2⁻²⁴ 0x33800000 0x0001 Min f16 denormal
117
- * <2⁻²⁵ <0x33000000 0x0000 Underflow → zero
118
- *
119
- * https://stackoverflow.com/a/60047308
120
- * https://gist.github.com/milhidaka/95863906fe828198f47991c813dbe233
121
- * https://github.com/OpenCyphal/libcanard/blob/636795f4bc395f56af8d2c61d3757b5e762bb9e5/canard.c#L811-L834
122
- */
123
- NK_PUBLIC void nk_f32_to_f16_serial(nk_f32_t const *src, nk_f16_t *dest) {
124
- #if NK_NATIVE_F16
125
- *dest = (nk_f16_t)(*src);
126
- #else
127
- nk_fui32_t conv;
128
- conv.f = *src;
129
-
130
- unsigned int sign = (conv.u >> 31) & 1;
131
- unsigned int exponent = (conv.u >> 23) & 0xFF;
132
- unsigned int mantissa = conv.u & 0x007FFFFF;
133
-
134
- unsigned short result;
135
-
136
- if (exponent == 0) {
137
- // Zero or f32 denormal → f16 zero
138
- result = (unsigned short)(sign << 15);
139
- }
140
- else if (exponent == 255) {
141
- // Infinity or NaN
142
- unsigned short payload = (unsigned short)(mantissa >> 13);
143
- if (mantissa != 0 && payload == 0) payload = 1; // Preserve NaN-ness
144
- result = (unsigned short)((sign << 15) | 0x7C00 | payload);
145
- }
146
- else if (exponent <= 102) {
147
- // Below or at f16 denormal threshold
148
- // exp=102 with mant=0 is exactly 2^-25 (tie point, rounds to 0 per round-to-even)
149
- // exp=102 with mant>0 is above tie point (rounds to smallest denormal 0x0001)
150
- if (exponent == 102 && mantissa > 0) result = (unsigned short)((sign << 15) | 0x0001);
151
- else result = (unsigned short)(sign << 15);
72
+ /** @brief Type-agnostic partial store for 64-bit elements (4 elements max) from 256-bit vector. */
73
+ NK_INTERNAL void nk_partial_store_b64x4_serial_(nk_b256_vec_t const *src, void *dst, nk_size_t n) {
74
+ nk_u64_t *d = (nk_u64_t *)dst;
75
+ switch (n) {
76
+ default:
77
+ case 4: d[3] = src->u64s[3]; // fallthrough
78
+ case 3: d[2] = src->u64s[2]; // fallthrough
79
+ case 2: d[1] = src->u64s[1]; // fallthrough
80
+ case 1: d[0] = src->u64s[0]; // fallthrough
81
+ case 0: break;
152
82
  }
153
- else if (exponent < 113) {
154
- // F16 denormal range (exp 103-112) with IEEE 754 round-to-nearest-even
155
- unsigned int shift = 113 - exponent;
156
- unsigned int shift_amount = shift + 13;
157
- unsigned long long full_mant = 0x00800000ULL | mantissa;
158
-
159
- // Extract result before rounding
160
- unsigned int mant = (unsigned int)(full_mant >> shift_amount);
161
-
162
- // IEEE 754 round-to-nearest-even: round up if round_bit is set AND
163
- // (sticky_bits are nonzero OR result is odd)
164
- unsigned int round_bit = (full_mant >> (shift_amount - 1)) & 1;
165
- unsigned long long sticky_bits = full_mant & ((1ULL << (shift_amount - 1)) - 1);
166
-
167
- if (round_bit && (sticky_bits || (mant & 1))) mant++;
83
+ }
168
84
 
169
- result = (unsigned short)((sign << 15) | mant);
85
+ NK_INTERNAL void nk_partial_load_b64x2_serial_(void const *src, nk_b128_vec_t *dst, nk_size_t n) {
86
+ dst->u64s[0] = 0, dst->u64s[1] = 0;
87
+ nk_u64_t const *s = (nk_u64_t const *)src;
88
+ switch (n) {
89
+ default:
90
+ case 2: dst->u64s[1] = s[1]; // fallthrough
91
+ case 1: dst->u64s[0] = s[0]; // fallthrough
92
+ case 0: break;
170
93
  }
171
- else if (exponent < 143) {
172
- // Normal f16 range with IEEE 754 round-to-nearest-even
173
- unsigned int f16_exp = exponent - 112;
174
- unsigned int f16_mant = mantissa >> 13;
94
+ }
175
95
 
176
- // IEEE 754 rounding: check round bit (bit 12) and sticky bits (bits 0-11)
177
- unsigned int round_bit = (mantissa >> 12) & 1;
178
- unsigned int sticky_bits = mantissa & 0xFFF;
96
+ /** @brief Type-agnostic partial store for 64-bit elements (2 elements max) from 128-bit vector. */
97
+ NK_INTERNAL void nk_partial_store_b64x2_serial_(nk_b128_vec_t const *src, void *dst, nk_size_t n) {
98
+ nk_u64_t *d = (nk_u64_t *)dst;
99
+ switch (n) {
100
+ default:
101
+ case 2: d[1] = src->u64s[1]; // fallthrough
102
+ case 1: d[0] = src->u64s[0]; // fallthrough
103
+ case 0: break;
104
+ }
105
+ }
179
106
 
180
- if (round_bit && (sticky_bits || (f16_mant & 1))) {
181
- f16_mant++;
182
- if (f16_mant > 0x3FF) f16_mant = 0, f16_exp++;
183
- }
107
+ /** @brief Type-agnostic partial load for 32-bit elements (8 elements max) into 256-bit vector. */
108
+ NK_INTERNAL void nk_partial_load_b32x8_serial_(void const *src, nk_b256_vec_t *dst, nk_size_t n) {
109
+ dst->u64s[0] = 0, dst->u64s[1] = 0, dst->u64s[2] = 0, dst->u64s[3] = 0;
110
+ nk_u32_t const *s = (nk_u32_t const *)src;
111
+ switch (n) {
112
+ default:
113
+ case 8: dst->u32s[7] = s[7]; // fallthrough
114
+ case 7: dst->u32s[6] = s[6]; // fallthrough
115
+ case 6: dst->u32s[5] = s[5]; // fallthrough
116
+ case 5: dst->u32s[4] = s[4]; // fallthrough
117
+ case 4: dst->u32s[3] = s[3]; // fallthrough
118
+ case 3: dst->u32s[2] = s[2]; // fallthrough
119
+ case 2: dst->u32s[1] = s[1]; // fallthrough
120
+ case 1: dst->u32s[0] = s[0]; // fallthrough
121
+ case 0: break;
122
+ }
123
+ }
184
124
 
185
- if (f16_exp > 30) result = (unsigned short)((sign << 15) | 0x7C00);
186
- else result = (unsigned short)((sign << 15) | (f16_exp << 10) | f16_mant);
125
+ /** @brief Type-agnostic partial store for 32-bit elements (8 elements max) from 256-bit vector. */
126
+ NK_INTERNAL void nk_partial_store_b32x8_serial_(nk_b256_vec_t const *src, void *dst, nk_size_t n) {
127
+ nk_u32_t *d = (nk_u32_t *)dst;
128
+ switch (n) {
129
+ default:
130
+ case 8: d[7] = src->u32s[7]; // fallthrough
131
+ case 7: d[6] = src->u32s[6]; // fallthrough
132
+ case 6: d[5] = src->u32s[5]; // fallthrough
133
+ case 5: d[4] = src->u32s[4]; // fallthrough
134
+ case 4: d[3] = src->u32s[3]; // fallthrough
135
+ case 3: d[2] = src->u32s[2]; // fallthrough
136
+ case 2: d[1] = src->u32s[1]; // fallthrough
137
+ case 1: d[0] = src->u32s[0]; // fallthrough
138
+ case 0: break;
187
139
  }
188
- else {
189
- // Overflow → infinity
190
- result = (unsigned short)((sign << 15) | 0x7C00);
140
+ }
141
+
142
+ /** @brief Type-agnostic partial load for 32-bit elements (4 elements max) into 128-bit vector. */
143
+ NK_INTERNAL void nk_partial_load_b32x4_serial_(void const *src, nk_b128_vec_t *dst, nk_size_t n) {
144
+ dst->u64s[0] = 0, dst->u64s[1] = 0;
145
+ nk_u32_t const *s = (nk_u32_t const *)src;
146
+ switch (n) {
147
+ default:
148
+ case 4: dst->u32s[3] = s[3]; // fallthrough
149
+ case 3: dst->u32s[2] = s[2]; // fallthrough
150
+ case 2: dst->u32s[1] = s[1]; // fallthrough
151
+ case 1: dst->u32s[0] = s[0]; // fallthrough
152
+ case 0: break;
191
153
  }
154
+ }
192
155
 
193
- nk_copy_bytes_(dest, &result, 2);
194
- #endif
156
+ /** @brief Type-agnostic partial store for 32-bit elements (4 elements max) from 128-bit vector. */
157
+ NK_INTERNAL void nk_partial_store_b32x4_serial_(nk_b128_vec_t const *src, void *dst, nk_size_t n) {
158
+ nk_u32_t *d = (nk_u32_t *)dst;
159
+ switch (n) {
160
+ default:
161
+ case 4: d[3] = src->u32s[3]; // fallthrough
162
+ case 3: d[2] = src->u32s[2]; // fallthrough
163
+ case 2: d[1] = src->u32s[1]; // fallthrough
164
+ case 1: d[0] = src->u32s[0]; // fallthrough
165
+ case 0: break;
166
+ }
195
167
  }
196
168
 
197
- /**
169
+ /** @brief Type-agnostic partial load for 32-bit elements (2 elements max) into 64-bit vector. */
170
+ NK_INTERNAL void nk_partial_load_b32x2_serial_(void const *src, nk_b64_vec_t *dst, nk_size_t n) {
171
+ dst->u64 = 0;
172
+ nk_u32_t const *s = (nk_u32_t const *)src;
173
+ switch (n) {
174
+ default:
175
+ case 2: dst->u32s[1] = s[1]; // fallthrough
176
+ case 1: dst->u32s[0] = s[0]; // fallthrough
177
+ case 0: break;
178
+ }
179
+ }
180
+
181
+ /** @brief Type-agnostic partial load for 16-bit elements (8 elements max) into 128-bit vector. */
182
+ NK_INTERNAL void nk_partial_load_b16x8_serial_(void const *src, nk_b128_vec_t *dst, nk_size_t n) {
183
+ dst->u64s[0] = 0, dst->u64s[1] = 0;
184
+ nk_u16_t const *s = (nk_u16_t const *)src;
185
+ switch (n) {
186
+ default:
187
+ case 8: dst->u16s[7] = s[7]; // fallthrough
188
+ case 7: dst->u16s[6] = s[6]; // fallthrough
189
+ case 6: dst->u16s[5] = s[5]; // fallthrough
190
+ case 5: dst->u16s[4] = s[4]; // fallthrough
191
+ case 4: dst->u16s[3] = s[3]; // fallthrough
192
+ case 3: dst->u16s[2] = s[2]; // fallthrough
193
+ case 2: dst->u16s[1] = s[1]; // fallthrough
194
+ case 1: dst->u16s[0] = s[0]; // fallthrough
195
+ case 0: break;
196
+ }
197
+ }
198
+
199
+ /** @brief Type-agnostic partial store for 16-bit elements (8 elements max) from 128-bit vector. */
200
+ NK_INTERNAL void nk_partial_store_b16x8_serial_(nk_b128_vec_t const *src, void *dst, nk_size_t n) {
201
+ nk_u16_t *d = (nk_u16_t *)dst;
202
+ switch (n) {
203
+ default:
204
+ case 8: d[7] = src->u16s[7]; // fallthrough
205
+ case 7: d[6] = src->u16s[6]; // fallthrough
206
+ case 6: d[5] = src->u16s[5]; // fallthrough
207
+ case 5: d[4] = src->u16s[4]; // fallthrough
208
+ case 4: d[3] = src->u16s[3]; // fallthrough
209
+ case 3: d[2] = src->u16s[2]; // fallthrough
210
+ case 2: d[1] = src->u16s[1]; // fallthrough
211
+ case 1: d[0] = src->u16s[0]; // fallthrough
212
+ case 0: break;
213
+ }
214
+ }
215
+
216
+ /** @brief Type-agnostic partial load for 16-bit elements (16 elements max) into 256-bit vector. */
217
+ NK_INTERNAL void nk_partial_load_b16x16_serial_(void const *src, nk_b256_vec_t *dst, nk_size_t n) {
218
+ dst->u64s[0] = 0, dst->u64s[1] = 0, dst->u64s[2] = 0, dst->u64s[3] = 0;
219
+ nk_u16_t const *s = (nk_u16_t const *)src;
220
+ switch (n) {
221
+ default:
222
+ case 16: dst->u16s[15] = s[15]; // fallthrough
223
+ case 15: dst->u16s[14] = s[14]; // fallthrough
224
+ case 14: dst->u16s[13] = s[13]; // fallthrough
225
+ case 13: dst->u16s[12] = s[12]; // fallthrough
226
+ case 12: dst->u16s[11] = s[11]; // fallthrough
227
+ case 11: dst->u16s[10] = s[10]; // fallthrough
228
+ case 10: dst->u16s[9] = s[9]; // fallthrough
229
+ case 9: dst->u16s[8] = s[8]; // fallthrough
230
+ case 8: dst->u16s[7] = s[7]; // fallthrough
231
+ case 7: dst->u16s[6] = s[6]; // fallthrough
232
+ case 6: dst->u16s[5] = s[5]; // fallthrough
233
+ case 5: dst->u16s[4] = s[4]; // fallthrough
234
+ case 4: dst->u16s[3] = s[3]; // fallthrough
235
+ case 3: dst->u16s[2] = s[2]; // fallthrough
236
+ case 2: dst->u16s[1] = s[1]; // fallthrough
237
+ case 1: dst->u16s[0] = s[0]; // fallthrough
238
+ case 0: break;
239
+ }
240
+ }
241
+
242
+ /** @brief Type-agnostic partial store for 16-bit elements (16 elements max) from 256-bit vector. */
243
+ NK_INTERNAL void nk_partial_store_b16x16_serial_(nk_b256_vec_t const *src, void *dst, nk_size_t n) {
244
+ nk_u16_t *d = (nk_u16_t *)dst;
245
+ switch (n) {
246
+ default:
247
+ case 16: d[15] = src->u16s[15]; // fallthrough
248
+ case 15: d[14] = src->u16s[14]; // fallthrough
249
+ case 14: d[13] = src->u16s[13]; // fallthrough
250
+ case 13: d[12] = src->u16s[12]; // fallthrough
251
+ case 12: d[11] = src->u16s[11]; // fallthrough
252
+ case 11: d[10] = src->u16s[10]; // fallthrough
253
+ case 10: d[9] = src->u16s[9]; // fallthrough
254
+ case 9: d[8] = src->u16s[8]; // fallthrough
255
+ case 8: d[7] = src->u16s[7]; // fallthrough
256
+ case 7: d[6] = src->u16s[6]; // fallthrough
257
+ case 6: d[5] = src->u16s[5]; // fallthrough
258
+ case 5: d[4] = src->u16s[4]; // fallthrough
259
+ case 4: d[3] = src->u16s[3]; // fallthrough
260
+ case 3: d[2] = src->u16s[2]; // fallthrough
261
+ case 2: d[1] = src->u16s[1]; // fallthrough
262
+ case 1: d[0] = src->u16s[0]; // fallthrough
263
+ case 0: break;
264
+ }
265
+ }
266
+
267
+ /** @brief Type-agnostic partial load for 16-bit elements (4 elements max) into 64-bit vector. */
268
+ NK_INTERNAL void nk_partial_load_b16x4_serial_(void const *src, nk_b64_vec_t *dst, nk_size_t n) {
269
+ dst->u64 = 0;
270
+ nk_u16_t const *s = (nk_u16_t const *)src;
271
+ switch (n) {
272
+ default:
273
+ case 4: dst->u16s[3] = s[3]; // fallthrough
274
+ case 3: dst->u16s[2] = s[2]; // fallthrough
275
+ case 2: dst->u16s[1] = s[1]; // fallthrough
276
+ case 1: dst->u16s[0] = s[0]; // fallthrough
277
+ case 0: break;
278
+ }
279
+ }
280
+
281
+ /** @brief Type-agnostic partial store for 16-bit elements (4 elements max) from 64-bit vector. */
282
+ NK_INTERNAL void nk_partial_store_b16x4_serial_(void *dst, nk_b64_vec_t const *src, nk_size_t n) {
283
+ nk_u16_t *d = (nk_u16_t *)dst;
284
+ switch (n) {
285
+ default:
286
+ case 4: d[3] = src->u16s[3]; // fallthrough
287
+ case 3: d[2] = src->u16s[2]; // fallthrough
288
+ case 2: d[1] = src->u16s[1]; // fallthrough
289
+ case 1: d[0] = src->u16s[0]; // fallthrough
290
+ case 0: break;
291
+ }
292
+ }
293
+
294
+ /** @brief Type-agnostic partial load for 8-bit elements (8 elements max) into 64-bit vector. */
295
+ NK_INTERNAL void nk_partial_load_b8x8_serial_(void const *src, nk_b64_vec_t *dst, nk_size_t n) {
296
+ dst->u64 = 0;
297
+ nk_u8_t const *s = (nk_u8_t const *)src;
298
+ switch (n) {
299
+ default:
300
+ case 8: dst->u8s[7] = s[7]; // fallthrough
301
+ case 7: dst->u8s[6] = s[6]; // fallthrough
302
+ case 6: dst->u8s[5] = s[5]; // fallthrough
303
+ case 5: dst->u8s[4] = s[4]; // fallthrough
304
+ case 4: dst->u8s[3] = s[3]; // fallthrough
305
+ case 3: dst->u8s[2] = s[2]; // fallthrough
306
+ case 2: dst->u8s[1] = s[1]; // fallthrough
307
+ case 1: dst->u8s[0] = s[0]; // fallthrough
308
+ case 0: break;
309
+ }
310
+ }
311
+
312
+ /** @brief Type-agnostic partial store for 8-bit elements (8 elements max) from 64-bit vector. */
313
+ NK_INTERNAL void nk_partial_store_b8x8_serial_(nk_b64_vec_t const *src, void *dst, nk_size_t n) {
314
+ nk_u8_t *d = (nk_u8_t *)dst;
315
+ switch (n) {
316
+ default:
317
+ case 8: d[7] = src->u8s[7]; // fallthrough
318
+ case 7: d[6] = src->u8s[6]; // fallthrough
319
+ case 6: d[5] = src->u8s[5]; // fallthrough
320
+ case 5: d[4] = src->u8s[4]; // fallthrough
321
+ case 4: d[3] = src->u8s[3]; // fallthrough
322
+ case 3: d[2] = src->u8s[2]; // fallthrough
323
+ case 2: d[1] = src->u8s[1]; // fallthrough
324
+ case 1: d[0] = src->u8s[0]; // fallthrough
325
+ case 0: break;
326
+ }
327
+ }
328
+
329
+ /** @brief Type-agnostic partial store for 8-bit elements (16 elements max) from 128-bit vector. */
330
+ NK_INTERNAL void nk_partial_store_b8x16_serial_(nk_b128_vec_t const *src, void *dst, nk_size_t n) {
331
+ nk_u8_t *d = (nk_u8_t *)dst;
332
+ switch (n) {
333
+ default:
334
+ case 16: d[15] = src->u8s[15]; // fallthrough
335
+ case 15: d[14] = src->u8s[14]; // fallthrough
336
+ case 14: d[13] = src->u8s[13]; // fallthrough
337
+ case 13: d[12] = src->u8s[12]; // fallthrough
338
+ case 12: d[11] = src->u8s[11]; // fallthrough
339
+ case 11: d[10] = src->u8s[10]; // fallthrough
340
+ case 10: d[9] = src->u8s[9]; // fallthrough
341
+ case 9: d[8] = src->u8s[8]; // fallthrough
342
+ case 8: d[7] = src->u8s[7]; // fallthrough
343
+ case 7: d[6] = src->u8s[6]; // fallthrough
344
+ case 6: d[5] = src->u8s[5]; // fallthrough
345
+ case 5: d[4] = src->u8s[4]; // fallthrough
346
+ case 4: d[3] = src->u8s[3]; // fallthrough
347
+ case 3: d[2] = src->u8s[2]; // fallthrough
348
+ case 2: d[1] = src->u8s[1]; // fallthrough
349
+ case 1: d[0] = src->u8s[0]; // fallthrough
350
+ case 0: break;
351
+ }
352
+ }
353
+
354
+ /** @brief Type-agnostic partial store for 8-bit elements (32 elements max) from 256-bit vector. */
355
+ NK_INTERNAL void nk_partial_store_b8x32_serial_(nk_b256_vec_t const *src, void *dst, nk_size_t n) {
356
+ nk_u8_t *d = (nk_u8_t *)dst;
357
+ switch (n) {
358
+ default:
359
+ case 32: d[31] = src->u8s[31]; // fallthrough
360
+ case 31: d[30] = src->u8s[30]; // fallthrough
361
+ case 30: d[29] = src->u8s[29]; // fallthrough
362
+ case 29: d[28] = src->u8s[28]; // fallthrough
363
+ case 28: d[27] = src->u8s[27]; // fallthrough
364
+ case 27: d[26] = src->u8s[26]; // fallthrough
365
+ case 26: d[25] = src->u8s[25]; // fallthrough
366
+ case 25: d[24] = src->u8s[24]; // fallthrough
367
+ case 24: d[23] = src->u8s[23]; // fallthrough
368
+ case 23: d[22] = src->u8s[22]; // fallthrough
369
+ case 22: d[21] = src->u8s[21]; // fallthrough
370
+ case 21: d[20] = src->u8s[20]; // fallthrough
371
+ case 20: d[19] = src->u8s[19]; // fallthrough
372
+ case 19: d[18] = src->u8s[18]; // fallthrough
373
+ case 18: d[17] = src->u8s[17]; // fallthrough
374
+ case 17: d[16] = src->u8s[16]; // fallthrough
375
+ case 16: d[15] = src->u8s[15]; // fallthrough
376
+ case 15: d[14] = src->u8s[14]; // fallthrough
377
+ case 14: d[13] = src->u8s[13]; // fallthrough
378
+ case 13: d[12] = src->u8s[12]; // fallthrough
379
+ case 12: d[11] = src->u8s[11]; // fallthrough
380
+ case 11: d[10] = src->u8s[10]; // fallthrough
381
+ case 10: d[9] = src->u8s[9]; // fallthrough
382
+ case 9: d[8] = src->u8s[8]; // fallthrough
383
+ case 8: d[7] = src->u8s[7]; // fallthrough
384
+ case 7: d[6] = src->u8s[6]; // fallthrough
385
+ case 6: d[5] = src->u8s[5]; // fallthrough
386
+ case 5: d[4] = src->u8s[4]; // fallthrough
387
+ case 4: d[3] = src->u8s[3]; // fallthrough
388
+ case 3: d[2] = src->u8s[2]; // fallthrough
389
+ case 2: d[1] = src->u8s[1]; // fallthrough
390
+ case 1: d[0] = src->u8s[0]; // fallthrough
391
+ case 0: break;
392
+ }
393
+ }
394
+
395
+ /** @brief Type-agnostic partial load for 8-bit elements (16 elements max) into 128-bit vector. */
396
+ NK_INTERNAL void nk_partial_load_b8x16_serial_(void const *src, nk_b128_vec_t *dst, nk_size_t n) {
397
+ dst->u64s[0] = 0, dst->u64s[1] = 0;
398
+ nk_u8_t const *s = (nk_u8_t const *)src;
399
+ switch (n) {
400
+ default:
401
+ case 16: dst->u8s[15] = s[15]; // fallthrough
402
+ case 15: dst->u8s[14] = s[14]; // fallthrough
403
+ case 14: dst->u8s[13] = s[13]; // fallthrough
404
+ case 13: dst->u8s[12] = s[12]; // fallthrough
405
+ case 12: dst->u8s[11] = s[11]; // fallthrough
406
+ case 11: dst->u8s[10] = s[10]; // fallthrough
407
+ case 10: dst->u8s[9] = s[9]; // fallthrough
408
+ case 9: dst->u8s[8] = s[8]; // fallthrough
409
+ case 8: dst->u8s[7] = s[7]; // fallthrough
410
+ case 7: dst->u8s[6] = s[6]; // fallthrough
411
+ case 6: dst->u8s[5] = s[5]; // fallthrough
412
+ case 5: dst->u8s[4] = s[4]; // fallthrough
413
+ case 4: dst->u8s[3] = s[3]; // fallthrough
414
+ case 3: dst->u8s[2] = s[2]; // fallthrough
415
+ case 2: dst->u8s[1] = s[1]; // fallthrough
416
+ case 1: dst->u8s[0] = s[0]; // fallthrough
417
+ case 0: break;
418
+ }
419
+ }
420
+
421
+ /** @brief Type-agnostic partial load for 8-bit elements (4 elements max) into 32-bit vector. */
422
+ NK_INTERNAL nk_b32_vec_t nk_partial_load_b8x4_serial_(void const *src, nk_size_t n) {
423
+ nk_b32_vec_t dst = {0};
424
+ nk_u8_t const *s = (nk_u8_t const *)src;
425
+ switch (n) {
426
+ default:
427
+ case 4: dst.u8s[3] = s[3]; // fallthrough
428
+ case 3: dst.u8s[2] = s[2]; // fallthrough
429
+ case 2: dst.u8s[1] = s[1]; // fallthrough
430
+ case 1: dst.u8s[0] = s[0]; // fallthrough
431
+ case 0: break;
432
+ }
433
+ return dst;
434
+ }
435
+
436
+ /** @brief Partial store for 8-bit elements (up to 4) from nk_b32_vec_t. */
437
+ NK_INTERNAL void nk_partial_store_b8x4_serial_(nk_b32_vec_t const *src, void *dst, nk_size_t n) {
438
+ nk_u8_t *d = (nk_u8_t *)dst;
439
+ switch (n) {
440
+ default:
441
+ case 4: d[3] = src->u8s[3]; // fallthrough
442
+ case 3: d[2] = src->u8s[2]; // fallthrough
443
+ case 2: d[1] = src->u8s[1]; // fallthrough
444
+ case 1: d[0] = src->u8s[0]; // fallthrough
445
+ case 0: break;
446
+ }
447
+ }
448
+
449
+ /** @brief Partial load for 8-bit elements (32 max) into 256-bit vector (zeros in remaining slots). */
450
+ NK_INTERNAL void nk_partial_load_b8x32_serial_(void const *src, nk_b256_vec_t *dst, nk_size_t n) {
451
+ dst->u64s[0] = 0, dst->u64s[1] = 0, dst->u64s[2] = 0, dst->u64s[3] = 0;
452
+ nk_u8_t const *s = (nk_u8_t const *)src;
453
+ switch (n) {
454
+ default:
455
+ case 32: dst->u8s[31] = s[31]; // fallthrough
456
+ case 31: dst->u8s[30] = s[30]; // fallthrough
457
+ case 30: dst->u8s[29] = s[29]; // fallthrough
458
+ case 29: dst->u8s[28] = s[28]; // fallthrough
459
+ case 28: dst->u8s[27] = s[27]; // fallthrough
460
+ case 27: dst->u8s[26] = s[26]; // fallthrough
461
+ case 26: dst->u8s[25] = s[25]; // fallthrough
462
+ case 25: dst->u8s[24] = s[24]; // fallthrough
463
+ case 24: dst->u8s[23] = s[23]; // fallthrough
464
+ case 23: dst->u8s[22] = s[22]; // fallthrough
465
+ case 22: dst->u8s[21] = s[21]; // fallthrough
466
+ case 21: dst->u8s[20] = s[20]; // fallthrough
467
+ case 20: dst->u8s[19] = s[19]; // fallthrough
468
+ case 19: dst->u8s[18] = s[18]; // fallthrough
469
+ case 18: dst->u8s[17] = s[17]; // fallthrough
470
+ case 17: dst->u8s[16] = s[16]; // fallthrough
471
+ case 16: dst->u8s[15] = s[15]; // fallthrough
472
+ case 15: dst->u8s[14] = s[14]; // fallthrough
473
+ case 14: dst->u8s[13] = s[13]; // fallthrough
474
+ case 13: dst->u8s[12] = s[12]; // fallthrough
475
+ case 12: dst->u8s[11] = s[11]; // fallthrough
476
+ case 11: dst->u8s[10] = s[10]; // fallthrough
477
+ case 10: dst->u8s[9] = s[9]; // fallthrough
478
+ case 9: dst->u8s[8] = s[8]; // fallthrough
479
+ case 8: dst->u8s[7] = s[7]; // fallthrough
480
+ case 7: dst->u8s[6] = s[6]; // fallthrough
481
+ case 6: dst->u8s[5] = s[5]; // fallthrough
482
+ case 5: dst->u8s[4] = s[4]; // fallthrough
483
+ case 4: dst->u8s[3] = s[3]; // fallthrough
484
+ case 3: dst->u8s[2] = s[2]; // fallthrough
485
+ case 2: dst->u8s[1] = s[1]; // fallthrough
486
+ case 1: dst->u8s[0] = s[0]; // fallthrough
487
+ case 0: break;
488
+ }
489
+ }
490
+
491
+ /** @brief Partial load for 4-bit nibbles (64 max = 32 bytes) into 256-bit vector (zeros in remaining slots). */
492
+ NK_INTERNAL void nk_partial_load_b4x64_serial_(void const *src, nk_b256_vec_t *dst, nk_size_t n) {
493
+ dst->u64s[0] = 0, dst->u64s[1] = 0, dst->u64s[2] = 0, dst->u64s[3] = 0;
494
+ nk_u8_t const *s = (nk_u8_t const *)src;
495
+ nk_size_t n_bytes = nk_size_divide_round_up_(n, 2);
496
+ for (nk_size_t i = 0; i < n_bytes && i < 32; i++) dst->u8s[i] = s[i];
497
+ }
498
+
499
+ /** @brief Partial load for 4-bit nibbles (32 max = 16 bytes) into 128-bit vector (zeros in remaining slots). */
500
+ NK_INTERNAL void nk_partial_load_b4x32_serial_(void const *src, nk_b128_vec_t *dst, nk_size_t n) {
501
+ dst->u64s[0] = 0, dst->u64s[1] = 0;
502
+ nk_u8_t const *s = (nk_u8_t const *)src;
503
+ nk_size_t n_bytes = nk_size_divide_round_up_(n, 2);
504
+ for (nk_size_t i = 0; i < n_bytes && i < 16; i++) dst->u8s[i] = s[i];
505
+ }
506
+
507
+ /** @brief Partial load for 1-bit elements (128 max = 16 bytes) into 128-bit vector (zeros in remaining slots). */
508
+ NK_INTERNAL void nk_partial_load_b1x128_serial_(void const *src, nk_b128_vec_t *dst, nk_size_t n_bits) {
509
+ dst->u64s[0] = 0, dst->u64s[1] = 0;
510
+ nk_u8_t const *s = (nk_u8_t const *)src;
511
+ nk_size_t n_bytes = nk_size_divide_round_up_(n_bits, 8);
512
+ for (nk_size_t i = 0; i < n_bytes && i < 16; i++) dst->u8s[i] = s[i];
513
+ }
514
+
515
+ /** @brief Partial load for binary (u1) data into 256-bit vector, converting n_bits → n_bytes. */
516
+ NK_INTERNAL void nk_partial_load_b1x256_serial_(void const *src, nk_b256_vec_t *dst, nk_size_t n_bits) {
517
+ nk_size_t n_bytes = nk_size_divide_round_up_(n_bits, 8);
518
+ nk_partial_load_b8x32_serial_(src, dst, n_bytes);
519
+ }
520
+
521
+ /** @brief Partial load for 4-bit nibbles (16 max = 8 bytes) into 64-bit vector (zeros in remaining slots). */
522
+ NK_INTERNAL void nk_partial_load_b4x16_serial_(void const *src, nk_b64_vec_t *dst, nk_size_t n) {
523
+ dst->u64 = 0;
524
+ nk_u8_t const *s = (nk_u8_t const *)src;
525
+ nk_size_t n_bytes = nk_size_divide_round_up_(n, 2);
526
+ for (nk_size_t i = 0; i < n_bytes && i < 8; i++) ((nk_u8_t *)&dst->u64)[i] = s[i];
527
+ }
528
+
529
+ /** @brief Strided partial load for 32-bit elements (4 max) into 128-bit vector. */
530
+ NK_INTERNAL void nk_strided_load_b32x4_serial_(void const *src, nk_size_t stride_elements, nk_b128_vec_t *dst,
531
+ nk_size_t n) {
532
+ dst->u64s[0] = 0, dst->u64s[1] = 0;
533
+ nk_u32_t const *s = (nk_u32_t const *)src;
534
+ for (nk_size_t i = 0; i < n && i < 4; ++i) dst->u32s[i] = s[i * stride_elements];
535
+ }
536
+
537
+ /** @brief Strided partial load for 16-bit elements (8 max) into 128-bit vector. */
538
+ NK_INTERNAL void nk_strided_load_b16x8_serial_(void const *src, nk_size_t stride_elements, nk_b128_vec_t *dst,
539
+ nk_size_t n) {
540
+ dst->u64s[0] = 0, dst->u64s[1] = 0;
541
+ nk_u16_t const *s = (nk_u16_t const *)src;
542
+ for (nk_size_t i = 0; i < n && i < 8; ++i) dst->u16s[i] = s[i * stride_elements];
543
+ }
544
+
545
+ /** @brief Strided partial load for 8-bit elements (16 max) into 128-bit vector. */
546
+ NK_INTERNAL void nk_strided_load_b8x16_serial_(void const *src, nk_size_t stride_elements, nk_b128_vec_t *dst,
547
+ nk_size_t n) {
548
+ dst->u64s[0] = 0, dst->u64s[1] = 0;
549
+ nk_u8_t const *s = (nk_u8_t const *)src;
550
+ for (nk_size_t i = 0; i < n && i < 16; ++i) dst->u8s[i] = s[i * stride_elements];
551
+ }
552
+
553
+ #pragma endregion Type Punned Loads and Stores
554
+
555
+ /**
556
+ * @brief Expands an `f16` (IEEE-754 16-bit) to a `float`.
557
+ *
558
+ * Handles all IEEE-754 edge cases:
559
+ *
560
+ * Input F16 Hex F32 Hex Description
561
+ * +0 0x0000 0x00000000 Positive zero
562
+ * -0 0x8000 0x80000000 Negative zero
563
+ * +inf 0x7C00 0x7F800000 Positive infinity
564
+ * -inf 0xFC00 0xFF800000 Negative infinity
565
+ * NaN 0x7E00 0x7FC00000 Quiet NaN (payload preserved)
566
+ * Min normal 0x0400 0x38800000 2⁻¹⁴
567
+ * Max normal 0x7BFF 0x477FE000 65504
568
+ * Min denorm 0x0001 0x33800000 2⁻²⁴
569
+ * Max denorm 0x03FF 0x387FC000 2⁻¹⁴ - 2⁻²⁴
570
+ *
571
+ * https://stackoverflow.com/a/60047308
572
+ * https://gist.github.com/milhidaka/95863906fe828198f47991c813dbe233
573
+ * https://github.com/OpenCyphal/libcanard/blob/636795f4bc395f56af8d2c61d3757b5e762bb9e5/canard.c#L811-L834
574
+ */
575
+ NK_PUBLIC void nk_f16_to_f32_serial(nk_f16_t const *src, nk_f32_t *dest) {
576
+ #if NK_NATIVE_F16
577
+ *dest = (nk_f32_t)(*src);
578
+ #else
579
+ unsigned short x;
580
+ nk_copy_bytes_(&x, src, 2);
581
+
582
+ unsigned int sign = (x >> 15) & 1;
583
+ unsigned int exponent = (x >> 10) & 0x1F;
584
+ unsigned int mantissa = x & 0x03FF;
585
+
586
+ nk_fui32_t conv;
587
+
588
+ if (exponent == 0) {
589
+ if (mantissa == 0) {
590
+ // Zero (preserve sign)
591
+ conv.u = sign << 31;
592
+ }
593
+ else {
594
+ // Denormal: value = mantissa × 2⁻²⁴
595
+ // Use FPU normalization, then subtract 24 from exponent
596
+ nk_fui32_t temp;
597
+ temp.f = (float)mantissa;
598
+ conv.u = (sign << 31) | (temp.u - 0x0C000000);
599
+ }
600
+ }
601
+ else if (exponent == 31) {
602
+ // Infinity (mantissa=0) or NaN (mantissa!=0)
603
+ conv.u = (sign << 31) | 0x7F800000 | (mantissa << 13);
604
+ }
605
+ else {
606
+ // Normal: rebias exponent (127-15=112), shift mantissa
607
+ conv.u = (sign << 31) | ((exponent + 112) << 23) | (mantissa << 13);
608
+ }
609
+
610
+ *dest = conv.f;
611
+ #endif
612
+ }
613
+
614
+ /** @brief Load 4 × f16 from memory and upcast them to 4 × f32. */
615
+ NK_INTERNAL void nk_load_f16x4_to_f32x4_serial_(void const *src, nk_b128_vec_t *dst) {
616
+ nk_f16_t const *scalars = (nk_f16_t const *)src;
617
+ nk_f16_to_f32_serial(scalars + 0, dst->f32s + 0);
618
+ nk_f16_to_f32_serial(scalars + 1, dst->f32s + 1);
619
+ nk_f16_to_f32_serial(scalars + 2, dst->f32s + 2);
620
+ nk_f16_to_f32_serial(scalars + 3, dst->f32s + 3);
621
+ }
622
+
623
+ /** @brief Partial load for up to 4 × f16 with upcast to 4 × f32. */
624
+ NK_INTERNAL void nk_partial_load_f16x4_to_f32x4_serial_(nk_f16_t const *src, nk_b128_vec_t *dst, nk_size_t n) {
625
+ dst->u64s[0] = 0, dst->u64s[1] = 0;
626
+ switch (n) {
627
+ default:
628
+ case 4: nk_f16_to_f32_serial(src + 3, dst->f32s + 3); // fallthrough
629
+ case 3: nk_f16_to_f32_serial(src + 2, dst->f32s + 2); // fallthrough
630
+ case 2: nk_f16_to_f32_serial(src + 1, dst->f32s + 1); // fallthrough
631
+ case 1: nk_f16_to_f32_serial(src + 0, dst->f32s + 0); // fallthrough
632
+ case 0: break;
633
+ }
634
+ }
635
+
636
/**
 * @brief Compresses a `float` to an `f16` (IEEE-754 16-bit).
 *
 * Handles all IEEE-754 edge cases with round-to-nearest:
 *
 *    Input        F32 Hex        F16 Hex   Description
 *    +0           0x00000000     0x0000    Positive zero
 *    -0           0x80000000     0x8000    Negative zero
 *    +inf         0x7F800000     0x7C00    Positive infinity
 *    -inf         0xFF800000     0xFC00    Negative infinity
 *    NaN          0x7FC00000     0x7E00    Quiet NaN (payload truncated)
 *    1.0          0x3F800000     0x3C00    Normal number
 *    65504        0x477FE000     0x7BFF    Max f16 normal
 *    65520+       >0x477FE000    0x7C00    Overflow → infinity
 *    2⁻¹⁴         0x38800000     0x0400    Min f16 normal
 *    2⁻²⁴         0x33800000     0x0001    Min f16 denormal
 *    <2⁻²⁵        <0x33000000    0x0000    Underflow → zero
 *
 *    https://stackoverflow.com/a/60047308
 *    https://gist.github.com/milhidaka/95863906fe828198f47991c813dbe233
 *    https://github.com/OpenCyphal/libcanard/blob/636795f4bc395f56af8d2c61d3757b5e762bb9e5/canard.c#L811-L834
 */
NK_PUBLIC void nk_f32_to_f16_serial(nk_f32_t const *src, nk_f16_t *dest) {
#if NK_NATIVE_F16
    *dest = (nk_f16_t)(*src);
#else
    // Reinterpret the float's bits through a union to avoid strict-aliasing UB.
    nk_fui32_t conv;
    conv.f = *src;

    unsigned int sign = (conv.u >> 31) & 1;
    unsigned int exponent = (conv.u >> 23) & 0xFF;
    unsigned int mantissa = conv.u & 0x007FFFFF;

    unsigned short result;

    if (exponent == 0) {
        // Zero or f32 denormal → f16 zero
        // (f32 denormals are < 2⁻¹²⁶, far below the smallest f16 denormal 2⁻²⁴).
        result = (unsigned short)(sign << 15);
    }
    else if (exponent == 255) {
        // Infinity or NaN
        unsigned short payload = (unsigned short)(mantissa >> 13);
        if (mantissa != 0 && payload == 0) payload = 1; // Preserve NaN-ness
        result = (unsigned short)((sign << 15) | 0x7C00 | payload);
    }
    else if (exponent <= 102) {
        // Below or at f16 denormal threshold
        // exp=102 with mant=0 is exactly 2^-25 (tie point, rounds to 0 per round-to-even)
        // exp=102 with mant>0 is above tie point (rounds to smallest denormal 0x0001)
        if (exponent == 102 && mantissa > 0) result = (unsigned short)((sign << 15) | 0x0001);
        else result = (unsigned short)(sign << 15);
    }
    else if (exponent < 113) {
        // F16 denormal range (exp 103-112) with IEEE 754 round-to-nearest-even
        unsigned int shift = 113 - exponent;
        unsigned int shift_amount = shift + 13; // 13 extra bits dropped going f32→f16 mantissa
        // Prepend the implicit leading 1 of the (normal) f32 mantissa.
        unsigned long long full_mant = 0x00800000ULL | mantissa;

        // Extract result before rounding
        unsigned int mant = (unsigned int)(full_mant >> shift_amount);

        // IEEE 754 round-to-nearest-even: round up if round_bit is set AND
        // (sticky_bits are nonzero OR result is odd)
        unsigned int round_bit = (full_mant >> (shift_amount - 1)) & 1;
        unsigned long long sticky_bits = full_mant & ((1ULL << (shift_amount - 1)) - 1);

        if (round_bit && (sticky_bits || (mant & 1))) mant++;

        result = (unsigned short)((sign << 15) | mant);
    }
    else if (exponent < 143) {
        // Normal f16 range with IEEE 754 round-to-nearest-even
        unsigned int f16_exp = exponent - 112; // rebias: 127 - 15 = 112
        unsigned int f16_mant = mantissa >> 13;

        // IEEE 754 rounding: check round bit (bit 12) and sticky bits (bits 0-11)
        unsigned int round_bit = (mantissa >> 12) & 1;
        unsigned int sticky_bits = mantissa & 0xFFF;

        if (round_bit && (sticky_bits || (f16_mant & 1))) {
            f16_mant++;
            // Mantissa carry-out bumps the exponent (e.g. 0x3FF + 1 → next binade).
            if (f16_mant > 0x3FF) f16_mant = 0, f16_exp++;
        }

        // Rounding may push exp 30 → 31, which overflows to infinity.
        if (f16_exp > 30) result = (unsigned short)((sign << 15) | 0x7C00);
        else result = (unsigned short)((sign << 15) | (f16_exp << 10) | f16_mant);
    }
    else {
        // Overflow → infinity
        result = (unsigned short)((sign << 15) | 0x7C00);
    }

    // Write the packed bits out byte-wise to sidestep alignment/aliasing concerns.
    nk_copy_bytes_(dest, &result, 2);
#endif
}
731
+
732
+ /**
198
733
  * @brief For compilers that don't natively support the `__bf16` type,
199
734
  * upcasts contents into a more conventional `float`.
200
735
  *
@@ -309,8 +844,8 @@ NK_PUBLIC void nk_e4m3_to_f32_serial(nk_e4m3_t const *src, nk_f32_t *dest) {
309
844
  * NaN 0x7FC00000 0x7F Quiet NaN
310
845
  * 1.0 0x3F800000 0x38 Normal (exp=7, mant=0)
311
846
  * 448+ >0x43E00000 0x7E Overflow → max
312
- * 2⁻⁶ 0x3E800000 0x08 Min normal
313
- * <2⁻¹² × ⁵ <0x39800000 0x00 Underflow → zero (RNE boundary)
847
+ * 2⁻⁶ 0x3E800000 0x08 Min normal
848
+ * 2⁻¹⁰ ≤0x3A800000 0x00 Underflow → zero (RNE boundary)
314
849
  *
315
850
  * References:
316
851
  * https://arxiv.org/pdf/2209.05433 (NVIDIA/Intel/Arm FP8 paper)
@@ -552,8 +1087,8 @@ NK_PUBLIC void nk_e5m2_to_f32_serial(nk_e5m2_t const *src, nk_f32_t *dest) {
552
1087
  * NaN 0x7FC00000 0x7D Quiet NaN
553
1088
  * 1.0 0x3F800000 0x3C Normal (exp=15, mant=0)
554
1089
  * 57344+ >0x47600000 0x7C Overflow → infinity
555
- * 2⁻¹⁴ 0x38800000 0x04 Min normal
556
- * <2⁻¹⁷ × ⁵ <0x36800000 0x00 Underflow → zero (RNE boundary)
1090
+ * 2⁻¹⁴ 0x38800000 0x04 Min normal
1091
+ * 2⁻¹⁷ ≤0x37000000 0x00 Underflow → zero (RNE boundary)
557
1092
  *
558
1093
  * References:
559
1094
  * https://arxiv.org/pdf/2209.05433 (NVIDIA/Intel/Arm FP8 paper)
@@ -1050,565 +1585,156 @@ NK_INTERNAL nk_u64_t nk_rint_even_f64_to_u64_serial_(nk_f64_t x) {
1050
1585
  }
1051
1586
 
1052
1587
  NK_INTERNAL void nk_f32_to_i8_serial(nk_f32_t const *x, nk_i8_t *y) {
1053
- if (*x != *x) *y = 0; // For IEEE floating-point, NaN is the one value that is not equal to itself
1054
- else *y = (nk_i8_t)nk_rint_even_f64_to_i64_serial_(*x > 127.0f ? 127.0 : (*x < -128.0f ? -128.0 : (nk_f64_t)*x));
1055
- }
1056
-
1057
- NK_INTERNAL void nk_f32_to_u8_serial(nk_f32_t const *x, nk_u8_t *y) {
1058
- if (*x != *x) *y = 0; // For IEEE floating-point, NaN is the one value that is not equal to itself
1059
- else *y = (nk_u8_t)nk_rint_even_f64_to_u64_serial_(*x > 255.0f ? 255.0 : (*x < 0 ? 0.0 : (nk_f64_t)*x));
1060
- }
1061
-
1062
- NK_INTERNAL void nk_f32_to_i16_serial(nk_f32_t const *x, nk_i16_t *y) {
1063
- if (*x != *x) *y = 0; // For IEEE floating-point, NaN is the one value that is not equal to itself
1064
- else
1065
- *y = (nk_i16_t)nk_rint_even_f64_to_i64_serial_(*x > 32767.0f ? 32767.0
1066
- : (*x < -32768.0f ? -32768.0 : (nk_f64_t)*x));
1067
- }
1068
-
1069
- NK_INTERNAL void nk_f32_to_u16_serial(nk_f32_t const *x, nk_u16_t *y) {
1070
- if (*x != *x) *y = 0; // For IEEE floating-point, NaN is the one value that is not equal to itself
1071
- else *y = (nk_u16_t)nk_rint_even_f64_to_u64_serial_(*x > 65535.0f ? 65535.0 : (*x < 0 ? 0.0 : (nk_f64_t)*x));
1072
- }
1073
-
1074
- NK_INTERNAL void nk_f64_to_i8_serial(nk_f64_t const *x, nk_i8_t *y) {
1075
- if (*x != *x) *y = 0; // For IEEE floating-point, NaN is the one value that is not equal to itself
1076
- else *y = (nk_i8_t)nk_rint_even_f64_to_i64_serial_(*x > 127.0 ? 127.0 : (*x < -128.0 ? -128.0 : *x));
1077
- }
1078
-
1079
- NK_INTERNAL void nk_f64_to_u8_serial(nk_f64_t const *x, nk_u8_t *y) {
1080
- if (*x != *x) *y = 0; // For IEEE floating-point, NaN is the one value that is not equal to itself
1081
- else *y = (nk_u8_t)nk_rint_even_f64_to_u64_serial_(*x > 255.0 ? 255.0 : (*x < 0 ? 0.0 : *x));
1082
- }
1083
-
1084
- NK_INTERNAL void nk_f64_to_i16_serial(nk_f64_t const *x, nk_i16_t *y) {
1085
- if (*x != *x) *y = 0; // For IEEE floating-point, NaN is the one value that is not equal to itself
1086
- else *y = (nk_i16_t)nk_rint_even_f64_to_i64_serial_(*x > 32767.0 ? 32767.0 : (*x < -32768.0 ? -32768.0 : *x));
1087
- }
1088
-
1089
- NK_INTERNAL void nk_f64_to_u16_serial(nk_f64_t const *x, nk_u16_t *y) {
1090
- if (*x != *x) *y = 0; // For IEEE floating-point, NaN is the one value that is not equal to itself
1091
- else *y = (nk_u16_t)nk_rint_even_f64_to_u64_serial_(*x > 65535.0 ? 65535.0 : (*x < 0 ? 0.0 : *x));
1092
- }
1093
-
1094
- NK_INTERNAL void nk_f64_to_i32_serial(nk_f64_t const *x, nk_i32_t *y) {
1095
- if (*x != *x) *y = 0; // For IEEE floating-point, NaN is the one value that is not equal to itself
1096
- else
1097
- *y = (nk_i32_t)nk_rint_even_f64_to_i64_serial_(*x > 2147483647.0 ? 2147483647.0
1098
- : (*x < -2147483648.0 ? -2147483648.0 : *x));
1099
- }
1100
-
1101
- NK_INTERNAL void nk_f64_to_u32_serial(nk_f64_t const *x, nk_u32_t *y) {
1102
- if (*x != *x) *y = 0; // For IEEE floating-point, NaN is the one value that is not equal to itself
1103
- else *y = (nk_u32_t)nk_rint_even_f64_to_u64_serial_(*x > 4294967295.0 ? 4294967295.0 : (*x < 0 ? 0.0 : *x));
1104
- }
1105
-
1106
- NK_INTERNAL void nk_f64_to_i64_serial(nk_f64_t const *x, nk_i64_t *y) {
1107
- if (*x != *x) *y = 0; // For IEEE floating-point, NaN is the one value that is not equal to itself
1108
- else
1109
- *y = nk_rint_even_f64_to_i64_serial_(*x > 9223372036854775807.0
1110
- ? 9223372036854775807.0
1111
- : (*x < -9223372036854775808.0 ? -9223372036854775808.0 : *x));
1112
- }
1113
-
1114
- NK_INTERNAL void nk_f64_to_u64_serial(nk_f64_t const *x, nk_u64_t *y) {
1115
- if (*x != *x) *y = 0; // For IEEE floating-point, NaN is the one value that is not equal to itself
1116
- else
1117
- *y = nk_rint_even_f64_to_u64_serial_(*x > 18446744073709551615.0 ? 18446744073709551615.0
1118
- : (*x < 0 ? 0.0 : *x));
1119
- }
1120
-
1121
- NK_INTERNAL void nk_i64_to_i8_serial(nk_i64_t const *x, nk_i8_t *y) {
1122
- *y = (nk_i8_t)(*x > 127ll ? 127ll : (*x < -128ll ? -128ll : *x));
1123
- }
1124
-
1125
- NK_INTERNAL void nk_i64_to_u8_serial(nk_i64_t const *x, nk_u8_t *y) {
1126
- *y = (nk_u8_t)(*x > 255ll ? 255ll : (*x < 0ll ? 0ll : *x));
1127
- }
1128
-
1129
- NK_INTERNAL void nk_i64_to_i16_serial(nk_i64_t const *x, nk_i16_t *y) {
1130
- *y = (nk_i16_t)(*x > 32767ll ? 32767ll : (*x < -32768ll ? -32768ll : *x));
1131
- }
1132
-
1133
- NK_INTERNAL void nk_i64_to_u16_serial(nk_i64_t const *x, nk_u16_t *y) {
1134
- *y = (nk_u16_t)(*x > 65535ll ? 65535ll : (*x < 0ll ? 0ll : *x));
1135
- }
1136
-
1137
- NK_INTERNAL void nk_i64_to_i32_serial(nk_i64_t const *x, nk_i32_t *y) {
1138
- *y = (nk_i32_t)(*x > 2147483647ll ? 2147483647ll : (*x < -2147483648ll ? -2147483648ll : *x));
1139
- }
1140
-
1141
- NK_INTERNAL void nk_i64_to_u32_serial(nk_i64_t const *x, nk_u32_t *y) {
1142
- *y = (nk_u32_t)(*x > 4294967295ll ? 4294967295ll : (*x < 0ll ? 0ll : *x));
1143
- }
1144
-
1145
- NK_INTERNAL void nk_u64_to_i8_serial(nk_u64_t const *x, nk_i8_t *y) { *y = (nk_i8_t)(*x > 127ull ? 127ull : *x); }
1146
- NK_INTERNAL void nk_u64_to_u8_serial(nk_u64_t const *x, nk_u8_t *y) { *y = (nk_u8_t)(*x > 255ull ? 255ull : *x); }
1147
- NK_INTERNAL void nk_u64_to_i16_serial(nk_u64_t const *x, nk_i16_t *y) {
1148
- *y = (nk_i16_t)(*x > 32767ull ? 32767ull : *x);
1149
- }
1150
- NK_INTERNAL void nk_u64_to_u16_serial(nk_u64_t const *x, nk_u16_t *y) {
1151
- *y = (nk_u16_t)(*x > 65535ull ? 65535ull : *x);
1152
- }
1153
-
1154
- NK_INTERNAL void nk_u64_to_i32_serial(nk_u64_t const *x, nk_i32_t *y) {
1155
- *y = (nk_i32_t)(*x > 2147483647ull ? 2147483647ull : *x);
1156
- }
1157
-
1158
- NK_INTERNAL void nk_u64_to_u32_serial(nk_u64_t const *x, nk_u32_t *y) {
1159
- *y = (nk_u32_t)(*x > 4294967295ull ? 4294967295ull : *x);
1160
- }
1161
-
1162
- NK_PUBLIC void nk_f16_to_f64_(nk_f16_t const *src, nk_f64_t *dest) {
1163
- nk_f32_t f32;
1164
- nk_f16_to_f32_serial(src, &f32);
1165
- *dest = f32;
1166
- }
1167
- NK_PUBLIC void nk_bf16_to_f64_(nk_bf16_t const *src, nk_f64_t *dest) {
1168
- nk_f32_t f32;
1169
- nk_bf16_to_f32_serial(src, &f32);
1170
- *dest = f32;
1171
- }
1172
-
1173
- NK_INTERNAL void nk_u64_to_i64_serial(nk_u64_t const *x, nk_i64_t *y) {
1174
- *y = (nk_i64_t)(*x >= 9223372036854775807ull ? 9223372036854775807ll : *x);
1175
- }
1176
-
1177
- NK_INTERNAL void nk_i8_to_u64_serial(nk_i8_t const *x, nk_u64_t *y) { *y = (nk_u64_t)(*x < 0 ? 0 : *x); }
1178
- NK_INTERNAL void nk_i16_to_u64_serial(nk_i16_t const *x, nk_u64_t *y) { *y = (nk_u64_t)(*x < 0 ? 0 : *x); }
1179
- NK_INTERNAL void nk_i32_to_u64_serial(nk_i32_t const *x, nk_u64_t *y) { *y = (nk_u64_t)(*x < 0 ? 0 : *x); }
1180
- NK_INTERNAL void nk_i64_to_u64_serial(nk_i64_t const *x, nk_u64_t *y) { *y = (nk_u64_t)(*x < 0 ? 0 : *x); }
1181
-
1182
- NK_INTERNAL void nk_i64_to_f16_serial(nk_i64_t const *x, nk_f16_t *y) {
1183
- nk_f32_t f32 = (nk_f32_t)*x;
1184
- nk_f32_to_f16_serial(&f32, y);
1185
- }
1186
- NK_INTERNAL void nk_i64_to_bf16_serial(nk_i64_t const *x, nk_bf16_t *y) {
1187
- nk_f32_t f32 = (nk_f32_t)*x;
1188
- nk_f32_to_bf16_serial(&f32, y);
1189
- }
1190
- NK_INTERNAL void nk_u64_to_f16_serial(nk_u64_t const *x, nk_f16_t *y) {
1191
- nk_f32_t f32 = (nk_f32_t)*x;
1192
- nk_f32_to_f16_serial(&f32, y);
1193
- }
1194
- NK_INTERNAL void nk_u64_to_bf16_serial(nk_u64_t const *x, nk_bf16_t *y) {
1195
- nk_f32_t f32 = (nk_f32_t)*x;
1196
- nk_f32_to_bf16_serial(&f32, y);
1197
- }
1198
-
1199
- #pragma region - Type Punned Loads and Stores
1200
-
1201
- /** @brief Type-agnostic 256-bit full load. */
1202
- NK_INTERNAL void nk_load_b256_serial_(void const *src, nk_b256_vec_t *dst) {
1203
- nk_u64_t const *s = (nk_u64_t const *)src;
1204
- dst->u64s[0] = s[0], dst->u64s[1] = s[1], dst->u64s[2] = s[2], dst->u64s[3] = s[3];
1205
- }
1206
-
1207
- /** @brief Type-agnostic 128-bit full load. */
1208
- NK_INTERNAL void nk_load_b128_serial_(void const *src, nk_b128_vec_t *dst) {
1209
- nk_u64_t const *s = (nk_u64_t const *)src;
1210
- dst->u64s[0] = s[0], dst->u64s[1] = s[1];
1211
- }
1212
-
1213
- /** @brief Type-agnostic 64-bit full load. */
1214
- NK_INTERNAL void nk_load_b64_serial_(void const *src, nk_b64_vec_t *dst) { dst->u64 = *(nk_u64_t const *)src; }
1215
-
1216
- /** @brief Type-agnostic partial load for 32-bit elements (8 elements max) into 256-bit vector. */
1217
- NK_INTERNAL void nk_partial_load_b32x8_serial_(void const *src, nk_b256_vec_t *dst, nk_size_t n) {
1218
- dst->u64s[0] = 0, dst->u64s[1] = 0, dst->u64s[2] = 0, dst->u64s[3] = 0;
1219
- nk_u32_t const *s = (nk_u32_t const *)src;
1220
- switch (n) {
1221
- default:
1222
- case 8: dst->u32s[7] = s[7]; // fallthrough
1223
- case 7: dst->u32s[6] = s[6]; // fallthrough
1224
- case 6: dst->u32s[5] = s[5]; // fallthrough
1225
- case 5: dst->u32s[4] = s[4]; // fallthrough
1226
- case 4: dst->u32s[3] = s[3]; // fallthrough
1227
- case 3: dst->u32s[2] = s[2]; // fallthrough
1228
- case 2: dst->u32s[1] = s[1]; // fallthrough
1229
- case 1: dst->u32s[0] = s[0]; // fallthrough
1230
- case 0: break;
1231
- }
1232
- }
1233
-
1234
- /** @brief Type-agnostic partial load for 32-bit elements (4 elements max) into 128-bit vector. */
1235
- NK_INTERNAL void nk_partial_load_b32x4_serial_(void const *src, nk_b128_vec_t *dst, nk_size_t n) {
1236
- dst->u64s[0] = 0, dst->u64s[1] = 0;
1237
- nk_u32_t const *s = (nk_u32_t const *)src;
1238
- switch (n) {
1239
- default:
1240
- case 4: dst->u32s[3] = s[3]; // fallthrough
1241
- case 3: dst->u32s[2] = s[2]; // fallthrough
1242
- case 2: dst->u32s[1] = s[1]; // fallthrough
1243
- case 1: dst->u32s[0] = s[0]; // fallthrough
1244
- case 0: break;
1245
- }
1246
- }
1247
-
1248
- /** @brief Type-agnostic partial load for 8-bit elements (8 elements max) into 64-bit vector. */
1249
- NK_INTERNAL void nk_partial_load_b8x8_serial_(void const *src, nk_b64_vec_t *dst, nk_size_t n) {
1250
- dst->u64 = 0;
1251
- nk_u8_t const *s = (nk_u8_t const *)src;
1252
- switch (n) {
1253
- default:
1254
- case 8: dst->u8s[7] = s[7]; // fallthrough
1255
- case 7: dst->u8s[6] = s[6]; // fallthrough
1256
- case 6: dst->u8s[5] = s[5]; // fallthrough
1257
- case 5: dst->u8s[4] = s[4]; // fallthrough
1258
- case 4: dst->u8s[3] = s[3]; // fallthrough
1259
- case 3: dst->u8s[2] = s[2]; // fallthrough
1260
- case 2: dst->u8s[1] = s[1]; // fallthrough
1261
- case 1: dst->u8s[0] = s[0]; // fallthrough
1262
- case 0: break;
1263
- }
1264
- }
1265
-
1266
- /** @brief Type-agnostic partial load for 8-bit elements (4 elements max) into 32-bit vector. */
1267
- NK_INTERNAL nk_b32_vec_t nk_partial_load_b8x4_serial_(void const *src, nk_size_t n) {
1268
- nk_b32_vec_t dst = {0};
1269
- nk_u8_t const *s = (nk_u8_t const *)src;
1270
- switch (n) {
1271
- default:
1272
- case 4: dst.u8s[3] = s[3]; // fallthrough
1273
- case 3: dst.u8s[2] = s[2]; // fallthrough
1274
- case 2: dst.u8s[1] = s[1]; // fallthrough
1275
- case 1: dst.u8s[0] = s[0]; // fallthrough
1276
- case 0: break;
1277
- }
1278
- return dst;
1279
- }
1280
-
1281
- /** @brief Partial store for 8-bit elements (up to 4) from nk_b32_vec_t. */
1282
- NK_INTERNAL void nk_partial_store_b8x4_serial_(nk_b32_vec_t const *src, void *dst, nk_size_t n) {
1283
- nk_u8_t *d = (nk_u8_t *)dst;
1284
- switch (n) {
1285
- default:
1286
- case 4: d[3] = src->u8s[3]; // fallthrough
1287
- case 3: d[2] = src->u8s[2]; // fallthrough
1288
- case 2: d[1] = src->u8s[1]; // fallthrough
1289
- case 1: d[0] = src->u8s[0]; // fallthrough
1290
- case 0: break;
1291
- }
1588
+ if (*x != *x) *y = 0; // For IEEE floating-point, NaN is the one value that is not equal to itself
1589
+ else *y = (nk_i8_t)nk_rint_even_f64_to_i64_serial_(*x > 127.0f ? 127.0 : (*x < -128.0f ? -128.0 : (nk_f64_t)*x));
1292
1590
  }
1293
1591
 
1294
- /** @brief Type-agnostic partial load for 16-bit elements (8 elements max) into 128-bit vector. */
1295
- NK_INTERNAL void nk_partial_load_b16x8_serial_(void const *src, nk_b128_vec_t *dst, nk_size_t n) {
1296
- dst->u64s[0] = 0, dst->u64s[1] = 0;
1297
- nk_u16_t const *s = (nk_u16_t const *)src;
1298
- switch (n) {
1299
- default:
1300
- case 8: dst->u16s[7] = s[7]; // fallthrough
1301
- case 7: dst->u16s[6] = s[6]; // fallthrough
1302
- case 6: dst->u16s[5] = s[5]; // fallthrough
1303
- case 5: dst->u16s[4] = s[4]; // fallthrough
1304
- case 4: dst->u16s[3] = s[3]; // fallthrough
1305
- case 3: dst->u16s[2] = s[2]; // fallthrough
1306
- case 2: dst->u16s[1] = s[1]; // fallthrough
1307
- case 1: dst->u16s[0] = s[0]; // fallthrough
1308
- case 0: break;
1309
- }
1592
+ NK_INTERNAL void nk_f32_to_u8_serial(nk_f32_t const *x, nk_u8_t *y) {
1593
+ if (*x != *x) *y = 0; // For IEEE floating-point, NaN is the one value that is not equal to itself
1594
+ else *y = (nk_u8_t)nk_rint_even_f64_to_u64_serial_(*x > 255.0f ? 255.0 : (*x < 0 ? 0.0 : (nk_f64_t)*x));
1310
1595
  }
1311
1596
 
1312
- /** @brief Type-agnostic partial load for 8-bit elements (16 elements max) into 128-bit vector. */
1313
- NK_INTERNAL void nk_partial_load_b8x16_serial_(void const *src, nk_b128_vec_t *dst, nk_size_t n) {
1314
- dst->u64s[0] = 0, dst->u64s[1] = 0;
1315
- nk_u8_t const *s = (nk_u8_t const *)src;
1316
- switch (n) {
1317
- default:
1318
- case 16: dst->u8s[15] = s[15]; // fallthrough
1319
- case 15: dst->u8s[14] = s[14]; // fallthrough
1320
- case 14: dst->u8s[13] = s[13]; // fallthrough
1321
- case 13: dst->u8s[12] = s[12]; // fallthrough
1322
- case 12: dst->u8s[11] = s[11]; // fallthrough
1323
- case 11: dst->u8s[10] = s[10]; // fallthrough
1324
- case 10: dst->u8s[9] = s[9]; // fallthrough
1325
- case 9: dst->u8s[8] = s[8]; // fallthrough
1326
- case 8: dst->u8s[7] = s[7]; // fallthrough
1327
- case 7: dst->u8s[6] = s[6]; // fallthrough
1328
- case 6: dst->u8s[5] = s[5]; // fallthrough
1329
- case 5: dst->u8s[4] = s[4]; // fallthrough
1330
- case 4: dst->u8s[3] = s[3]; // fallthrough
1331
- case 3: dst->u8s[2] = s[2]; // fallthrough
1332
- case 2: dst->u8s[1] = s[1]; // fallthrough
1333
- case 1: dst->u8s[0] = s[0]; // fallthrough
1334
- case 0: break;
1335
- }
1597
+ NK_INTERNAL void nk_f32_to_i16_serial(nk_f32_t const *x, nk_i16_t *y) {
1598
+ if (*x != *x) *y = 0; // For IEEE floating-point, NaN is the one value that is not equal to itself
1599
+ else
1600
+ *y = (nk_i16_t)nk_rint_even_f64_to_i64_serial_(*x > 32767.0f ? 32767.0
1601
+ : (*x < -32768.0f ? -32768.0 : (nk_f64_t)*x));
1336
1602
  }
1337
1603
 
1338
- /** @brief Type-agnostic partial load for 16-bit elements (16 elements max) into 256-bit vector. */
1339
- NK_INTERNAL void nk_partial_load_b16x16_serial_(void const *src, nk_b256_vec_t *dst, nk_size_t n) {
1340
- dst->u64s[0] = 0, dst->u64s[1] = 0, dst->u64s[2] = 0, dst->u64s[3] = 0;
1341
- nk_u16_t const *s = (nk_u16_t const *)src;
1342
- switch (n) {
1343
- default:
1344
- case 16: dst->u16s[15] = s[15]; // fallthrough
1345
- case 15: dst->u16s[14] = s[14]; // fallthrough
1346
- case 14: dst->u16s[13] = s[13]; // fallthrough
1347
- case 13: dst->u16s[12] = s[12]; // fallthrough
1348
- case 12: dst->u16s[11] = s[11]; // fallthrough
1349
- case 11: dst->u16s[10] = s[10]; // fallthrough
1350
- case 10: dst->u16s[9] = s[9]; // fallthrough
1351
- case 9: dst->u16s[8] = s[8]; // fallthrough
1352
- case 8: dst->u16s[7] = s[7]; // fallthrough
1353
- case 7: dst->u16s[6] = s[6]; // fallthrough
1354
- case 6: dst->u16s[5] = s[5]; // fallthrough
1355
- case 5: dst->u16s[4] = s[4]; // fallthrough
1356
- case 4: dst->u16s[3] = s[3]; // fallthrough
1357
- case 3: dst->u16s[2] = s[2]; // fallthrough
1358
- case 2: dst->u16s[1] = s[1]; // fallthrough
1359
- case 1: dst->u16s[0] = s[0]; // fallthrough
1360
- case 0: break;
1361
- }
1604
+ NK_INTERNAL void nk_f32_to_u16_serial(nk_f32_t const *x, nk_u16_t *y) {
1605
+ if (*x != *x) *y = 0; // For IEEE floating-point, NaN is the one value that is not equal to itself
1606
+ else *y = (nk_u16_t)nk_rint_even_f64_to_u64_serial_(*x > 65535.0f ? 65535.0 : (*x < 0 ? 0.0 : (nk_f64_t)*x));
1362
1607
  }
1363
1608
 
1364
- /** @brief Partial load for 8-bit elements (32 max) into 256-bit vector (zeros in remaining slots). */
1365
- NK_INTERNAL void nk_partial_load_b8x32_serial_(void const *src, nk_b256_vec_t *dst, nk_size_t n) {
1366
- dst->u64s[0] = 0, dst->u64s[1] = 0, dst->u64s[2] = 0, dst->u64s[3] = 0;
1367
- nk_u8_t const *s = (nk_u8_t const *)src;
1368
- switch (n) {
1369
- default:
1370
- case 32: dst->u8s[31] = s[31]; // fallthrough
1371
- case 31: dst->u8s[30] = s[30]; // fallthrough
1372
- case 30: dst->u8s[29] = s[29]; // fallthrough
1373
- case 29: dst->u8s[28] = s[28]; // fallthrough
1374
- case 28: dst->u8s[27] = s[27]; // fallthrough
1375
- case 27: dst->u8s[26] = s[26]; // fallthrough
1376
- case 26: dst->u8s[25] = s[25]; // fallthrough
1377
- case 25: dst->u8s[24] = s[24]; // fallthrough
1378
- case 24: dst->u8s[23] = s[23]; // fallthrough
1379
- case 23: dst->u8s[22] = s[22]; // fallthrough
1380
- case 22: dst->u8s[21] = s[21]; // fallthrough
1381
- case 21: dst->u8s[20] = s[20]; // fallthrough
1382
- case 20: dst->u8s[19] = s[19]; // fallthrough
1383
- case 19: dst->u8s[18] = s[18]; // fallthrough
1384
- case 18: dst->u8s[17] = s[17]; // fallthrough
1385
- case 17: dst->u8s[16] = s[16]; // fallthrough
1386
- case 16: dst->u8s[15] = s[15]; // fallthrough
1387
- case 15: dst->u8s[14] = s[14]; // fallthrough
1388
- case 14: dst->u8s[13] = s[13]; // fallthrough
1389
- case 13: dst->u8s[12] = s[12]; // fallthrough
1390
- case 12: dst->u8s[11] = s[11]; // fallthrough
1391
- case 11: dst->u8s[10] = s[10]; // fallthrough
1392
- case 10: dst->u8s[9] = s[9]; // fallthrough
1393
- case 9: dst->u8s[8] = s[8]; // fallthrough
1394
- case 8: dst->u8s[7] = s[7]; // fallthrough
1395
- case 7: dst->u8s[6] = s[6]; // fallthrough
1396
- case 6: dst->u8s[5] = s[5]; // fallthrough
1397
- case 5: dst->u8s[4] = s[4]; // fallthrough
1398
- case 4: dst->u8s[3] = s[3]; // fallthrough
1399
- case 3: dst->u8s[2] = s[2]; // fallthrough
1400
- case 2: dst->u8s[1] = s[1]; // fallthrough
1401
- case 1: dst->u8s[0] = s[0]; // fallthrough
1402
- case 0: break;
1403
- }
1609
+ NK_INTERNAL void nk_f64_to_i8_serial(nk_f64_t const *x, nk_i8_t *y) {
1610
+ if (*x != *x) *y = 0; // For IEEE floating-point, NaN is the one value that is not equal to itself
1611
+ else *y = (nk_i8_t)nk_rint_even_f64_to_i64_serial_(*x > 127.0 ? 127.0 : (*x < -128.0 ? -128.0 : *x));
1404
1612
  }
1405
1613
 
1406
- /** @brief Type-agnostic partial store for 32-bit elements (8 elements max) from 256-bit vector. */
1407
- NK_INTERNAL void nk_partial_store_b32x8_serial_(nk_b256_vec_t const *src, void *dst, nk_size_t n) {
1408
- nk_u32_t *d = (nk_u32_t *)dst;
1409
- switch (n) {
1410
- default:
1411
- case 8: d[7] = src->u32s[7]; // fallthrough
1412
- case 7: d[6] = src->u32s[6]; // fallthrough
1413
- case 6: d[5] = src->u32s[5]; // fallthrough
1414
- case 5: d[4] = src->u32s[4]; // fallthrough
1415
- case 4: d[3] = src->u32s[3]; // fallthrough
1416
- case 3: d[2] = src->u32s[2]; // fallthrough
1417
- case 2: d[1] = src->u32s[1]; // fallthrough
1418
- case 1: d[0] = src->u32s[0]; // fallthrough
1419
- case 0: break;
1420
- }
1614
+ NK_INTERNAL void nk_f64_to_u8_serial(nk_f64_t const *x, nk_u8_t *y) {
1615
+ if (*x != *x) *y = 0; // For IEEE floating-point, NaN is the one value that is not equal to itself
1616
+ else *y = (nk_u8_t)nk_rint_even_f64_to_u64_serial_(*x > 255.0 ? 255.0 : (*x < 0 ? 0.0 : *x));
1421
1617
  }
1422
1618
 
1423
- /** @brief Type-agnostic partial store for 32-bit elements (4 elements max) from 128-bit vector. */
1424
- NK_INTERNAL void nk_partial_store_b32x4_serial_(nk_b128_vec_t const *src, void *dst, nk_size_t n) {
1425
- nk_u32_t *d = (nk_u32_t *)dst;
1426
- switch (n) {
1427
- default:
1428
- case 4: d[3] = src->u32s[3]; // fallthrough
1429
- case 3: d[2] = src->u32s[2]; // fallthrough
1430
- case 2: d[1] = src->u32s[1]; // fallthrough
1431
- case 1: d[0] = src->u32s[0]; // fallthrough
1432
- case 0: break;
1433
- }
1619
+ NK_INTERNAL void nk_f64_to_i16_serial(nk_f64_t const *x, nk_i16_t *y) {
1620
+ if (*x != *x) *y = 0; // For IEEE floating-point, NaN is the one value that is not equal to itself
1621
+ else *y = (nk_i16_t)nk_rint_even_f64_to_i64_serial_(*x > 32767.0 ? 32767.0 : (*x < -32768.0 ? -32768.0 : *x));
1434
1622
  }
1435
1623
 
1436
- /** @brief Type-agnostic partial store for 16-bit elements (8 elements max) from 128-bit vector. */
1437
- NK_INTERNAL void nk_partial_store_b16x8_serial_(nk_b128_vec_t const *src, void *dst, nk_size_t n) {
1438
- nk_u16_t *d = (nk_u16_t *)dst;
1439
- switch (n) {
1440
- default:
1441
- case 8: d[7] = src->u16s[7]; // fallthrough
1442
- case 7: d[6] = src->u16s[6]; // fallthrough
1443
- case 6: d[5] = src->u16s[5]; // fallthrough
1444
- case 5: d[4] = src->u16s[4]; // fallthrough
1445
- case 4: d[3] = src->u16s[3]; // fallthrough
1446
- case 3: d[2] = src->u16s[2]; // fallthrough
1447
- case 2: d[1] = src->u16s[1]; // fallthrough
1448
- case 1: d[0] = src->u16s[0]; // fallthrough
1449
- case 0: break;
1450
- }
1624
+ NK_INTERNAL void nk_f64_to_u16_serial(nk_f64_t const *x, nk_u16_t *y) {
1625
+ if (*x != *x) *y = 0; // For IEEE floating-point, NaN is the one value that is not equal to itself
1626
+ else *y = (nk_u16_t)nk_rint_even_f64_to_u64_serial_(*x > 65535.0 ? 65535.0 : (*x < 0 ? 0.0 : *x));
1451
1627
  }
1452
1628
 
1453
- /** @brief Type-agnostic partial store for 16-bit elements (4 elements max) from 64-bit vector. */
1454
- NK_INTERNAL void nk_partial_store_b16x4_serial_(void *dst, nk_b64_vec_t const *src, nk_size_t n) {
1455
- nk_u16_t *d = (nk_u16_t *)dst;
1456
- switch (n) {
1457
- default:
1458
- case 4: d[3] = src->u16s[3]; // fallthrough
1459
- case 3: d[2] = src->u16s[2]; // fallthrough
1460
- case 2: d[1] = src->u16s[1]; // fallthrough
1461
- case 1: d[0] = src->u16s[0]; // fallthrough
1462
- case 0: break;
1463
- }
1629
+ NK_INTERNAL void nk_f64_to_i32_serial(nk_f64_t const *x, nk_i32_t *y) {
1630
+ if (*x != *x) *y = 0; // For IEEE floating-point, NaN is the one value that is not equal to itself
1631
+ else
1632
+ *y = (nk_i32_t)nk_rint_even_f64_to_i64_serial_(*x > 2147483647.0 ? 2147483647.0
1633
+ : (*x < -2147483648.0 ? -2147483648.0 : *x));
1464
1634
  }
1465
1635
 
1466
- /** @brief Type-agnostic partial store for 8-bit elements (8 elements max) from 64-bit vector. */
1467
- NK_INTERNAL void nk_partial_store_b8x8_serial_(nk_b64_vec_t const *src, void *dst, nk_size_t n) {
1468
- nk_u8_t *d = (nk_u8_t *)dst;
1469
- switch (n) {
1470
- default:
1471
- case 8: d[7] = src->u8s[7]; // fallthrough
1472
- case 7: d[6] = src->u8s[6]; // fallthrough
1473
- case 6: d[5] = src->u8s[5]; // fallthrough
1474
- case 5: d[4] = src->u8s[4]; // fallthrough
1475
- case 4: d[3] = src->u8s[3]; // fallthrough
1476
- case 3: d[2] = src->u8s[2]; // fallthrough
1477
- case 2: d[1] = src->u8s[1]; // fallthrough
1478
- case 1: d[0] = src->u8s[0]; // fallthrough
1479
- case 0: break;
1480
- }
1636
+ NK_INTERNAL void nk_f64_to_u32_serial(nk_f64_t const *x, nk_u32_t *y) {
1637
+ if (*x != *x) *y = 0; // For IEEE floating-point, NaN is the one value that is not equal to itself
1638
+ else *y = (nk_u32_t)nk_rint_even_f64_to_u64_serial_(*x > 4294967295.0 ? 4294967295.0 : (*x < 0 ? 0.0 : *x));
1481
1639
  }
1482
1640
 
1483
- /** @brief Type-agnostic partial load for 64-bit elements (4 elements max) into 256-bit vector. */
1484
- NK_INTERNAL void nk_partial_load_b64x4_serial_(void const *src, nk_b256_vec_t *dst, nk_size_t n) {
1485
- nk_u64_t const *s = (nk_u64_t const *)src;
1486
- dst->u64s[0] = 0, dst->u64s[1] = 0, dst->u64s[2] = 0, dst->u64s[3] = 0;
1487
- switch (n) {
1488
- default:
1489
- case 4: dst->u64s[3] = s[3]; // fallthrough
1490
- case 3: dst->u64s[2] = s[2]; // fallthrough
1491
- case 2: dst->u64s[1] = s[1]; // fallthrough
1492
- case 1: dst->u64s[0] = s[0]; // fallthrough
1493
- case 0: break;
1494
- }
1641
+ NK_INTERNAL void nk_f64_to_i64_serial(nk_f64_t const *x, nk_i64_t *y) {
1642
+ if (*x != *x) *y = 0; // For IEEE floating-point, NaN is the one value that is not equal to itself
1643
+ else
1644
+ *y = nk_rint_even_f64_to_i64_serial_(*x > 9223372036854775807.0
1645
+ ? 9223372036854775807.0
1646
+ : (*x < -9223372036854775808.0 ? -9223372036854775808.0 : *x));
1495
1647
  }
1496
1648
 
1497
- /** @brief Type-agnostic partial store for 64-bit elements (4 elements max) from 256-bit vector. */
1498
- NK_INTERNAL void nk_partial_store_b64x4_serial_(nk_b256_vec_t const *src, void *dst, nk_size_t n) {
1499
- nk_u64_t *d = (nk_u64_t *)dst;
1500
- switch (n) {
1501
- default:
1502
- case 4: d[3] = src->u64s[3]; // fallthrough
1503
- case 3: d[2] = src->u64s[2]; // fallthrough
1504
- case 2: d[1] = src->u64s[1]; // fallthrough
1505
- case 1: d[0] = src->u64s[0]; // fallthrough
1506
- case 0: break;
1507
- }
1649
+ NK_INTERNAL void nk_f64_to_u64_serial(nk_f64_t const *x, nk_u64_t *y) {
1650
+ if (*x != *x) *y = 0; // For IEEE floating-point, NaN is the one value that is not equal to itself
1651
+ else
1652
+ *y = nk_rint_even_f64_to_u64_serial_(*x > 18446744073709551615.0 ? 18446744073709551615.0
1653
+ : (*x < 0 ? 0.0 : *x));
1508
1654
  }
1509
1655
 
1510
- /** @brief Type-agnostic partial load for 32-bit elements (2 elements max) into 64-bit vector. */
1511
- NK_INTERNAL void nk_partial_load_b32x2_serial_(void const *src, nk_b64_vec_t *dst, nk_size_t n) {
1512
- dst->u64 = 0;
1513
- nk_u32_t const *s = (nk_u32_t const *)src;
1514
- switch (n) {
1515
- default:
1516
- case 2: dst->u32s[1] = s[1]; // fallthrough
1517
- case 1: dst->u32s[0] = s[0]; // fallthrough
1518
- case 0: break;
1519
- }
1656
+ NK_INTERNAL void nk_i64_to_i8_serial(nk_i64_t const *x, nk_i8_t *y) {
1657
+ *y = (nk_i8_t)(*x > 127ll ? 127ll : (*x < -128ll ? -128ll : *x));
1658
+ }
1659
+
1660
+ NK_INTERNAL void nk_i64_to_u8_serial(nk_i64_t const *x, nk_u8_t *y) {
1661
+ *y = (nk_u8_t)(*x > 255ll ? 255ll : (*x < 0ll ? 0ll : *x));
1662
+ }
1663
+
1664
+ NK_INTERNAL void nk_i64_to_i16_serial(nk_i64_t const *x, nk_i16_t *y) {
1665
+ *y = (nk_i16_t)(*x > 32767ll ? 32767ll : (*x < -32768ll ? -32768ll : *x));
1520
1666
  }
1521
1667
 
1522
- /** @brief Type-agnostic partial load for 16-bit elements (4 elements max) into 64-bit vector. */
1523
- NK_INTERNAL void nk_partial_load_b16x4_serial_(void const *src, nk_b64_vec_t *dst, nk_size_t n) {
1524
- dst->u64 = 0;
1525
- nk_u16_t const *s = (nk_u16_t const *)src;
1526
- switch (n) {
1527
- default:
1528
- case 4: dst->u16s[3] = s[3]; // fallthrough
1529
- case 3: dst->u16s[2] = s[2]; // fallthrough
1530
- case 2: dst->u16s[1] = s[1]; // fallthrough
1531
- case 1: dst->u16s[0] = s[0]; // fallthrough
1532
- case 0: break;
1533
- }
1668
+ NK_INTERNAL void nk_i64_to_u16_serial(nk_i64_t const *x, nk_u16_t *y) {
1669
+ *y = (nk_u16_t)(*x > 65535ll ? 65535ll : (*x < 0ll ? 0ll : *x));
1534
1670
  }
1535
1671
 
1536
- /** @brief Partial load for 4-bit nibbles (64 max = 32 bytes) into 256-bit vector (zeros in remaining slots). */
1537
- NK_INTERNAL void nk_partial_load_b4x64_serial_(void const *src, nk_b256_vec_t *dst, nk_size_t n) {
1538
- dst->u64s[0] = 0, dst->u64s[1] = 0, dst->u64s[2] = 0, dst->u64s[3] = 0;
1539
- nk_u8_t const *s = (nk_u8_t const *)src;
1540
- nk_size_t n_bytes = nk_size_divide_round_up_(n, 2);
1541
- for (nk_size_t i = 0; i < n_bytes && i < 32; i++) dst->u8s[i] = s[i];
1672
+ NK_INTERNAL void nk_i64_to_i32_serial(nk_i64_t const *x, nk_i32_t *y) {
1673
+ *y = (nk_i32_t)(*x > 2147483647ll ? 2147483647ll : (*x < -2147483648ll ? -2147483648ll : *x));
1542
1674
  }
1543
1675
 
1544
- /** @brief Partial load for 4-bit nibbles (32 max = 16 bytes) into 128-bit vector (zeros in remaining slots). */
1545
- NK_INTERNAL void nk_partial_load_b4x32_serial_(void const *src, nk_b128_vec_t *dst, nk_size_t n) {
1546
- dst->u64s[0] = 0, dst->u64s[1] = 0;
1547
- nk_u8_t const *s = (nk_u8_t const *)src;
1548
- nk_size_t n_bytes = nk_size_divide_round_up_(n, 2);
1549
- for (nk_size_t i = 0; i < n_bytes && i < 16; i++) dst->u8s[i] = s[i];
1676
+ NK_INTERNAL void nk_i64_to_u32_serial(nk_i64_t const *x, nk_u32_t *y) {
1677
+ *y = (nk_u32_t)(*x > 4294967295ll ? 4294967295ll : (*x < 0ll ? 0ll : *x));
1550
1678
  }
1551
1679
 
1552
- /** @brief Partial load for 1-bit elements (128 max = 16 bytes) into 128-bit vector (zeros in remaining slots). */
1553
- NK_INTERNAL void nk_partial_load_b1x128_serial_(void const *src, nk_b128_vec_t *dst, nk_size_t n_bits) {
1554
- dst->u64s[0] = 0, dst->u64s[1] = 0;
1555
- nk_u8_t const *s = (nk_u8_t const *)src;
1556
- nk_size_t n_bytes = nk_size_divide_round_up_(n_bits, 8);
1557
- for (nk_size_t i = 0; i < n_bytes && i < 16; i++) dst->u8s[i] = s[i];
1680
+ NK_INTERNAL void nk_u64_to_i8_serial(nk_u64_t const *x, nk_i8_t *y) { *y = (nk_i8_t)(*x > 127ull ? 127ull : *x); }
1681
+ NK_INTERNAL void nk_u64_to_u8_serial(nk_u64_t const *x, nk_u8_t *y) { *y = (nk_u8_t)(*x > 255ull ? 255ull : *x); }
1682
+ NK_INTERNAL void nk_u64_to_i16_serial(nk_u64_t const *x, nk_i16_t *y) {
1683
+ *y = (nk_i16_t)(*x > 32767ull ? 32767ull : *x);
1684
+ }
1685
+ NK_INTERNAL void nk_u64_to_u16_serial(nk_u64_t const *x, nk_u16_t *y) {
1686
+ *y = (nk_u16_t)(*x > 65535ull ? 65535ull : *x);
1558
1687
  }
1559
1688
 
1560
- /** @brief Partial load for 4-bit nibbles (16 max = 8 bytes) into 64-bit vector (zeros in remaining slots). */
1561
- NK_INTERNAL void nk_partial_load_b4x16_serial_(void const *src, nk_b64_vec_t *dst, nk_size_t n) {
1562
- dst->u64 = 0;
1563
- nk_u8_t const *s = (nk_u8_t const *)src;
1564
- nk_size_t n_bytes = nk_size_divide_round_up_(n, 2);
1565
- for (nk_size_t i = 0; i < n_bytes && i < 8; i++) ((nk_u8_t *)&dst->u64)[i] = s[i];
1689
+ NK_INTERNAL void nk_u64_to_i32_serial(nk_u64_t const *x, nk_i32_t *y) {
1690
+ *y = (nk_i32_t)(*x > 2147483647ull ? 2147483647ull : *x);
1566
1691
  }
1567
1692
 
1568
- NK_INTERNAL void nk_partial_load_b64x2_serial_(void const *src, nk_b128_vec_t *dst, nk_size_t n) {
1569
- dst->u64s[0] = 0, dst->u64s[1] = 0;
1570
- nk_u64_t const *s = (nk_u64_t const *)src;
1571
- switch (n) {
1572
- default:
1573
- case 2: dst->u64s[1] = s[1]; // fallthrough
1574
- case 1: dst->u64s[0] = s[0]; // fallthrough
1575
- case 0: break;
1576
- }
1693
+ NK_INTERNAL void nk_u64_to_u32_serial(nk_u64_t const *x, nk_u32_t *y) {
1694
+ *y = (nk_u32_t)(*x > 4294967295ull ? 4294967295ull : *x);
1577
1695
  }
1578
1696
 
1579
- /** @brief Type-agnostic partial store for 64-bit elements (2 elements max) from 128-bit vector. */
1580
- NK_INTERNAL void nk_partial_store_b64x2_serial_(nk_b128_vec_t const *src, void *dst, nk_size_t n) {
1581
- nk_u64_t *d = (nk_u64_t *)dst;
1582
- switch (n) {
1583
- default:
1584
- case 2: d[1] = src->u64s[1]; // fallthrough
1585
- case 1: d[0] = src->u64s[0]; // fallthrough
1586
- case 0: break;
1587
- }
1697
+ NK_INTERNAL void nk_u64_to_i64_serial(nk_u64_t const *x, nk_i64_t *y) {
1698
+ *y = (nk_i64_t)(*x >= 9223372036854775807ull ? 9223372036854775807ll : *x);
1588
1699
  }
1589
1700
 
1590
- /** @brief Strided partial load for 32-bit elements (4 max) into 128-bit vector. */
1591
- NK_INTERNAL void nk_strided_load_b32x4_serial_(void const *src, nk_size_t stride_elements, nk_b128_vec_t *dst,
1592
- nk_size_t n) {
1593
- dst->u64s[0] = 0, dst->u64s[1] = 0;
1594
- nk_u32_t const *s = (nk_u32_t const *)src;
1595
- for (nk_size_t i = 0; i < n && i < 4; ++i) dst->u32s[i] = s[i * stride_elements];
1701
+ NK_INTERNAL void nk_i8_to_u64_serial(nk_i8_t const *x, nk_u64_t *y) { *y = (nk_u64_t)(*x < 0 ? 0 : *x); }
1702
+ NK_INTERNAL void nk_i16_to_u64_serial(nk_i16_t const *x, nk_u64_t *y) { *y = (nk_u64_t)(*x < 0 ? 0 : *x); }
1703
+ NK_INTERNAL void nk_i32_to_u64_serial(nk_i32_t const *x, nk_u64_t *y) { *y = (nk_u64_t)(*x < 0 ? 0 : *x); }
1704
+ NK_INTERNAL void nk_i64_to_u64_serial(nk_i64_t const *x, nk_u64_t *y) { *y = (nk_u64_t)(*x < 0 ? 0 : *x); }
1705
+
1706
+ NK_INTERNAL void nk_i64_to_f16_serial(nk_i64_t const *x, nk_f16_t *y) {
1707
+ nk_f32_t f32 = (nk_f32_t)*x;
1708
+ nk_f32_to_f16_serial(&f32, y);
1709
+ }
1710
+ NK_INTERNAL void nk_i64_to_bf16_serial(nk_i64_t const *x, nk_bf16_t *y) {
1711
+ nk_f32_t f32 = (nk_f32_t)*x;
1712
+ nk_f32_to_bf16_serial(&f32, y);
1713
+ }
1714
+ NK_INTERNAL void nk_u64_to_f16_serial(nk_u64_t const *x, nk_f16_t *y) {
1715
+ nk_f32_t f32 = (nk_f32_t)*x;
1716
+ nk_f32_to_f16_serial(&f32, y);
1717
+ }
1718
+ NK_INTERNAL void nk_u64_to_bf16_serial(nk_u64_t const *x, nk_bf16_t *y) {
1719
+ nk_f32_t f32 = (nk_f32_t)*x;
1720
+ nk_f32_to_bf16_serial(&f32, y);
1596
1721
  }
1597
1722
 
1598
- /** @brief Strided partial load for 16-bit elements (8 max) into 128-bit vector. */
1599
- NK_INTERNAL void nk_strided_load_b16x8_serial_(void const *src, nk_size_t stride_elements, nk_b128_vec_t *dst,
1600
- nk_size_t n) {
1601
- dst->u64s[0] = 0, dst->u64s[1] = 0;
1602
- nk_u16_t const *s = (nk_u16_t const *)src;
1603
- for (nk_size_t i = 0; i < n && i < 8; ++i) dst->u16s[i] = s[i * stride_elements];
1723
+ /** @brief Convert a pair of i4 (4-bit signed integer, -8 to 7) nibbles into signed integers. */
1724
+ NK_PUBLIC void nk_i4x2_to_i8x2_serial(nk_i4x2_t const *src, nk_i8_t *dest) {
1725
+ nk_u8_t byte = *(nk_u8_t const *)src;
1726
+ nk_u8_t high_nibble = byte >> 4;
1727
+ nk_u8_t low_nibble = byte & 0x0F;
1728
+ // Sign extend: 0-7 0-7, 8-15 -8 to -1
1729
+ dest[0] = (nk_i8_t)((high_nibble ^ 8) - 8);
1730
+ dest[1] = (nk_i8_t)((low_nibble ^ 8) - 8);
1604
1731
  }
1605
1732
 
1606
- /** @brief Strided partial load for 8-bit elements (16 max) into 128-bit vector. */
1607
- NK_INTERNAL void nk_strided_load_b8x16_serial_(void const *src, nk_size_t stride_elements, nk_b128_vec_t *dst,
1608
- nk_size_t n) {
1609
- dst->u64s[0] = 0, dst->u64s[1] = 0;
1610
- nk_u8_t const *s = (nk_u8_t const *)src;
1611
- for (nk_size_t i = 0; i < n && i < 16; ++i) dst->u8s[i] = s[i * stride_elements];
1733
+ /** @brief Convert a pair of u4 (4-bit unsigned integer, 0 to 15) nibbles into unsigned integers. */
1734
+ NK_PUBLIC void nk_u4x2_to_u8x2_serial(nk_u4x2_t const *src, nk_u8_t *dest) {
1735
+ nk_u8_t byte = *(nk_u8_t const *)src;
1736
+ dest[0] = byte >> 4;
1737
+ dest[1] = byte & 0x0F;
1612
1738
  }
1613
1739
 
1614
1740
  /**
@@ -1619,7 +1745,7 @@ NK_INTERNAL void nk_strided_load_b8x16_serial_(void const *src, nk_size_t stride
1619
1745
  * The caller fills the appropriate union member based on the target dtype,
1620
1746
  * then passes the union address as `void const *` to kernel functions.
1621
1747
  */
1622
- typedef union nk_scalar_buffer_t {
1748
+ typedef union NK_MAY_ALIAS_ nk_scalar_buffer_t {
1623
1749
  nk_u8_t bytes[16];
1624
1750
  nk_f64_t f64;
1625
1751
  nk_f32_t f32;
@@ -1639,115 +1765,78 @@ typedef union nk_scalar_buffer_t {
1639
1765
  nk_u8_t u8;
1640
1766
  } nk_scalar_buffer_t;
1641
1767
 
1768
+ /** @brief Reads a typed scalar from @p buf and writes the widened f64c into @p result.
1769
+ * Real types set `.imag = 0`. Safe when @p result aliases @p buf (in-place conversion).
1770
+ * @return 1 on success, 0 for unsupported types (sub-byte, unknown). */
1771
+ NK_INTERNAL int nk_scalar_buffer_to_f64c(nk_scalar_buffer_t const *buf, nk_dtype_t dtype, nk_f64c_t *result) {
1772
+ // Snapshot input so `result` may alias `buf` (e.g. in-place conversion within a union).
1773
+ nk_scalar_buffer_t local;
1774
+ local.f64c = buf->f64c;
1775
+ result->real = 0, result->imag = 0;
1776
+ switch (dtype) {
1777
+ case nk_f64_k: result->real = local.f64; break;
1778
+ case nk_f32_k: result->real = (nk_f64_t)local.f32; break;
1779
+ case nk_f16_k:
1780
+ nk_f16_to_f32_serial(&local.f16, &local.f32);
1781
+ result->real = (nk_f64_t)local.f32;
1782
+ break;
1783
+ case nk_bf16_k:
1784
+ nk_bf16_to_f32_serial(&local.bf16, &local.f32);
1785
+ result->real = (nk_f64_t)local.f32;
1786
+ break;
1787
+ case nk_f64c_k: result->real = local.f64c.real, result->imag = local.f64c.imag; break;
1788
+ case nk_f32c_k: result->real = (nk_f64_t)local.f32c.real, result->imag = (nk_f64_t)local.f32c.imag; break;
1789
+ case nk_f16c_k:
1790
+ nk_f16_to_f32_serial(&local.f16c.real, &local.f32);
1791
+ result->real = (nk_f64_t)local.f32;
1792
+ nk_f16_to_f32_serial(&local.f16c.imag, &local.f32);
1793
+ result->imag = (nk_f64_t)local.f32;
1794
+ break;
1795
+ case nk_bf16c_k:
1796
+ nk_bf16_to_f32_serial(&local.bf16c.real, &local.f32);
1797
+ result->real = (nk_f64_t)local.f32;
1798
+ nk_bf16_to_f32_serial(&local.bf16c.imag, &local.f32);
1799
+ result->imag = (nk_f64_t)local.f32;
1800
+ break;
1801
+ case nk_i64_k: result->real = (nk_f64_t)local.i64; break;
1802
+ case nk_u64_k: result->real = (nk_f64_t)local.u64; break;
1803
+ case nk_i32_k: result->real = (nk_f64_t)local.i32; break;
1804
+ case nk_u32_k: result->real = (nk_f64_t)local.u32; break;
1805
+ case nk_i16_k: result->real = (nk_f64_t)local.i16; break;
1806
+ case nk_u16_k: result->real = (nk_f64_t)local.u16; break;
1807
+ case nk_i8_k: result->real = (nk_f64_t)local.i8; break;
1808
+ case nk_u8_k: result->real = (nk_f64_t)local.u8; break;
1809
+ case nk_e4m3_k:
1810
+ nk_e4m3_to_f32_serial(&local.u8, &local.f32);
1811
+ result->real = (nk_f64_t)local.f32;
1812
+ break;
1813
+ case nk_e5m2_k:
1814
+ nk_e5m2_to_f32_serial(&local.u8, &local.f32);
1815
+ result->real = (nk_f64_t)local.f32;
1816
+ break;
1817
+ case nk_e2m3_k:
1818
+ nk_e2m3_to_f32_serial(&local.u8, &local.f32);
1819
+ result->real = (nk_f64_t)local.f32;
1820
+ break;
1821
+ case nk_e3m2_k:
1822
+ nk_e3m2_to_f32_serial(&local.u8, &local.f32);
1823
+ result->real = (nk_f64_t)local.f32;
1824
+ break;
1825
+ default: return 0;
1826
+ }
1827
+ return 1;
1828
+ }
1829
+
1642
1830
  /**
1643
1831
  * @brief Converts up to 8x values from `from_ptr` buffer into 8x puned buffer objects
1644
1832
  * into a complex 64-bit floating point representation.
1645
1833
  */
1646
- NK_INTERNAL void nk_scalar_buffers_fill_f64c_( //
1834
+ NK_INTERNAL void nk_scalar_buffers_to_f64c_( //
1647
1835
  void const *from_ptr, nk_dtype_t from_dtype, nk_size_t from_count, //
1648
1836
  nk_scalar_buffer_t to_buffers[nk_at_least_(8)]) {
1649
1837
 
1650
- nk_f32_t temporary_f32;
1651
1838
  nk_size_t i;
1652
1839
  switch (from_dtype) {
1653
- case nk_f64_k: {
1654
- nk_f64_t const *p = (nk_f64_t const *)from_ptr;
1655
- for (i = 0; i < from_count; ++i) to_buffers[i].f64c.real = p[i], to_buffers[i].f64c.imag = 0;
1656
- } break;
1657
- case nk_f32_k: {
1658
- nk_f32_t const *p = (nk_f32_t const *)from_ptr;
1659
- for (i = 0; i < from_count; ++i) to_buffers[i].f64c.real = p[i], to_buffers[i].f64c.imag = 0;
1660
- } break;
1661
- case nk_f16_k: {
1662
- nk_f16_t const *p = (nk_f16_t const *)from_ptr;
1663
- for (i = 0; i < from_count; ++i)
1664
- nk_f16_to_f32_serial(&p[i], &temporary_f32), to_buffers[i].f64c.real = temporary_f32,
1665
- to_buffers[i].f64c.imag = 0;
1666
- } break;
1667
- case nk_bf16_k: {
1668
- nk_bf16_t const *p = (nk_bf16_t const *)from_ptr;
1669
- for (i = 0; i < from_count; ++i)
1670
- nk_bf16_to_f32_serial(&p[i], &temporary_f32), to_buffers[i].f64c.real = temporary_f32,
1671
- to_buffers[i].f64c.imag = 0;
1672
- } break;
1673
- case nk_e4m3_k: {
1674
- nk_u8_t const *p = (nk_u8_t const *)from_ptr;
1675
- for (i = 0; i < from_count; ++i)
1676
- nk_e4m3_to_f32_serial(&p[i], &temporary_f32), to_buffers[i].f64c.real = temporary_f32,
1677
- to_buffers[i].f64c.imag = 0;
1678
- } break;
1679
- case nk_e5m2_k: {
1680
- nk_u8_t const *p = (nk_u8_t const *)from_ptr;
1681
- for (i = 0; i < from_count; ++i)
1682
- nk_e5m2_to_f32_serial(&p[i], &temporary_f32), to_buffers[i].f64c.real = temporary_f32,
1683
- to_buffers[i].f64c.imag = 0;
1684
- } break;
1685
- case nk_e2m3_k: {
1686
- nk_u8_t const *p = (nk_u8_t const *)from_ptr;
1687
- for (i = 0; i < from_count; ++i)
1688
- nk_e2m3_to_f32_serial(&p[i], &temporary_f32), to_buffers[i].f64c.real = temporary_f32,
1689
- to_buffers[i].f64c.imag = 0;
1690
- } break;
1691
- case nk_e3m2_k: {
1692
- nk_u8_t const *p = (nk_u8_t const *)from_ptr;
1693
- for (i = 0; i < from_count; ++i)
1694
- nk_e3m2_to_f32_serial(&p[i], &temporary_f32), to_buffers[i].f64c.real = temporary_f32,
1695
- to_buffers[i].f64c.imag = 0;
1696
- } break;
1697
- case nk_i64_k: {
1698
- nk_i64_t const *p = (nk_i64_t const *)from_ptr;
1699
- for (i = 0; i < from_count; ++i) to_buffers[i].f64c.real = (nk_f64_t)p[i], to_buffers[i].f64c.imag = 0;
1700
- } break;
1701
- case nk_i32_k: {
1702
- nk_i32_t const *p = (nk_i32_t const *)from_ptr;
1703
- for (i = 0; i < from_count; ++i) to_buffers[i].f64c.real = p[i], to_buffers[i].f64c.imag = 0;
1704
- } break;
1705
- case nk_i16_k: {
1706
- nk_i16_t const *p = (nk_i16_t const *)from_ptr;
1707
- for (i = 0; i < from_count; ++i) to_buffers[i].f64c.real = p[i], to_buffers[i].f64c.imag = 0;
1708
- } break;
1709
- case nk_i8_k: {
1710
- nk_i8_t const *p = (nk_i8_t const *)from_ptr;
1711
- for (i = 0; i < from_count; ++i) to_buffers[i].f64c.real = p[i], to_buffers[i].f64c.imag = 0;
1712
- } break;
1713
- case nk_u64_k: {
1714
- nk_u64_t const *p = (nk_u64_t const *)from_ptr;
1715
- for (i = 0; i < from_count; ++i) to_buffers[i].f64c.real = (nk_f64_t)p[i], to_buffers[i].f64c.imag = 0;
1716
- } break;
1717
- case nk_u32_k: {
1718
- nk_u32_t const *p = (nk_u32_t const *)from_ptr;
1719
- for (i = 0; i < from_count; ++i) to_buffers[i].f64c.real = p[i], to_buffers[i].f64c.imag = 0;
1720
- } break;
1721
- case nk_u16_k: {
1722
- nk_u16_t const *p = (nk_u16_t const *)from_ptr;
1723
- for (i = 0; i < from_count; ++i) to_buffers[i].f64c.real = p[i], to_buffers[i].f64c.imag = 0;
1724
- } break;
1725
- case nk_u8_k: {
1726
- nk_u8_t const *p = (nk_u8_t const *)from_ptr;
1727
- for (i = 0; i < from_count; ++i) to_buffers[i].f64c.real = p[i], to_buffers[i].f64c.imag = 0;
1728
- } break;
1729
- case nk_f64c_k: {
1730
- nk_f64c_t const *p = (nk_f64c_t const *)from_ptr;
1731
- for (i = 0; i < from_count; ++i) to_buffers[i].f64c = p[i];
1732
- } break;
1733
- case nk_f32c_k: {
1734
- nk_f32c_t const *p = (nk_f32c_t const *)from_ptr;
1735
- for (i = 0; i < from_count; ++i) to_buffers[i].f64c.real = p[i].real, to_buffers[i].f64c.imag = p[i].imag;
1736
- } break;
1737
- case nk_f16c_k: {
1738
- nk_f16c_t const *p = (nk_f16c_t const *)from_ptr;
1739
- for (i = 0; i < from_count; ++i) {
1740
- nk_f16_to_f32_serial(&p[i].real, &temporary_f32), to_buffers[i].f64c.real = temporary_f32;
1741
- nk_f16_to_f32_serial(&p[i].imag, &temporary_f32), to_buffers[i].f64c.imag = temporary_f32;
1742
- }
1743
- } break;
1744
- case nk_bf16c_k: {
1745
- nk_bf16c_t const *p = (nk_bf16c_t const *)from_ptr;
1746
- for (i = 0; i < from_count; ++i) {
1747
- nk_bf16_to_f32_serial(&p[i].real, &temporary_f32), to_buffers[i].f64c.real = temporary_f32;
1748
- nk_bf16_to_f32_serial(&p[i].imag, &temporary_f32), to_buffers[i].f64c.imag = temporary_f32;
1749
- }
1750
- } break;
1751
1840
  // Sub-byte: u1 - 8 bits from 1 byte, MSB-first
1752
1841
  case nk_u1_k: {
1753
1842
  nk_u8_t byte = *(nk_u8_t const *)from_ptr;
@@ -1755,130 +1844,117 @@ NK_INTERNAL void nk_scalar_buffers_fill_f64c_( //
1755
1844
  } break;
1756
1845
  // Sub-byte: i4 - 8 nibbles from 4 bytes, high nibble = even index, sign-extended
1757
1846
  case nk_i4_k: {
1758
- nk_u8_t const *p = (nk_u8_t const *)from_ptr;
1847
+ nk_i4x2_t const *pairs = (nk_i4x2_t const *)from_ptr;
1848
+ nk_i8_t unpacked[2];
1759
1849
  for (i = 0; i < 4; ++i) {
1760
- nk_i8_t hi = (nk_i8_t)(p[i] >> 4), lo = (nk_i8_t)(p[i] & 0xF);
1761
- to_buffers[i * 2].f64c.real = (hi ^ 8) - 8, to_buffers[i * 2].f64c.imag = 0;
1762
- to_buffers[i * 2 + 1].f64c.real = (lo ^ 8) - 8, to_buffers[i * 2 + 1].f64c.imag = 0;
1850
+ nk_i4x2_to_i8x2_serial(&pairs[i], unpacked);
1851
+ to_buffers[i * 2].f64c.real = unpacked[0], to_buffers[i * 2].f64c.imag = 0;
1852
+ to_buffers[i * 2 + 1].f64c.real = unpacked[1], to_buffers[i * 2 + 1].f64c.imag = 0;
1763
1853
  }
1764
1854
  } break;
1765
1855
  // Sub-byte: u4 - 8 nibbles from 4 bytes, high nibble = even index
1766
1856
  case nk_u4_k: {
1767
- nk_u8_t const *p = (nk_u8_t const *)from_ptr;
1857
+ nk_u4x2_t const *pairs = (nk_u4x2_t const *)from_ptr;
1858
+ nk_u8_t unpacked[2];
1768
1859
  for (i = 0; i < 4; ++i) {
1769
- to_buffers[i * 2].f64c.real = p[i] >> 4, to_buffers[i * 2].f64c.imag = 0;
1770
- to_buffers[i * 2 + 1].f64c.real = p[i] & 0xF, to_buffers[i * 2 + 1].f64c.imag = 0;
1860
+ nk_u4x2_to_u8x2_serial(&pairs[i], unpacked);
1861
+ to_buffers[i * 2].f64c.real = unpacked[0], to_buffers[i * 2].f64c.imag = 0;
1862
+ to_buffers[i * 2 + 1].f64c.real = unpacked[1], to_buffers[i * 2 + 1].f64c.imag = 0;
1771
1863
  }
1772
1864
  } break;
1773
- default:
1774
- for (i = 0; i < 8; ++i) to_buffers[i].f64c.real = 0, to_buffers[i].f64c.imag = 0;
1865
+ // All byte-or-larger types: stage through a separate buffer to avoid
1866
+ // variable-length memcpy and type-punned read on the same union
1867
+ // a pattern that triggers an ICE in MSVC's ARM64 optimizer (C1001).
1868
+ default: {
1869
+ nk_size_t stride = nk_dtype_bits(from_dtype) / NK_BITS_PER_BYTE;
1870
+ nk_scalar_buffer_t staged;
1871
+ for (i = 0; i < from_count; ++i) {
1872
+ staged.u64 = 0;
1873
+ nk_copy_bytes_(&staged, (char const *)from_ptr + i * stride, stride);
1874
+ nk_scalar_buffer_to_f64c(&staged, from_dtype, &to_buffers[i].f64c);
1875
+ }
1876
+ } break;
1877
+ }
1878
+ }
1879
+
1880
+ /** @brief Narrows an f64c @p value into the appropriate typed member of @p buf.
1881
+ * Real types use only `.real`; complex types use both components.
1882
+ * Safe when @p value aliases @p buf (in-place conversion).
1883
+ * @note Integer targets (i64, i32, ...) go through f64 rounding — values beyond 2^53 may lose precision.
1884
+ * @return 1 on success, 0 for unsupported types (sub-byte, unknown). */
1885
+ NK_INTERNAL int nk_scalar_buffer_from_f64c(nk_f64c_t const *value, nk_scalar_buffer_t *buf, nk_dtype_t dtype) {
1886
+ // Snapshot input so `value` may point into `buf` (e.g. in-place conversion within a union).
1887
+ nk_f64c_t local = *value;
1888
+ nk_f32_t temporary_f32;
1889
+ switch (dtype) {
1890
+ case nk_f64_k: buf->f64 = local.real; break;
1891
+ case nk_f32_k: buf->f32 = (nk_f32_t)local.real; break;
1892
+ case nk_f16_k:
1893
+ temporary_f32 = (nk_f32_t)local.real;
1894
+ nk_f32_to_f16_serial(&temporary_f32, &buf->f16);
1895
+ break;
1896
+ case nk_bf16_k:
1897
+ temporary_f32 = (nk_f32_t)local.real;
1898
+ nk_f32_to_bf16_serial(&temporary_f32, &buf->bf16);
1899
+ break;
1900
+ case nk_f64c_k:
1901
+ buf->f64c.real = local.real;
1902
+ buf->f64c.imag = local.imag;
1903
+ break;
1904
+ case nk_f32c_k:
1905
+ buf->f32c.real = (nk_f32_t)local.real;
1906
+ buf->f32c.imag = (nk_f32_t)local.imag;
1907
+ break;
1908
+ case nk_f16c_k:
1909
+ temporary_f32 = (nk_f32_t)local.real;
1910
+ nk_f32_to_f16_serial(&temporary_f32, &buf->f16c.real);
1911
+ temporary_f32 = (nk_f32_t)local.imag;
1912
+ nk_f32_to_f16_serial(&temporary_f32, &buf->f16c.imag);
1913
+ break;
1914
+ case nk_bf16c_k:
1915
+ temporary_f32 = (nk_f32_t)local.real;
1916
+ nk_f32_to_bf16_serial(&temporary_f32, &buf->bf16c.real);
1917
+ temporary_f32 = (nk_f32_t)local.imag;
1918
+ nk_f32_to_bf16_serial(&temporary_f32, &buf->bf16c.imag);
1775
1919
  break;
1920
+ case nk_i64_k: nk_f64_to_i64_serial(&local.real, &buf->i64); break;
1921
+ case nk_u64_k: nk_f64_to_u64_serial(&local.real, &buf->u64); break;
1922
+ case nk_i32_k: nk_f64_to_i32_serial(&local.real, &buf->i32); break;
1923
+ case nk_u32_k: nk_f64_to_u32_serial(&local.real, &buf->u32); break;
1924
+ case nk_i16_k: nk_f64_to_i16_serial(&local.real, &buf->i16); break;
1925
+ case nk_u16_k: nk_f64_to_u16_serial(&local.real, &buf->u16); break;
1926
+ case nk_i8_k: nk_f64_to_i8_serial(&local.real, &buf->i8); break;
1927
+ case nk_u8_k: nk_f64_to_u8_serial(&local.real, &buf->u8); break;
1928
+ case nk_e4m3_k:
1929
+ temporary_f32 = (nk_f32_t)local.real;
1930
+ nk_f32_to_e4m3_serial(&temporary_f32, &buf->u8);
1931
+ break;
1932
+ case nk_e5m2_k:
1933
+ temporary_f32 = (nk_f32_t)local.real;
1934
+ nk_f32_to_e5m2_serial(&temporary_f32, &buf->u8);
1935
+ break;
1936
+ case nk_e2m3_k:
1937
+ temporary_f32 = (nk_f32_t)local.real;
1938
+ nk_f32_to_e2m3_serial(&temporary_f32, &buf->u8);
1939
+ break;
1940
+ case nk_e3m2_k:
1941
+ temporary_f32 = (nk_f32_t)local.real;
1942
+ nk_f32_to_e3m2_serial(&temporary_f32, &buf->u8);
1943
+ break;
1944
+ default: return 0;
1776
1945
  }
1946
+ return 1;
1777
1947
  }
1778
1948
 
1779
1949
  /**
1780
1950
  * @brief Converts up to 8x values from `from_buffers` buffer into 8x typed scalars.
1781
1951
  */
1782
- NK_INTERNAL void nk_scalar_buffers_export_f64c_( //
1952
+ NK_INTERNAL void nk_scalar_buffers_from_f64c_( //
1783
1953
  nk_scalar_buffer_t const from_buffers[nk_at_least_(8)], //
1784
1954
  void *to_ptr, nk_dtype_t to_dtype, nk_size_t to_count) {
1785
1955
 
1786
- nk_f32_t temporary_f32;
1787
1956
  nk_size_t i;
1788
1957
  switch (to_dtype) {
1789
- case nk_f64_k: {
1790
- nk_f64_t *p = (nk_f64_t *)to_ptr;
1791
- for (i = 0; i < to_count; ++i) p[i] = from_buffers[i].f64c.real;
1792
- } break;
1793
- case nk_f32_k: {
1794
- nk_f32_t *p = (nk_f32_t *)to_ptr;
1795
- for (i = 0; i < to_count; ++i) p[i] = (nk_f32_t)from_buffers[i].f64c.real;
1796
- } break;
1797
- case nk_f16_k: {
1798
- nk_f16_t *p = (nk_f16_t *)to_ptr;
1799
- for (i = 0; i < to_count; ++i)
1800
- temporary_f32 = (nk_f32_t)from_buffers[i].f64c.real, nk_f32_to_f16_serial(&temporary_f32, &p[i]);
1801
- } break;
1802
- case nk_bf16_k: {
1803
- nk_bf16_t *p = (nk_bf16_t *)to_ptr;
1804
- for (i = 0; i < to_count; ++i)
1805
- temporary_f32 = (nk_f32_t)from_buffers[i].f64c.real, nk_f32_to_bf16_serial(&temporary_f32, &p[i]);
1806
- } break;
1807
- case nk_e4m3_k: {
1808
- nk_u8_t *p = (nk_u8_t *)to_ptr;
1809
- for (i = 0; i < to_count; ++i)
1810
- temporary_f32 = (nk_f32_t)from_buffers[i].f64c.real, nk_f32_to_e4m3_serial(&temporary_f32, &p[i]);
1811
- } break;
1812
- case nk_e5m2_k: {
1813
- nk_u8_t *p = (nk_u8_t *)to_ptr;
1814
- for (i = 0; i < to_count; ++i)
1815
- temporary_f32 = (nk_f32_t)from_buffers[i].f64c.real, nk_f32_to_e5m2_serial(&temporary_f32, &p[i]);
1816
- } break;
1817
- case nk_e2m3_k: {
1818
- nk_u8_t *p = (nk_u8_t *)to_ptr;
1819
- for (i = 0; i < to_count; ++i)
1820
- temporary_f32 = (nk_f32_t)from_buffers[i].f64c.real, nk_f32_to_e2m3_serial(&temporary_f32, &p[i]);
1821
- } break;
1822
- case nk_e3m2_k: {
1823
- nk_u8_t *p = (nk_u8_t *)to_ptr;
1824
- for (i = 0; i < to_count; ++i)
1825
- temporary_f32 = (nk_f32_t)from_buffers[i].f64c.real, nk_f32_to_e3m2_serial(&temporary_f32, &p[i]);
1826
- } break;
1827
- case nk_i64_k: {
1828
- nk_i64_t *p = (nk_i64_t *)to_ptr;
1829
- for (i = 0; i < to_count; ++i) nk_f64_to_i64_serial(&from_buffers[i].f64c.real, &p[i]);
1830
- } break;
1831
- case nk_i32_k: {
1832
- nk_i32_t *p = (nk_i32_t *)to_ptr;
1833
- for (i = 0; i < to_count; ++i) nk_f64_to_i32_serial(&from_buffers[i].f64c.real, &p[i]);
1834
- } break;
1835
- case nk_i16_k: {
1836
- nk_i16_t *p = (nk_i16_t *)to_ptr;
1837
- for (i = 0; i < to_count; ++i) nk_f64_to_i16_serial(&from_buffers[i].f64c.real, &p[i]);
1838
- } break;
1839
- case nk_i8_k: {
1840
- nk_i8_t *p = (nk_i8_t *)to_ptr;
1841
- for (i = 0; i < to_count; ++i) nk_f64_to_i8_serial(&from_buffers[i].f64c.real, &p[i]);
1842
- } break;
1843
- case nk_u64_k: {
1844
- nk_u64_t *p = (nk_u64_t *)to_ptr;
1845
- for (i = 0; i < to_count; ++i) nk_f64_to_u64_serial(&from_buffers[i].f64c.real, &p[i]);
1846
- } break;
1847
- case nk_u32_k: {
1848
- nk_u32_t *p = (nk_u32_t *)to_ptr;
1849
- for (i = 0; i < to_count; ++i) nk_f64_to_u32_serial(&from_buffers[i].f64c.real, &p[i]);
1850
- } break;
1851
- case nk_u16_k: {
1852
- nk_u16_t *p = (nk_u16_t *)to_ptr;
1853
- for (i = 0; i < to_count; ++i) nk_f64_to_u16_serial(&from_buffers[i].f64c.real, &p[i]);
1854
- } break;
1855
- case nk_u8_k: {
1856
- nk_u8_t *p = (nk_u8_t *)to_ptr;
1857
- for (i = 0; i < to_count; ++i) nk_f64_to_u8_serial(&from_buffers[i].f64c.real, &p[i]);
1858
- } break;
1859
- case nk_f64c_k: {
1860
- nk_f64c_t *p = (nk_f64c_t *)to_ptr;
1861
- for (i = 0; i < to_count; ++i) p[i] = from_buffers[i].f64c;
1862
- } break;
1863
- case nk_f32c_k: {
1864
- nk_f32c_t *p = (nk_f32c_t *)to_ptr;
1865
- for (i = 0; i < to_count; ++i)
1866
- p[i].real = (nk_f32_t)from_buffers[i].f64c.real, p[i].imag = (nk_f32_t)from_buffers[i].f64c.imag;
1867
- } break;
1868
- case nk_f16c_k: {
1869
- nk_f16c_t *p = (nk_f16c_t *)to_ptr;
1870
- for (i = 0; i < to_count; ++i) {
1871
- temporary_f32 = (nk_f32_t)from_buffers[i].f64c.real, nk_f32_to_f16_serial(&temporary_f32, &p[i].real);
1872
- temporary_f32 = (nk_f32_t)from_buffers[i].f64c.imag, nk_f32_to_f16_serial(&temporary_f32, &p[i].imag);
1873
- }
1874
- } break;
1875
- case nk_bf16c_k: {
1876
- nk_bf16c_t *p = (nk_bf16c_t *)to_ptr;
1877
- for (i = 0; i < to_count; ++i) {
1878
- temporary_f32 = (nk_f32_t)from_buffers[i].f64c.real, nk_f32_to_bf16_serial(&temporary_f32, &p[i].real);
1879
- temporary_f32 = (nk_f32_t)from_buffers[i].f64c.imag, nk_f32_to_bf16_serial(&temporary_f32, &p[i].imag);
1880
- }
1881
- } break;
1882
1958
  // Sub-byte: u1 - 8 bits to 1 byte, MSB-first, non-zero → 1
1883
1959
  case nk_u1_k: {
1884
1960
  nk_u8_t *p = (nk_u8_t *)to_ptr;
@@ -1890,32 +1966,38 @@ NK_INTERNAL void nk_scalar_buffers_export_f64c_( //
1890
1966
  case nk_i4_k: {
1891
1967
  nk_u8_t *p = (nk_u8_t *)to_ptr;
1892
1968
  for (i = 0; i < 4; ++i) {
1893
- nk_i64_t hi = (nk_i64_t)from_buffers[i * 2].f64c.real;
1894
- nk_i64_t lo = (nk_i64_t)from_buffers[i * 2 + 1].f64c.real;
1895
- hi = hi > 7 ? 7 : (hi < -8 ? -8 : hi);
1896
- lo = lo > 7 ? 7 : (lo < -8 ? -8 : lo);
1897
- p[i] = (nk_u8_t)(((hi & 0xF) << 4) | (lo & 0xF));
1969
+ nk_f64_t high = from_buffers[i * 2].f64c.real, low = from_buffers[i * 2 + 1].f64c.real;
1970
+ high = high > 7 ? 7 : (high < -8 ? -8 : high);
1971
+ low = low > 7 ? 7 : (low < -8 ? -8 : low);
1972
+ p[i] = (nk_u8_t)((((nk_i8_t)high & 0x0F) << 4) | ((nk_i8_t)low & 0x0F));
1898
1973
  }
1899
1974
  } break;
1900
1975
  // Sub-byte: u4 - 8 nibbles to 4 bytes, high nibble = even index
1901
1976
  case nk_u4_k: {
1902
1977
  nk_u8_t *p = (nk_u8_t *)to_ptr;
1903
1978
  for (i = 0; i < 4; ++i) {
1904
- nk_u64_t hi = (nk_u64_t)from_buffers[i * 2].f64c.real;
1905
- nk_u64_t lo = (nk_u64_t)from_buffers[i * 2 + 1].f64c.real;
1906
- hi = hi > 15 ? 15 : hi;
1907
- lo = lo > 15 ? 15 : lo;
1908
- p[i] = (nk_u8_t)((hi << 4) | lo);
1979
+ nk_f64_t high = from_buffers[i * 2].f64c.real, low = from_buffers[i * 2 + 1].f64c.real;
1980
+ high = high > 15 ? 15 : (high < 0 ? 0 : high);
1981
+ low = low > 15 ? 15 : (low < 0 ? 0 : low);
1982
+ p[i] = (nk_u8_t)(((nk_u8_t)high << 4) | (nk_u8_t)low);
1983
+ }
1984
+ } break;
1985
+ // All byte-or-larger types: convert, then store relevant bytes
1986
+ default: {
1987
+ nk_size_t stride = nk_dtype_bits(to_dtype) / NK_BITS_PER_BYTE;
1988
+ nk_scalar_buffer_t tmp;
1989
+ for (i = 0; i < to_count; ++i) {
1990
+ nk_scalar_buffer_from_f64c(&from_buffers[i].f64c, &tmp, to_dtype);
1991
+ nk_copy_bytes_((char *)to_ptr + i * stride, &tmp, stride);
1909
1992
  }
1910
1993
  } break;
1911
- default: break;
1912
1994
  }
1913
1995
  }
1914
1996
 
1915
1997
  /**
1916
1998
  * @brief Load 8 values from typed buffer into `buf[i].i64` (lossless widening for signed integers).
1917
1999
  */
1918
- NK_INTERNAL void nk_scalar_buffers_fill_i64_( //
2000
+ NK_INTERNAL void nk_scalar_buffers_to_i64_( //
1919
2001
  void const *from_ptr, nk_dtype_t from_dtype, nk_size_t from_count, //
1920
2002
  nk_scalar_buffer_t to_buffers[nk_at_least_(8)]) { //
1921
2003
  nk_size_t i;
@@ -1938,11 +2020,12 @@ NK_INTERNAL void nk_scalar_buffers_fill_i64_( //
1938
2020
  } break;
1939
2021
  // Sub-byte: i4 - 4 bytes to 8 nibbles, sign-extend each nibble
1940
2022
  case nk_i4_k: {
1941
- nk_u8_t const *p = (nk_u8_t const *)from_ptr;
2023
+ nk_i4x2_t const *pairs = (nk_i4x2_t const *)from_ptr;
1942
2024
  for (i = 0; i < 4; ++i) {
1943
- nk_i8_t hi = (nk_i8_t)(p[i] >> 4), lo = (nk_i8_t)(p[i] & 0xF);
1944
- to_buffers[i * 2].i64 = (hi ^ 8) - 8;
1945
- to_buffers[i * 2 + 1].i64 = (lo ^ 8) - 8;
2025
+ nk_i8_t unpacked[2];
2026
+ nk_i4x2_to_i8x2_serial(&pairs[i], unpacked);
2027
+ to_buffers[i * 2].i64 = unpacked[0];
2028
+ to_buffers[i * 2 + 1].i64 = unpacked[1];
1946
2029
  }
1947
2030
  } break;
1948
2031
  case nk_u64_k: {
@@ -1974,8 +2057,9 @@ NK_INTERNAL void nk_scalar_buffers_fill_i64_( //
1974
2057
 
1975
2058
  /**
1976
2059
  * @brief Export 8 `buf[i].i64` values to typed buffer with saturation on downcast.
2060
+ * @note Only handles integer and sub-byte targets. Float/complex targets are silently skipped.
1977
2061
  */
1978
- NK_INTERNAL void nk_scalar_buffers_export_i64_( //
2062
+ NK_INTERNAL void nk_scalar_buffers_from_i64_( //
1979
2063
  nk_scalar_buffer_t const from_buffers[nk_at_least_(8)], //
1980
2064
  void *to_ptr, nk_dtype_t to_dtype, nk_size_t to_count) { //
1981
2065
  nk_size_t i;
@@ -2015,12 +2099,12 @@ NK_INTERNAL void nk_scalar_buffers_export_i64_( //
2015
2099
  } break;
2016
2100
  // Sub-byte: i4 - 8 nibbles to 4 bytes, clamp [-8,7]
2017
2101
  case nk_i4_k: {
2018
- nk_u8_t *p = (nk_u8_t *)to_ptr;
2102
+ nk_i4x2_t *p = (nk_i4x2_t *)to_ptr;
2019
2103
  for (i = 0; i < 4; ++i) {
2020
- nk_i64_t hi = from_buffers[i * 2].i64, lo = from_buffers[i * 2 + 1].i64;
2021
- hi = hi > 7 ? 7 : (hi < -8 ? -8 : hi);
2022
- lo = lo > 7 ? 7 : (lo < -8 ? -8 : lo);
2023
- p[i] = (nk_u8_t)(((hi & 0xF) << 4) | (lo & 0xF));
2104
+ nk_i64_t high = from_buffers[i * 2].i64, low = from_buffers[i * 2 + 1].i64;
2105
+ high = high > 7 ? 7 : (high < -8 ? -8 : high);
2106
+ low = low > 7 ? 7 : (low < -8 ? -8 : low);
2107
+ p[i] = (nk_u8_t)(((high & 0xF) << 4) | (low & 0xF));
2024
2108
  }
2025
2109
  } break;
2026
2110
  default: break;
@@ -2030,7 +2114,7 @@ NK_INTERNAL void nk_scalar_buffers_export_i64_( //
2030
2114
  /**
2031
2115
  * @brief Load 8 values from typed buffer into `buf[i].u64` (lossless widening for unsigned integers).
2032
2116
  */
2033
- NK_INTERNAL void nk_scalar_buffers_fill_u64_( //
2117
+ NK_INTERNAL void nk_scalar_buffers_to_u64_( //
2034
2118
  void const *from_ptr, nk_dtype_t from_dtype, nk_size_t from_count, //
2035
2119
  nk_scalar_buffer_t to_buffers[nk_at_least_(8)]) { //
2036
2120
  nk_size_t i;
@@ -2053,10 +2137,12 @@ NK_INTERNAL void nk_scalar_buffers_fill_u64_( //
2053
2137
  } break;
2054
2138
  // Sub-byte: u4 - 4 bytes to 8 nibbles, zero-extend
2055
2139
  case nk_u4_k: {
2056
- nk_u8_t const *p = (nk_u8_t const *)from_ptr;
2140
+ nk_u4x2_t const *pairs = (nk_u4x2_t const *)from_ptr;
2057
2141
  for (i = 0; i < 4; ++i) {
2058
- to_buffers[i * 2].u64 = p[i] >> 4;
2059
- to_buffers[i * 2 + 1].u64 = p[i] & 0xF;
2142
+ nk_u8_t unpacked[2];
2143
+ nk_u4x2_to_u8x2_serial(&pairs[i], unpacked);
2144
+ to_buffers[i * 2].u64 = unpacked[0];
2145
+ to_buffers[i * 2 + 1].u64 = unpacked[1];
2060
2146
  }
2061
2147
  } break;
2062
2148
  // Sub-byte: u1 - 1 byte to 8 bits, MSB-first
@@ -2070,8 +2156,9 @@ NK_INTERNAL void nk_scalar_buffers_fill_u64_( //
2070
2156
 
2071
2157
  /**
2072
2158
  * @brief Export 8 `buf[i].u64` values to typed buffer with saturation on downcast.
2159
+ * @note Only handles integer and sub-byte targets. Float/complex targets are silently skipped.
2073
2160
  */
2074
- NK_INTERNAL void nk_scalar_buffers_export_u64_( //
2161
+ NK_INTERNAL void nk_scalar_buffers_from_u64_( //
2075
2162
  nk_scalar_buffer_t const from_buffers[nk_at_least_(8)], //
2076
2163
  void *to_ptr, nk_dtype_t to_dtype, nk_size_t to_count) { //
2077
2164
  nk_size_t i;
@@ -2111,12 +2198,12 @@ NK_INTERNAL void nk_scalar_buffers_export_u64_( //
2111
2198
  } break;
2112
2199
  // Sub-byte: u4 - 8 nibbles to 4 bytes, clamp [0,15]
2113
2200
  case nk_u4_k: {
2114
- nk_u8_t *p = (nk_u8_t *)to_ptr;
2201
+ nk_u4x2_t *p = (nk_u4x2_t *)to_ptr;
2115
2202
  for (i = 0; i < 4; ++i) {
2116
- nk_u64_t hi = from_buffers[i * 2].u64, lo = from_buffers[i * 2 + 1].u64;
2117
- hi = hi > 15 ? 15 : hi;
2118
- lo = lo > 15 ? 15 : lo;
2119
- p[i] = (nk_u8_t)((hi << 4) | lo);
2203
+ nk_u64_t high = from_buffers[i * 2].u64, low = from_buffers[i * 2 + 1].u64;
2204
+ high = high > 15 ? 15 : high;
2205
+ low = low > 15 ? 15 : low;
2206
+ p[i] = (nk_u8_t)((high << 4) | low);
2120
2207
  }
2121
2208
  } break;
2122
2209
  // Sub-byte: u1 - 8 bits to 1 byte, MSB-first, non-zero becomes 1
@@ -2130,9 +2217,24 @@ NK_INTERNAL void nk_scalar_buffers_export_u64_( //
2130
2217
  }
2131
2218
  }
2132
2219
 
2133
- #pragma endregion - Type Punned Loads and Stores
2220
+ /** @brief Widens a typed scalar from @p buf into @p result as f64 (discards imaginary part).
2221
+ * Safe when @p result aliases @p buf (in-place conversion). */
2222
+ NK_INTERNAL int nk_scalar_buffer_to_f64(nk_scalar_buffer_t const *buf, nk_dtype_t dtype, nk_f64_t *result) {
2223
+ nk_f64c_t temporary_f64c;
2224
+ int ok = nk_scalar_buffer_to_f64c(buf, dtype, &temporary_f64c);
2225
+ *result = temporary_f64c.real;
2226
+ return ok;
2227
+ }
2228
+
2229
+ /** @brief Narrows an f64 @p value into the appropriate typed member of @p buf.
2230
+ * Safe when @p value aliases @p buf (in-place: `buf->f64 = x; from_f64(&buf->f64, buf, dtype)`).
2231
+ * @note Integer targets go through f64 rounding — values beyond 2^53 may lose precision. */
2232
+ NK_INTERNAL int nk_scalar_buffer_from_f64(nk_f64_t const *value, nk_scalar_buffer_t *buf, nk_dtype_t dtype) {
2233
+ nk_f64c_t temporary_f64c = {*value, 0};
2234
+ return nk_scalar_buffer_from_f64c(&temporary_f64c, buf, dtype);
2235
+ }
2134
2236
 
2135
- #pragma region - Public API
2237
+ #pragma region Public API
2136
2238
 
2137
2239
  NK_PUBLIC void nk_cast_serial(void const *from, nk_dtype_t from_type, nk_size_t n, void *to, nk_dtype_t to_type) {
2138
2240
  if (from_type == to_type) {
@@ -2162,12 +2264,12 @@ NK_PUBLIC void nk_cast_serial(void const *from, nk_dtype_t from_type, nk_size_t
2162
2264
  // Both unsigned: u64 hub
2163
2265
  if (from_family == nk_dtype_family_uint_k && to_family == nk_dtype_family_uint_k) {
2164
2266
  for (nk_size_t b = 0; b < batches; ++b, src += from_step, dst += to_step) {
2165
- nk_scalar_buffers_fill_u64_(src, from_type, NK_BITS_PER_BYTE, bufs);
2166
- nk_scalar_buffers_export_u64_(bufs, dst, to_type, NK_BITS_PER_BYTE);
2267
+ nk_scalar_buffers_to_u64_(src, from_type, NK_BITS_PER_BYTE, bufs);
2268
+ nk_scalar_buffers_from_u64_(bufs, dst, to_type, NK_BITS_PER_BYTE);
2167
2269
  }
2168
2270
  if (tail) {
2169
- nk_scalar_buffers_fill_u64_(src, from_type, tail, bufs);
2170
- nk_scalar_buffers_export_u64_(bufs, dst, to_type, tail);
2271
+ nk_scalar_buffers_to_u64_(src, from_type, tail, bufs);
2272
+ nk_scalar_buffers_from_u64_(bufs, dst, to_type, tail);
2171
2273
  }
2172
2274
  return;
2173
2275
  }
@@ -2176,24 +2278,24 @@ NK_PUBLIC void nk_cast_serial(void const *from, nk_dtype_t from_type, nk_size_t
2176
2278
  if ((from_family == nk_dtype_family_int_k || from_family == nk_dtype_family_uint_k) &&
2177
2279
  (to_family == nk_dtype_family_int_k || to_family == nk_dtype_family_uint_k)) {
2178
2280
  for (nk_size_t b = 0; b < batches; ++b, src += from_step, dst += to_step) {
2179
- nk_scalar_buffers_fill_i64_(src, from_type, NK_BITS_PER_BYTE, bufs);
2180
- nk_scalar_buffers_export_i64_(bufs, dst, to_type, NK_BITS_PER_BYTE);
2281
+ nk_scalar_buffers_to_i64_(src, from_type, NK_BITS_PER_BYTE, bufs);
2282
+ nk_scalar_buffers_from_i64_(bufs, dst, to_type, NK_BITS_PER_BYTE);
2181
2283
  }
2182
2284
  if (tail) {
2183
- nk_scalar_buffers_fill_i64_(src, from_type, tail, bufs);
2184
- nk_scalar_buffers_export_i64_(bufs, dst, to_type, tail);
2285
+ nk_scalar_buffers_to_i64_(src, from_type, tail, bufs);
2286
+ nk_scalar_buffers_from_i64_(bufs, dst, to_type, tail);
2185
2287
  }
2186
2288
  return;
2187
2289
  }
2188
2290
 
2189
2291
  // Everything else: f64c hub (floats, complex, cross-category)
2190
2292
  for (nk_size_t b = 0; b < batches; ++b, src += from_step, dst += to_step) {
2191
- nk_scalar_buffers_fill_f64c_(src, from_type, NK_BITS_PER_BYTE, bufs);
2192
- nk_scalar_buffers_export_f64c_(bufs, dst, to_type, NK_BITS_PER_BYTE);
2293
+ nk_scalar_buffers_to_f64c_(src, from_type, NK_BITS_PER_BYTE, bufs);
2294
+ nk_scalar_buffers_from_f64c_(bufs, dst, to_type, NK_BITS_PER_BYTE);
2193
2295
  }
2194
2296
  if (tail) {
2195
- nk_scalar_buffers_fill_f64c_(src, from_type, tail, bufs);
2196
- nk_scalar_buffers_export_f64c_(bufs, dst, to_type, tail);
2297
+ nk_scalar_buffers_to_f64c_(src, from_type, tail, bufs);
2298
+ nk_scalar_buffers_from_f64c_(bufs, dst, to_type, tail);
2197
2299
  }
2198
2300
  }
2199
2301
 
@@ -2225,35 +2327,7 @@ NK_PUBLIC void nk_e3m2_to_bf16(nk_e3m2_t const *src, nk_bf16_t *dest) {
2225
2327
  nk_f32_to_bf16_serial(&temp, dest);
2226
2328
  }
2227
2329
 
2228
- /**
2229
- * @brief Convert i4 (4-bit signed integer, -8 to 7) to i8.
2230
- *
2231
- * Nibbles are packed: low nibble in bits [0:3], high nibble in bits [4:7].
2232
- * Sign extension: XOR with 8 then subtract 8 converts unsigned nibble to signed.
2233
- */
2234
- NK_PUBLIC void nk_i4_to_i8_serial_(nk_i4x2_t const *src, nk_i8_t *dest, nk_size_t count) {
2235
- nk_u8_t const *bytes = (nk_u8_t const *)src;
2236
- for (nk_size_t i = 0; i < count; ++i) {
2237
- nk_u8_t byte = bytes[i / 2];
2238
- nk_u8_t nibble = (i % 2 == 0) ? (byte & 0x0F) : (byte >> 4);
2239
- dest[i] = (nk_i8_t)((nibble ^ 8) - 8); // Sign extend: 0-7 → 0-7, 8-15 → -8 to -1
2240
- }
2241
- }
2242
-
2243
- /**
2244
- * @brief Convert u4 (4-bit unsigned integer, 0 to 15) to u8.
2245
- *
2246
- * Nibbles are packed: low nibble in bits [0:3], high nibble in bits [4:7].
2247
- */
2248
- NK_PUBLIC void nk_u4_to_u8_serial_(nk_u4x2_t const *src, nk_u8_t *dest, nk_size_t count) {
2249
- nk_u8_t const *bytes = (nk_u8_t const *)src;
2250
- for (nk_size_t i = 0; i < count; ++i) {
2251
- nk_u8_t byte = bytes[i / 2];
2252
- dest[i] = (i % 2 == 0) ? (byte & 0x0F) : (byte >> 4);
2253
- }
2254
- }
2255
-
2256
- #pragma endregion - Public API
2330
+ #pragma endregion Public API
2257
2331
 
2258
2332
  #if defined(__cplusplus)
2259
2333
  } // extern "C"