numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315):
  1. package/README.md +197 -124
  2. package/binding.gyp +34 -484
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -8,16 +8,16 @@
8
8
  *
9
9
  * @section dot_svehalf_instructions ARM SVE+FP16 Instructions
10
10
  *
11
- * Intrinsic Instruction Latency Throughput
12
- * svld1_f16 LD1H (Z.H, P/Z, [Xn]) 4-6cy 2/cy
13
- * svld2_f16 LD2H (Z.H, P/Z, [Xn]) 6-8cy 1/cy
14
- * svmla_f16_x FMLA (Z.H, P/M, Z.H, Z.H) 4cy 2/cy
15
- * svmls_f16_x FMLS (Z.H, P/M, Z.H, Z.H) 4cy 2/cy
16
- * svaddv_f16 FADDV (H, P, Z.H) 6cy 1/cy
17
- * svdup_f16 DUP (Z.H, #imm) 1cy 2/cy
18
- * svwhilelt_b16 WHILELT (P.H, Xn, Xm) 2cy 1/cy
19
- * svptrue_b16 PTRUE (P.H, pattern) 1cy 2/cy
20
- * svcnth CNTH (Xd) 1cy 2/cy
11
+ * Intrinsic Instruction V1
12
+ * svld1_f16 LD1H (Z.H, P/Z, [Xn]) 4-6cy @ 2p
13
+ * svld2_f16 LD2H (Z.H, P/Z, [Xn]) 6-8cy @ 1p
14
+ * svmla_f16_x FMLA (Z.H, P/M, Z.H, Z.H) 4cy @ 2p
15
+ * svmls_f16_x FMLS (Z.H, P/M, Z.H, Z.H) 4cy @ 2p
16
+ * svaddv_f16 FADDV (H, P, Z.H) 6cy @ 1p
17
+ * svdup_f16 DUP (Z.H, #imm) 1cy @ 2p
18
+ * svwhilelt_b16 WHILELT (P.H, Xn, Xm) 2cy @ 1p
19
+ * svptrue_b16 PTRUE (P.H, pattern) 1cy @ 2p
20
+ * svcnth CNTH (Xd) 1cy @ 2p
21
21
  *
22
22
  * SVE vector widths vary across implementations: Graviton3 uses 256-bit, while Graviton4/5
23
23
  * and Apple M4+ use 128-bit. Code using svcntb() adapts automatically, but wider vectors
@@ -51,13 +51,21 @@ NK_PUBLIC void nk_dot_f16_svehalf(nk_f16_t const *a_scalars, nk_f16_t const *b_s
51
51
  nk_size_t idx_scalars = 0;
52
52
  svfloat32_t ab_f32x = svdup_f32(0);
53
53
  do {
54
- svbool_t predicate_f32x = svwhilelt_b32_u64(idx_scalars, count_scalars);
55
- svfloat16_t a_f16x = svld1_f16(predicate_f32x, (nk_f16_for_arm_simd_t const *)(a_scalars) + idx_scalars);
56
- svfloat16_t b_f16x = svld1_f16(predicate_f32x, (nk_f16_for_arm_simd_t const *)(b_scalars) + idx_scalars);
57
- svfloat32_t a_f32x = svcvt_f32_f16_x(predicate_f32x, a_f16x);
58
- svfloat32_t b_f32x = svcvt_f32_f16_x(predicate_f32x, b_f16x);
59
- ab_f32x = svmla_f32_x(predicate_f32x, ab_f32x, a_f32x, b_f32x);
60
- idx_scalars += svcntw();
54
+ svbool_t predicate_b16x = svwhilelt_b16_u64(idx_scalars, count_scalars);
55
+ svfloat16_t a_f16x = svld1_f16(predicate_b16x, (nk_f16_for_arm_simd_t const *)(a_scalars) + idx_scalars);
56
+ svfloat16_t b_f16x = svld1_f16(predicate_b16x, (nk_f16_for_arm_simd_t const *)(b_scalars) + idx_scalars);
57
+ nk_size_t remaining = count_scalars - idx_scalars < svcnth() ? count_scalars - idx_scalars : svcnth();
58
+
59
+ // svcvt_f32_f16_x widens only even-indexed f16 elements; svext by 1 shifts odd into even.
60
+ svbool_t pred_even_b32x = svwhilelt_b32_u64(0u, (remaining + 1) / 2);
61
+ ab_f32x = svmla_f32_m(pred_even_b32x, ab_f32x, svcvt_f32_f16_x(pred_even_b32x, a_f16x),
62
+ svcvt_f32_f16_x(pred_even_b32x, b_f16x));
63
+
64
+ svbool_t pred_odd_b32x = svwhilelt_b32_u64(0u, remaining / 2);
65
+ ab_f32x = svmla_f32_m(pred_odd_b32x, ab_f32x, svcvt_f32_f16_x(pred_odd_b32x, svext_f16(a_f16x, a_f16x, 1)),
66
+ svcvt_f32_f16_x(pred_odd_b32x, svext_f16(b_f16x, b_f16x, 1)));
67
+
68
+ idx_scalars += svcnth();
61
69
  } while (idx_scalars < count_scalars);
62
70
  *result = svaddv_f32(svptrue_b32(), ab_f32x);
63
71
  }
@@ -68,18 +76,36 @@ NK_PUBLIC void nk_dot_f16c_svehalf(nk_f16c_t const *a_pairs, nk_f16c_t const *b_
68
76
  svfloat32_t ab_real_f32x = svdup_f32(0);
69
77
  svfloat32_t ab_imag_f32x = svdup_f32(0);
70
78
  do {
71
- svbool_t predicate_f32x = svwhilelt_b32_u64(idx_scalars, count_pairs);
72
- svfloat16x2_t a_f16x2 = svld2_f16(predicate_f32x, (nk_f16_for_arm_simd_t const *)(a_pairs) + idx_scalars * 2);
73
- svfloat16x2_t b_f16x2 = svld2_f16(predicate_f32x, (nk_f16_for_arm_simd_t const *)(b_pairs) + idx_scalars * 2);
74
- svfloat32_t a_real_f32x = svcvt_f32_f16_x(predicate_f32x, svget2_f16(a_f16x2, 0));
75
- svfloat32_t a_imag_f32x = svcvt_f32_f16_x(predicate_f32x, svget2_f16(a_f16x2, 1));
76
- svfloat32_t b_real_f32x = svcvt_f32_f16_x(predicate_f32x, svget2_f16(b_f16x2, 0));
77
- svfloat32_t b_imag_f32x = svcvt_f32_f16_x(predicate_f32x, svget2_f16(b_f16x2, 1));
78
- ab_real_f32x = svmla_f32_x(predicate_f32x, ab_real_f32x, a_real_f32x, b_real_f32x);
79
- ab_real_f32x = svmls_f32_x(predicate_f32x, ab_real_f32x, a_imag_f32x, b_imag_f32x);
80
- ab_imag_f32x = svmla_f32_x(predicate_f32x, ab_imag_f32x, a_real_f32x, b_imag_f32x);
81
- ab_imag_f32x = svmla_f32_x(predicate_f32x, ab_imag_f32x, a_imag_f32x, b_real_f32x);
82
- idx_scalars += svcntw();
79
+ svbool_t predicate_b16x = svwhilelt_b16_u64(idx_scalars, count_pairs);
80
+ svfloat16x2_t a_f16x2x = svld2_f16(predicate_b16x, (nk_f16_for_arm_simd_t const *)(a_pairs) + idx_scalars * 2);
81
+ svfloat16x2_t b_f16x2x = svld2_f16(predicate_b16x, (nk_f16_for_arm_simd_t const *)(b_pairs) + idx_scalars * 2);
82
+ svfloat16_t ar_f16x = svget2_f16(a_f16x2x, 0), ai_f16x = svget2_f16(a_f16x2x, 1);
83
+ svfloat16_t br_f16x = svget2_f16(b_f16x2x, 0), bi_f16x = svget2_f16(b_f16x2x, 1);
84
+ nk_size_t remaining = count_pairs - idx_scalars < svcnth() ? count_pairs - idx_scalars : svcnth();
85
+
86
+ // Even-indexed elements of each deinterleaved component
87
+ svbool_t pred_even_b32x = svwhilelt_b32_u64(0u, (remaining + 1) / 2);
88
+ svfloat32_t ar_even_f32x = svcvt_f32_f16_x(pred_even_b32x, ar_f16x);
89
+ svfloat32_t ai_even_f32x = svcvt_f32_f16_x(pred_even_b32x, ai_f16x);
90
+ svfloat32_t br_even_f32x = svcvt_f32_f16_x(pred_even_b32x, br_f16x);
91
+ svfloat32_t bi_even_f32x = svcvt_f32_f16_x(pred_even_b32x, bi_f16x);
92
+ ab_real_f32x = svmla_f32_m(pred_even_b32x, ab_real_f32x, ar_even_f32x, br_even_f32x);
93
+ ab_real_f32x = svmls_f32_m(pred_even_b32x, ab_real_f32x, ai_even_f32x, bi_even_f32x);
94
+ ab_imag_f32x = svmla_f32_m(pred_even_b32x, ab_imag_f32x, ar_even_f32x, bi_even_f32x);
95
+ ab_imag_f32x = svmla_f32_m(pred_even_b32x, ab_imag_f32x, ai_even_f32x, br_even_f32x);
96
+
97
+ // Odd-indexed elements via svext shift-by-1
98
+ svbool_t pred_odd_b32x = svwhilelt_b32_u64(0u, remaining / 2);
99
+ svfloat32_t ar_odd_f32x = svcvt_f32_f16_x(pred_odd_b32x, svext_f16(ar_f16x, ar_f16x, 1));
100
+ svfloat32_t ai_odd_f32x = svcvt_f32_f16_x(pred_odd_b32x, svext_f16(ai_f16x, ai_f16x, 1));
101
+ svfloat32_t br_odd_f32x = svcvt_f32_f16_x(pred_odd_b32x, svext_f16(br_f16x, br_f16x, 1));
102
+ svfloat32_t bi_odd_f32x = svcvt_f32_f16_x(pred_odd_b32x, svext_f16(bi_f16x, bi_f16x, 1));
103
+ ab_real_f32x = svmla_f32_m(pred_odd_b32x, ab_real_f32x, ar_odd_f32x, br_odd_f32x);
104
+ ab_real_f32x = svmls_f32_m(pred_odd_b32x, ab_real_f32x, ai_odd_f32x, bi_odd_f32x);
105
+ ab_imag_f32x = svmla_f32_m(pred_odd_b32x, ab_imag_f32x, ar_odd_f32x, bi_odd_f32x);
106
+ ab_imag_f32x = svmla_f32_m(pred_odd_b32x, ab_imag_f32x, ai_odd_f32x, br_odd_f32x);
107
+
108
+ idx_scalars += svcnth();
83
109
  } while (idx_scalars < count_pairs);
84
110
  results->real = svaddv_f32(svptrue_b32(), ab_real_f32x);
85
111
  results->imag = svaddv_f32(svptrue_b32(), ab_imag_f32x);
@@ -91,18 +117,36 @@ NK_PUBLIC void nk_vdot_f16c_svehalf(nk_f16c_t const *a_pairs, nk_f16c_t const *b
91
117
  svfloat32_t ab_real_f32x = svdup_f32(0);
92
118
  svfloat32_t ab_imag_f32x = svdup_f32(0);
93
119
  do {
94
- svbool_t predicate_f32x = svwhilelt_b32_u64(idx_scalars, count_pairs);
95
- svfloat16x2_t a_f16x2 = svld2_f16(predicate_f32x, (nk_f16_for_arm_simd_t const *)(a_pairs) + idx_scalars * 2);
96
- svfloat16x2_t b_f16x2 = svld2_f16(predicate_f32x, (nk_f16_for_arm_simd_t const *)(b_pairs) + idx_scalars * 2);
97
- svfloat32_t a_real_f32x = svcvt_f32_f16_x(predicate_f32x, svget2_f16(a_f16x2, 0));
98
- svfloat32_t a_imag_f32x = svcvt_f32_f16_x(predicate_f32x, svget2_f16(a_f16x2, 1));
99
- svfloat32_t b_real_f32x = svcvt_f32_f16_x(predicate_f32x, svget2_f16(b_f16x2, 0));
100
- svfloat32_t b_imag_f32x = svcvt_f32_f16_x(predicate_f32x, svget2_f16(b_f16x2, 1));
101
- ab_real_f32x = svmla_f32_x(predicate_f32x, ab_real_f32x, a_real_f32x, b_real_f32x);
102
- ab_real_f32x = svmla_f32_x(predicate_f32x, ab_real_f32x, a_imag_f32x, b_imag_f32x);
103
- ab_imag_f32x = svmla_f32_x(predicate_f32x, ab_imag_f32x, a_real_f32x, b_imag_f32x);
104
- ab_imag_f32x = svmls_f32_x(predicate_f32x, ab_imag_f32x, a_imag_f32x, b_real_f32x);
105
- idx_scalars += svcntw();
120
+ svbool_t predicate_b16x = svwhilelt_b16_u64(idx_scalars, count_pairs);
121
+ svfloat16x2_t a_f16x2x = svld2_f16(predicate_b16x, (nk_f16_for_arm_simd_t const *)(a_pairs) + idx_scalars * 2);
122
+ svfloat16x2_t b_f16x2x = svld2_f16(predicate_b16x, (nk_f16_for_arm_simd_t const *)(b_pairs) + idx_scalars * 2);
123
+ svfloat16_t ar_f16x = svget2_f16(a_f16x2x, 0), ai_f16x = svget2_f16(a_f16x2x, 1);
124
+ svfloat16_t br_f16x = svget2_f16(b_f16x2x, 0), bi_f16x = svget2_f16(b_f16x2x, 1);
125
+ nk_size_t remaining = count_pairs - idx_scalars < svcnth() ? count_pairs - idx_scalars : svcnth();
126
+
127
+ // Even-indexed elements
128
+ svbool_t pred_even_b32x = svwhilelt_b32_u64(0u, (remaining + 1) / 2);
129
+ svfloat32_t ar_even_f32x = svcvt_f32_f16_x(pred_even_b32x, ar_f16x);
130
+ svfloat32_t ai_even_f32x = svcvt_f32_f16_x(pred_even_b32x, ai_f16x);
131
+ svfloat32_t br_even_f32x = svcvt_f32_f16_x(pred_even_b32x, br_f16x);
132
+ svfloat32_t bi_even_f32x = svcvt_f32_f16_x(pred_even_b32x, bi_f16x);
133
+ ab_real_f32x = svmla_f32_m(pred_even_b32x, ab_real_f32x, ar_even_f32x, br_even_f32x);
134
+ ab_real_f32x = svmla_f32_m(pred_even_b32x, ab_real_f32x, ai_even_f32x, bi_even_f32x);
135
+ ab_imag_f32x = svmla_f32_m(pred_even_b32x, ab_imag_f32x, ar_even_f32x, bi_even_f32x);
136
+ ab_imag_f32x = svmls_f32_m(pred_even_b32x, ab_imag_f32x, ai_even_f32x, br_even_f32x);
137
+
138
+ // Odd-indexed elements via svext shift-by-1
139
+ svbool_t pred_odd_b32x = svwhilelt_b32_u64(0u, remaining / 2);
140
+ svfloat32_t ar_odd_f32x = svcvt_f32_f16_x(pred_odd_b32x, svext_f16(ar_f16x, ar_f16x, 1));
141
+ svfloat32_t ai_odd_f32x = svcvt_f32_f16_x(pred_odd_b32x, svext_f16(ai_f16x, ai_f16x, 1));
142
+ svfloat32_t br_odd_f32x = svcvt_f32_f16_x(pred_odd_b32x, svext_f16(br_f16x, br_f16x, 1));
143
+ svfloat32_t bi_odd_f32x = svcvt_f32_f16_x(pred_odd_b32x, svext_f16(bi_f16x, bi_f16x, 1));
144
+ ab_real_f32x = svmla_f32_m(pred_odd_b32x, ab_real_f32x, ar_odd_f32x, br_odd_f32x);
145
+ ab_real_f32x = svmla_f32_m(pred_odd_b32x, ab_real_f32x, ai_odd_f32x, bi_odd_f32x);
146
+ ab_imag_f32x = svmla_f32_m(pred_odd_b32x, ab_imag_f32x, ar_odd_f32x, bi_odd_f32x);
147
+ ab_imag_f32x = svmls_f32_m(pred_odd_b32x, ab_imag_f32x, ai_odd_f32x, br_odd_f32x);
148
+
149
+ idx_scalars += svcnth();
106
150
  } while (idx_scalars < count_pairs);
107
151
  results->real = svaddv_f32(svptrue_b32(), ab_real_f32x);
108
152
  results->imag = svaddv_f32(svptrue_b32(), ab_imag_f32x);
@@ -0,0 +1,89 @@
1
+ /**
2
+ * @brief SIMD-accelerated Dot Products for SVE SDOT.
3
+ * @file include/numkong/dot/svesdot.h
4
+ * @author Ash Vardanian
5
+ * @date April 3, 2026
6
+ *
7
+ * @sa include/numkong/dot.h
8
+ *
9
+ * @section dot_svesdot_instructions ARM SVE+DotProd Instructions
10
+ *
11
+ * Intrinsic Instruction V1
12
+ * svld1_s8 LD1B (Z.B, P/Z, [Xn]) 4-6cy @ 2p
13
+ * svld1_u8 LD1B (Z.B, P/Z, [Xn]) 4-6cy @ 2p
14
+ * svdot_s32 SDOT (Z.S, Z.B, Z.B) 3cy @ 2p
15
+ * svdot_u32 UDOT (Z.S, Z.B, Z.B) 3cy @ 2p
16
+ * svaddv_s32 SADDV (D, P, Z.S) 6cy @ 1p
17
+ * svaddv_u32 UADDV (D, P, Z.S) 6cy @ 1p
18
+ * svdup_s32 DUP (Z.S, #imm) 1cy @ 2p
19
+ * svwhilelt_b8 WHILELT (P.B, Xn, Xm) 2cy @ 1p
20
+ * svcntb CNTB (Xd) 1cy @ 2p
21
+ *
22
+ * SVE vector widths vary across implementations: Graviton3 uses 256-bit, while Graviton4/5
23
+ * and Apple M4+ use 128-bit. Code using svcntb() adapts automatically, but wider vectors
24
+ * process more elements per iteration with identical latencies.
25
+ *
26
+ * The SDOT/UDOT instructions fuse four int8 multiplications with int32 accumulation per lane,
27
+ * providing the same 4-way dot product as NEON SDOT but with scalable vector widths.
28
+ * On 256-bit SVE, this processes 32 int8 elements per instruction vs NEON's fixed 16.
29
+ */
30
+ #ifndef NK_DOT_SVESDOT_H
31
+ #define NK_DOT_SVESDOT_H
32
+
33
+ #if NK_TARGET_ARM_
34
+ #if NK_TARGET_SVESDOT
35
+
36
+ #include "numkong/types.h"
37
+
38
+ #if defined(__cplusplus)
39
+ extern "C" {
40
+ #endif
41
+
42
+ #if defined(__clang__)
43
+ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve+dotprod"))), apply_to = function)
44
+ #elif defined(__GNUC__)
45
+ #pragma GCC push_options
46
+ #pragma GCC target("arch=armv8.2-a+sve+dotprod")
47
+ #endif
48
+
49
+ NK_PUBLIC void nk_dot_i8_svesdot(nk_i8_t const *a_scalars, nk_i8_t const *b_scalars, nk_size_t count_scalars,
50
+ nk_i32_t *result) {
51
+ nk_size_t idx_scalars = 0;
52
+ svint32_t sum_i32x = svdup_s32(0);
53
+ do {
54
+ svbool_t predicate_b8x = svwhilelt_b8_u64(idx_scalars, count_scalars);
55
+ svint8_t a_i8x = svld1_s8(predicate_b8x, a_scalars + idx_scalars);
56
+ svint8_t b_i8x = svld1_s8(predicate_b8x, b_scalars + idx_scalars);
57
+ sum_i32x = svdot_s32(sum_i32x, a_i8x, b_i8x);
58
+ idx_scalars += svcntb();
59
+ } while (idx_scalars < count_scalars);
60
+ *result = (nk_i32_t)svaddv_s32(svptrue_b32(), sum_i32x);
61
+ }
62
+
63
+ NK_PUBLIC void nk_dot_u8_svesdot(nk_u8_t const *a_scalars, nk_u8_t const *b_scalars, nk_size_t count_scalars,
64
+ nk_u32_t *result) {
65
+ nk_size_t idx_scalars = 0;
66
+ svuint32_t sum_u32x = svdup_u32(0);
67
+ do {
68
+ svbool_t predicate_b8x = svwhilelt_b8_u64(idx_scalars, count_scalars);
69
+ svuint8_t a_u8x = svld1_u8(predicate_b8x, a_scalars + idx_scalars);
70
+ svuint8_t b_u8x = svld1_u8(predicate_b8x, b_scalars + idx_scalars);
71
+ sum_u32x = svdot_u32(sum_u32x, a_u8x, b_u8x);
72
+ idx_scalars += svcntb();
73
+ } while (idx_scalars < count_scalars);
74
+ *result = (nk_u32_t)svaddv_u32(svptrue_b32(), sum_u32x);
75
+ }
76
+
77
+ #if defined(__clang__)
78
+ #pragma clang attribute pop
79
+ #elif defined(__GNUC__)
80
+ #pragma GCC pop_options
81
+ #endif
82
+
83
+ #if defined(__cplusplus)
84
+ } // extern "C"
85
+ #endif
86
+
87
+ #endif // NK_TARGET_SVESDOT
88
+ #endif // NK_TARGET_ARM_
89
+ #endif // NK_DOT_SVESDOT_H
@@ -73,8 +73,8 @@ nk_dot_f32_v128relaxed_cycle:
73
73
  nk_load_b64_serial_(b_scalars, &b_f32_vec);
74
74
  a_scalars += 2, b_scalars += 2, count_scalars -= 2;
75
75
  }
76
- v128_t a_f32x2 = wasm_v128_load64_zero(&a_f32_vec.u64);
77
- v128_t b_f32x2 = wasm_v128_load64_zero(&b_f32_vec.u64);
76
+ v128_t a_f32x2 = wasm_i64x2_splat(a_f32_vec.u64);
77
+ v128_t b_f32x2 = wasm_i64x2_splat(b_f32_vec.u64);
78
78
  v128_t a_f64x2 = wasm_f64x2_promote_low_f32x4(a_f32x2);
79
79
  v128_t b_f64x2 = wasm_f64x2_promote_low_f32x4(b_f32x2);
80
80
  sum_f64x2 = wasm_f64x2_relaxed_madd(a_f64x2, b_f64x2, sum_f64x2);
@@ -110,24 +110,28 @@ nk_dot_f16_v128relaxed_cycle:
110
110
 
111
111
  NK_PUBLIC void nk_dot_bf16_v128relaxed(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result) {
112
112
  v128_t sum_f32x4 = wasm_f32x4_splat(0.0f);
113
+ v128_t mask_high_u32x4 = wasm_i32x4_splat((int)0xFFFF0000);
113
114
  nk_bf16_t const *a_scalars = a, *b_scalars = b;
114
115
  nk_size_t count_scalars = n;
115
- nk_b64_vec_t a_bf16_vec, b_bf16_vec;
116
+ nk_b128_vec_t a_bf16_vec, b_bf16_vec;
116
117
 
117
118
  nk_dot_bf16_v128relaxed_cycle:
118
- if (count_scalars < 4) {
119
- nk_partial_load_b16x4_serial_(a_scalars, &a_bf16_vec, count_scalars);
120
- nk_partial_load_b16x4_serial_(b_scalars, &b_bf16_vec, count_scalars);
119
+ if (count_scalars < 8) {
120
+ nk_partial_load_b16x8_serial_(a_scalars, &a_bf16_vec, count_scalars);
121
+ nk_partial_load_b16x8_serial_(b_scalars, &b_bf16_vec, count_scalars);
121
122
  count_scalars = 0;
122
123
  }
123
124
  else {
124
- nk_load_b64_serial_(a_scalars, &a_bf16_vec);
125
- nk_load_b64_serial_(b_scalars, &b_bf16_vec);
126
- a_scalars += 4, b_scalars += 4, count_scalars -= 4;
125
+ nk_load_b128_v128relaxed_(a_scalars, &a_bf16_vec);
126
+ nk_load_b128_v128relaxed_(b_scalars, &b_bf16_vec);
127
+ a_scalars += 8, b_scalars += 8, count_scalars -= 8;
127
128
  }
128
- nk_b128_vec_t a_f32_vec = nk_bf16x4_to_f32x4_v128relaxed_(a_bf16_vec);
129
- nk_b128_vec_t b_f32_vec = nk_bf16x4_to_f32x4_v128relaxed_(b_bf16_vec);
130
- sum_f32x4 = wasm_f32x4_relaxed_madd(a_f32_vec.v128, b_f32_vec.v128, sum_f32x4);
129
+ v128_t a_even_f32x4 = wasm_i32x4_shl(a_bf16_vec.v128, 16);
130
+ v128_t b_even_f32x4 = wasm_i32x4_shl(b_bf16_vec.v128, 16);
131
+ sum_f32x4 = wasm_f32x4_relaxed_madd(a_even_f32x4, b_even_f32x4, sum_f32x4);
132
+ v128_t a_odd_f32x4 = wasm_v128_and(a_bf16_vec.v128, mask_high_u32x4);
133
+ v128_t b_odd_f32x4 = wasm_v128_and(b_bf16_vec.v128, mask_high_u32x4);
134
+ sum_f32x4 = wasm_f32x4_relaxed_madd(a_odd_f32x4, b_odd_f32x4, sum_f32x4);
131
135
  if (count_scalars) goto nk_dot_bf16_v128relaxed_cycle;
132
136
 
133
137
  *result = nk_reduce_add_f32x4_v128relaxed_(sum_f32x4);
@@ -274,8 +278,8 @@ NK_PUBLIC void nk_dot_e2m3_v128relaxed(nk_e2m3_t const *a_scalars, nk_e2m3_t con
274
278
  // Result = i32_dot / 256.0f (exact, no rounding error).
275
279
  //
276
280
  // 32-entry LUT split into two 16-entry halves for wasm_i8x16_relaxed_swizzle (indexes 0-15).
277
- v128_t lut_lower_u8x16 = wasm_i8x16_const(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
278
- v128_t lut_upper_u8x16 = wasm_i8x16_const(32, 36, 40, 44, 48, 52, 56, 60, 64, 72, 80, 88, 96, 104, 112, 120);
281
+ v128_t lut_low_u8x16 = wasm_i8x16_const(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
282
+ v128_t lut_high_u8x16 = wasm_i8x16_const(32, 36, 40, 44, 48, 52, 56, 60, 64, 72, 80, 88, 96, 104, 112, 120);
279
283
  v128_t magnitude_mask_u8x16 = wasm_u8x16_splat(0x1F);
280
284
  v128_t nibble_mask_u8x16 = wasm_u8x16_splat(0x0F);
281
285
  v128_t half_select_u8x16 = wasm_u8x16_splat(0x10);
@@ -304,17 +308,17 @@ nk_dot_e2m3_v128relaxed_cycle:
304
308
 
305
309
  // Dual swizzle + bitselect for 32-entry LUT (a)
306
310
  v128_t a_shuffle_index_u8x16 = wasm_v128_and(a_magnitude_u8x16, nibble_mask_u8x16);
307
- v128_t a_lower_u8x16 = wasm_i8x16_relaxed_swizzle(lut_lower_u8x16, a_shuffle_index_u8x16);
308
- v128_t a_upper_u8x16 = wasm_i8x16_relaxed_swizzle(lut_upper_u8x16, a_shuffle_index_u8x16);
309
- v128_t a_upper_select_u8x16 = wasm_i8x16_eq(wasm_v128_and(a_magnitude_u8x16, half_select_u8x16), half_select_u8x16);
310
- v128_t a_unsigned_u8x16 = wasm_i8x16_relaxed_laneselect(a_upper_u8x16, a_lower_u8x16, a_upper_select_u8x16);
311
+ v128_t a_low_u8x16 = wasm_i8x16_relaxed_swizzle(lut_low_u8x16, a_shuffle_index_u8x16);
312
+ v128_t a_high_u8x16 = wasm_i8x16_relaxed_swizzle(lut_high_u8x16, a_shuffle_index_u8x16);
313
+ v128_t a_high_select_u8x16 = wasm_i8x16_eq(wasm_v128_and(a_magnitude_u8x16, half_select_u8x16), half_select_u8x16);
314
+ v128_t a_unsigned_u8x16 = wasm_i8x16_relaxed_laneselect(a_high_u8x16, a_low_u8x16, a_high_select_u8x16);
311
315
 
312
316
  // Dual swizzle + bitselect for 32-entry LUT (b)
313
317
  v128_t b_shuffle_index_u8x16 = wasm_v128_and(b_magnitude_u8x16, nibble_mask_u8x16);
314
- v128_t b_lower_u8x16 = wasm_i8x16_relaxed_swizzle(lut_lower_u8x16, b_shuffle_index_u8x16);
315
- v128_t b_upper_u8x16 = wasm_i8x16_relaxed_swizzle(lut_upper_u8x16, b_shuffle_index_u8x16);
316
- v128_t b_upper_select_u8x16 = wasm_i8x16_eq(wasm_v128_and(b_magnitude_u8x16, half_select_u8x16), half_select_u8x16);
317
- v128_t b_unsigned_u8x16 = wasm_i8x16_relaxed_laneselect(b_upper_u8x16, b_lower_u8x16, b_upper_select_u8x16);
318
+ v128_t b_low_u8x16 = wasm_i8x16_relaxed_swizzle(lut_low_u8x16, b_shuffle_index_u8x16);
319
+ v128_t b_high_u8x16 = wasm_i8x16_relaxed_swizzle(lut_high_u8x16, b_shuffle_index_u8x16);
320
+ v128_t b_high_select_u8x16 = wasm_i8x16_eq(wasm_v128_and(b_magnitude_u8x16, half_select_u8x16), half_select_u8x16);
321
+ v128_t b_unsigned_u8x16 = wasm_i8x16_relaxed_laneselect(b_high_u8x16, b_low_u8x16, b_high_select_u8x16);
318
322
 
319
323
  // Combined sign: (a ^ b) & 0x20 — nonzero means negative product
320
324
  // Apply sign to a (relaxed_dot wants i8 × u7: a_signed, b_unsigned)
@@ -343,12 +347,13 @@ NK_PUBLIC void nk_dot_e3m2_v128relaxed(nk_e3m2_t const *a_scalars, nk_e3m2_t con
343
347
  // Low-byte LUT entries (magnitude[i] & 0xFF):
344
348
  // [0,1,2,3,4,5,6,7,8,10,12,14,16,20,24,28] lower half
345
349
  // [32,40,48,56,64,80,96,112,128,160,192,224,0,64,128,192] upper half
346
- v128_t lut_lo_lower_u8x16 = wasm_i8x16_const(0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28);
347
- v128_t lut_lo_upper_u8x16 = wasm_u8x16_const(32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 0, 64, 128, 192);
350
+ v128_t lut_low_byte_first_u8x16 = wasm_i8x16_const(0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28);
351
+ v128_t lut_low_byte_second_u8x16 = wasm_u8x16_const(32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 0, 64, 128,
352
+ 192);
348
353
  v128_t magnitude_mask_u8x16 = wasm_u8x16_splat(0x1F);
349
354
  v128_t nibble_mask_u8x16 = wasm_u8x16_splat(0x0F);
350
355
  v128_t half_select_u8x16 = wasm_u8x16_splat(0x10);
351
- v128_t hi_threshold_u8x16 = wasm_u8x16_splat(28);
356
+ v128_t high_threshold_u8x16 = wasm_u8x16_splat(28);
352
357
  v128_t sign_mask_u8x16 = wasm_u8x16_splat(0x20);
353
358
  v128_t sum_i32x4 = wasm_i32x4_splat(0);
354
359
  v128_t a_e3m2_u8x16, b_e3m2_u8x16;
@@ -374,32 +379,34 @@ nk_dot_e3m2_v128relaxed_cycle:
374
379
 
375
380
  // Dual swizzle + bitselect for 32-entry low-byte LUT (a)
376
381
  v128_t a_shuffle_index_u8x16 = wasm_v128_and(a_magnitude_u8x16, nibble_mask_u8x16);
377
- v128_t a_lower_u8x16 = wasm_i8x16_relaxed_swizzle(lut_lo_lower_u8x16, a_shuffle_index_u8x16);
378
- v128_t a_upper_u8x16 = wasm_i8x16_relaxed_swizzle(lut_lo_upper_u8x16, a_shuffle_index_u8x16);
379
- v128_t a_upper_select_u8x16 = wasm_i8x16_eq(wasm_v128_and(a_magnitude_u8x16, half_select_u8x16), half_select_u8x16);
380
- v128_t a_lo_bytes_u8x16 = wasm_i8x16_relaxed_laneselect(a_upper_u8x16, a_lower_u8x16, a_upper_select_u8x16);
382
+ v128_t a_low_u8x16 = wasm_i8x16_relaxed_swizzle(lut_low_byte_first_u8x16, a_shuffle_index_u8x16);
383
+ v128_t a_high_u8x16 = wasm_i8x16_relaxed_swizzle(lut_low_byte_second_u8x16, a_shuffle_index_u8x16);
384
+ v128_t a_high_select_u8x16 = wasm_i8x16_eq(wasm_v128_and(a_magnitude_u8x16, half_select_u8x16), half_select_u8x16);
385
+ v128_t a_low_byte_u8x16 = wasm_i8x16_relaxed_laneselect(a_high_u8x16, a_low_u8x16, a_high_select_u8x16);
381
386
 
382
387
  // High byte is 1 iff magnitude index >= 28 (values 256, 320, 384, 448), else 0
383
- v128_t a_hi_bytes_u8x16 = wasm_v128_and(wasm_u8x16_ge(a_magnitude_u8x16, hi_threshold_u8x16), wasm_u8x16_splat(1));
388
+ v128_t a_high_byte_u8x16 = wasm_v128_and(wasm_u8x16_ge(a_magnitude_u8x16, high_threshold_u8x16),
389
+ wasm_u8x16_splat(1));
384
390
 
385
391
  // Dual swizzle + bitselect for 32-entry low-byte LUT (b)
386
392
  v128_t b_shuffle_index_u8x16 = wasm_v128_and(b_magnitude_u8x16, nibble_mask_u8x16);
387
- v128_t b_lower_u8x16 = wasm_i8x16_relaxed_swizzle(lut_lo_lower_u8x16, b_shuffle_index_u8x16);
388
- v128_t b_upper_u8x16 = wasm_i8x16_relaxed_swizzle(lut_lo_upper_u8x16, b_shuffle_index_u8x16);
389
- v128_t b_upper_select_u8x16 = wasm_i8x16_eq(wasm_v128_and(b_magnitude_u8x16, half_select_u8x16), half_select_u8x16);
390
- v128_t b_lo_bytes_u8x16 = wasm_i8x16_relaxed_laneselect(b_upper_u8x16, b_lower_u8x16, b_upper_select_u8x16);
393
+ v128_t b_low_u8x16 = wasm_i8x16_relaxed_swizzle(lut_low_byte_first_u8x16, b_shuffle_index_u8x16);
394
+ v128_t b_high_u8x16 = wasm_i8x16_relaxed_swizzle(lut_low_byte_second_u8x16, b_shuffle_index_u8x16);
395
+ v128_t b_high_select_u8x16 = wasm_i8x16_eq(wasm_v128_and(b_magnitude_u8x16, half_select_u8x16), half_select_u8x16);
396
+ v128_t b_low_byte_u8x16 = wasm_i8x16_relaxed_laneselect(b_high_u8x16, b_low_u8x16, b_high_select_u8x16);
391
397
 
392
398
  // High byte is 1 iff magnitude index >= 28
393
- v128_t b_hi_bytes_u8x16 = wasm_v128_and(wasm_u8x16_ge(b_magnitude_u8x16, hi_threshold_u8x16), wasm_u8x16_splat(1));
399
+ v128_t b_high_byte_u8x16 = wasm_v128_and(wasm_u8x16_ge(b_magnitude_u8x16, high_threshold_u8x16),
400
+ wasm_u8x16_splat(1));
394
401
 
395
402
  // Combine low and high bytes into i16 via byte interleave shuffle (little-endian: low byte first)
396
- v128_t a_unsigned_low_i16x8 = wasm_i8x16_shuffle(a_lo_bytes_u8x16, a_hi_bytes_u8x16, 0, 16, 1, 17, 2, 18, 3, 19, 4,
403
+ v128_t a_unsigned_low_i16x8 = wasm_i8x16_shuffle(a_low_byte_u8x16, a_high_byte_u8x16, 0, 16, 1, 17, 2, 18, 3, 19, 4,
397
404
  20, 5, 21, 6, 22, 7, 23);
398
- v128_t a_unsigned_high_i16x8 = wasm_i8x16_shuffle(a_lo_bytes_u8x16, a_hi_bytes_u8x16, 8, 24, 9, 25, 10, 26, 11, 27,
405
+ v128_t a_unsigned_high_i16x8 = wasm_i8x16_shuffle(a_low_byte_u8x16, a_high_byte_u8x16, 8, 24, 9, 25, 10, 26, 11, 27,
399
406
  12, 28, 13, 29, 14, 30, 15, 31);
400
- v128_t b_unsigned_low_i16x8 = wasm_i8x16_shuffle(b_lo_bytes_u8x16, b_hi_bytes_u8x16, 0, 16, 1, 17, 2, 18, 3, 19, 4,
407
+ v128_t b_unsigned_low_i16x8 = wasm_i8x16_shuffle(b_low_byte_u8x16, b_high_byte_u8x16, 0, 16, 1, 17, 2, 18, 3, 19, 4,
401
408
  20, 5, 21, 6, 22, 7, 23);
402
- v128_t b_unsigned_high_i16x8 = wasm_i8x16_shuffle(b_lo_bytes_u8x16, b_hi_bytes_u8x16, 8, 24, 9, 25, 10, 26, 11, 27,
409
+ v128_t b_unsigned_high_i16x8 = wasm_i8x16_shuffle(b_low_byte_u8x16, b_high_byte_u8x16, 8, 24, 9, 25, 10, 26, 11, 27,
403
410
  12, 28, 13, 29, 14, 30, 15, 31);
404
411
 
405
412
  // Combined sign: XOR sign bits, negate only b (saves ~15 ops vs independent negation)
@@ -497,6 +504,33 @@ NK_INTERNAL void nk_dot_through_f32x4_finalize_v128relaxed_( //
497
504
  result->f32s[3] = nk_reduce_add_f32x4_v128relaxed_(state_d->sum_f32x4);
498
505
  }
499
506
 
507
+ typedef struct nk_dot_through_f32x4_state_v128relaxed_t_ nk_dot_bf16x8_state_v128relaxed_t;
508
+
509
+ NK_INTERNAL void nk_dot_bf16x8_init_v128relaxed(nk_dot_bf16x8_state_v128relaxed_t *state) {
510
+ nk_dot_through_f32x4_init_v128relaxed_(state);
511
+ }
512
+
513
+ NK_INTERNAL void nk_dot_bf16x8_update_v128relaxed(nk_dot_bf16x8_state_v128relaxed_t *state, nk_b128_vec_t a,
514
+ nk_b128_vec_t b, nk_size_t depth_offset,
515
+ nk_size_t active_dimensions) {
516
+ nk_unused_(depth_offset);
517
+ nk_unused_(active_dimensions);
518
+ v128_t mask_high_u32x4 = wasm_i32x4_splat((int)0xFFFF0000);
519
+ v128_t a_even_f32x4 = wasm_i32x4_shl(a.v128, 16);
520
+ v128_t b_even_f32x4 = wasm_i32x4_shl(b.v128, 16);
521
+ state->sum_f32x4 = wasm_f32x4_relaxed_madd(a_even_f32x4, b_even_f32x4, state->sum_f32x4);
522
+ v128_t a_odd_f32x4 = wasm_v128_and(a.v128, mask_high_u32x4);
523
+ v128_t b_odd_f32x4 = wasm_v128_and(b.v128, mask_high_u32x4);
524
+ state->sum_f32x4 = wasm_f32x4_relaxed_madd(a_odd_f32x4, b_odd_f32x4, state->sum_f32x4);
525
+ }
526
+
527
+ NK_INTERNAL void nk_dot_bf16x8_finalize_v128relaxed( //
528
+ nk_dot_bf16x8_state_v128relaxed_t const *state_a, nk_dot_bf16x8_state_v128relaxed_t const *state_b, //
529
+ nk_dot_bf16x8_state_v128relaxed_t const *state_c, nk_dot_bf16x8_state_v128relaxed_t const *state_d, //
530
+ nk_size_t total_dimensions, nk_b128_vec_t *result) {
531
+ nk_dot_through_f32x4_finalize_v128relaxed_(state_a, state_b, state_c, state_d, total_dimensions, result);
532
+ }
533
+
500
534
  typedef struct nk_dot_f32x2_state_v128relaxed_t {
501
535
  v128_t sum_f64x2;
502
536
  } nk_dot_f32x2_state_v128relaxed_t;
@@ -509,8 +543,8 @@ NK_INTERNAL void nk_dot_f32x2_update_v128relaxed(nk_dot_f32x2_state_v128relaxed_
509
543
  nk_b64_vec_t b, nk_size_t depth_offset, nk_size_t active_dimensions) {
510
544
  nk_unused_(depth_offset);
511
545
  nk_unused_(active_dimensions);
512
- v128_t a_f32x2 = wasm_v128_load64_zero(&a.u64);
513
- v128_t b_f32x2 = wasm_v128_load64_zero(&b.u64);
546
+ v128_t a_f32x2 = wasm_i64x2_splat(a.u64);
547
+ v128_t b_f32x2 = wasm_i64x2_splat(b.u64);
514
548
  v128_t a_f64x2 = wasm_f64x2_promote_low_f32x4(a_f32x2);
515
549
  v128_t b_f64x2 = wasm_f64x2_promote_low_f32x4(b_f32x2);
516
550
  state->sum_f64x2 = wasm_f64x2_relaxed_madd(a_f64x2, b_f64x2, state->sum_f64x2);
@@ -603,12 +637,12 @@ NK_INTERNAL void nk_dot_i8x16_update_v128relaxed(nk_dot_i8x16_state_v128relaxed_
603
637
  nk_b128_vec_t b, nk_size_t depth_offset, nk_size_t active_dimensions) {
604
638
  nk_unused_(depth_offset);
605
639
  nk_unused_(active_dimensions);
606
- // Bit-split: b = b_lo + (-128)·b_hi where b_lo = b & 0x7F ∈ [0,127], b_hi = b >> 7 ∈ {0,1}
607
- // So a·b = a·b_lo − 128·a·b_hi, both operands fit i7 for relaxed_dot
608
- v128_t b_lo_u8x16 = wasm_v128_and(b.v128, wasm_i8x16_splat(0x7F));
609
- v128_t b_hi_u8x16 = wasm_u8x16_shr(b.v128, 7);
610
- state->product_sum_i32x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(a.v128, b_lo_u8x16, state->product_sum_i32x4);
611
- state->negative_sum_a_i32x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(a.v128, b_hi_u8x16,
640
+ // Bit-split: b = b_low + (-128)·b_high where b_low = b & 0x7F ∈ [0,127], b_high = b >> 7 ∈ {0,1}
641
+ // So a·b = a·b_low − 128·a·b_high, both operands fit i7 for relaxed_dot
642
+ v128_t b_low_u8x16 = wasm_v128_and(b.v128, wasm_i8x16_splat(0x7F));
643
+ v128_t b_high_u8x16 = wasm_u8x16_shr(b.v128, 7);
644
+ state->product_sum_i32x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(a.v128, b_low_u8x16, state->product_sum_i32x4);
645
+ state->negative_sum_a_i32x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(a.v128, b_high_u8x16,
612
646
  state->negative_sum_a_i32x4);
613
647
  }
614
648
 
@@ -629,28 +663,29 @@ NK_INTERNAL void nk_dot_i8x16_finalize_v128relaxed(
629
663
  }
630
664
 
631
665
  typedef struct nk_dot_u8x16_state_v128relaxed_t {
632
- v128_t product_lo_i32x4; // relaxed_dot(a_signed, b_lo) accumulator
633
- v128_t product_hi_i32x4; // relaxed_dot(a_signed, b_hi) accumulator
666
+ v128_t product_low_i32x4; // relaxed_dot(a_signed, b_low) accumulator
667
+ v128_t product_high_i32x4; // relaxed_dot(a_signed, b_high) accumulator
634
668
  } nk_dot_u8x16_state_v128relaxed_t;
635
669
 
636
670
  NK_INTERNAL void nk_dot_u8x16_init_v128relaxed(nk_dot_u8x16_state_v128relaxed_t *state) {
637
- state->product_lo_i32x4 = wasm_i32x4_splat(0);
638
- state->product_hi_i32x4 = wasm_i32x4_splat(0);
671
+ state->product_low_i32x4 = wasm_i32x4_splat(0);
672
+ state->product_high_i32x4 = wasm_i32x4_splat(0);
639
673
  }
640
674
 
641
675
  NK_INTERNAL void nk_dot_u8x16_update_v128relaxed(nk_dot_u8x16_state_v128relaxed_t *state, nk_b128_vec_t a,
642
676
  nk_b128_vec_t b, nk_size_t depth_offset, nk_size_t active_dimensions) {
643
677
  nk_unused_(depth_offset);
644
678
  nk_unused_(active_dimensions);
645
- // Bit-split b: b = b_lo + 128·b_hi, with a_signed = a ^ 0x80 = a - 128 (reinterpret u8 as i8)
646
- // Σ a·b = Σ(a_signed+128)·(b_lo+128·b_hi) = relaxed_dot(a_signed,b_lo) + 128·relaxed_dot(a_signed,b_hi) + 128·Σb
679
+ // Bit-split b: b = b_low + 128·b_high, with a_signed = a ^ 0x80 = a - 128 (reinterpret u8 as i8)
680
+ // Σ a·b = Σ(a_signed+128)·(b_lo+128·b_high) = relaxed_dot(a_signed,b_low) + 128·relaxed_dot(a_signed,b_high) +
681
+ // 128·Σb
647
682
  v128_t a_signed_i8x16 = wasm_v128_xor(a.v128, wasm_i8x16_splat((signed char)0x80));
648
- v128_t b_lo_u8x16 = wasm_v128_and(b.v128, wasm_i8x16_splat(0x7F));
649
- v128_t b_hi_u8x16 = wasm_u8x16_shr(b.v128, 7);
650
- state->product_lo_i32x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(a_signed_i8x16, b_lo_u8x16,
651
- state->product_lo_i32x4);
652
- state->product_hi_i32x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(a_signed_i8x16, b_hi_u8x16,
653
- state->product_hi_i32x4);
683
+ v128_t b_low_u8x16 = wasm_v128_and(b.v128, wasm_i8x16_splat(0x7F));
684
+ v128_t b_high_u8x16 = wasm_u8x16_shr(b.v128, 7);
685
+ state->product_low_i32x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(a_signed_i8x16, b_low_u8x16,
686
+ state->product_low_i32x4);
687
+ state->product_high_i32x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(a_signed_i8x16, b_high_u8x16,
688
+ state->product_high_i32x4);
654
689
  }
655
690
 
656
691
  NK_INTERNAL void nk_dot_u8x16_finalize_v128relaxed( //
@@ -659,17 +694,17 @@ NK_INTERNAL void nk_dot_u8x16_finalize_v128relaxed(
659
694
  nk_size_t total_dimensions, nk_u32_t a_sum, nk_b128_vec_t b_sums, nk_b128_vec_t *result) {
660
695
  nk_unused_(a_sum);
661
696
  // Σ a·b = reduce(lo) + 128·reduce(hi) + 128·Σb
662
- result->u32s[0] = (nk_u32_t)(nk_reduce_add_i32x4_v128relaxed_(state_a->product_lo_i32x4) +
663
- 128 * nk_reduce_add_i32x4_v128relaxed_(state_a->product_hi_i32x4) +
697
+ result->u32s[0] = (nk_u32_t)(nk_reduce_add_i32x4_v128relaxed_(state_a->product_low_i32x4) +
698
+ 128 * nk_reduce_add_i32x4_v128relaxed_(state_a->product_high_i32x4) +
664
699
  128 * (nk_i32_t)b_sums.u32s[0]);
665
- result->u32s[1] = (nk_u32_t)(nk_reduce_add_i32x4_v128relaxed_(state_b->product_lo_i32x4) +
666
- 128 * nk_reduce_add_i32x4_v128relaxed_(state_b->product_hi_i32x4) +
700
+ result->u32s[1] = (nk_u32_t)(nk_reduce_add_i32x4_v128relaxed_(state_b->product_low_i32x4) +
701
+ 128 * nk_reduce_add_i32x4_v128relaxed_(state_b->product_high_i32x4) +
667
702
  128 * (nk_i32_t)b_sums.u32s[1]);
668
- result->u32s[2] = (nk_u32_t)(nk_reduce_add_i32x4_v128relaxed_(state_c->product_lo_i32x4) +
669
- 128 * nk_reduce_add_i32x4_v128relaxed_(state_c->product_hi_i32x4) +
703
+ result->u32s[2] = (nk_u32_t)(nk_reduce_add_i32x4_v128relaxed_(state_c->product_low_i32x4) +
704
+ 128 * nk_reduce_add_i32x4_v128relaxed_(state_c->product_high_i32x4) +
670
705
  128 * (nk_i32_t)b_sums.u32s[2]);
671
- result->u32s[3] = (nk_u32_t)(nk_reduce_add_i32x4_v128relaxed_(state_d->product_lo_i32x4) +
672
- 128 * nk_reduce_add_i32x4_v128relaxed_(state_d->product_hi_i32x4) +
706
+ result->u32s[3] = (nk_u32_t)(nk_reduce_add_i32x4_v128relaxed_(state_d->product_low_i32x4) +
707
+ 128 * nk_reduce_add_i32x4_v128relaxed_(state_d->product_high_i32x4) +
673
708
  128 * (nk_i32_t)b_sums.u32s[3]);
674
709
  }
675
710
 
@@ -706,8 +741,8 @@ NK_INTERNAL void nk_dot_e2m3x16_update_v128relaxed(nk_dot_e2m3x16_state_v128rela
706
741
  nk_unused_(depth_offset);
707
742
  nk_unused_(active_dimensions);
708
743
  // Same LUT-based approach as 1:1 dot, accumulating into state
709
- v128_t lut_lower_u8x16 = wasm_i8x16_const(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
710
- v128_t lut_upper_u8x16 = wasm_i8x16_const(32, 36, 40, 44, 48, 52, 56, 60, 64, 72, 80, 88, 96, 104, 112, 120);
744
+ v128_t lut_low_u8x16 = wasm_i8x16_const(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
745
+ v128_t lut_high_u8x16 = wasm_i8x16_const(32, 36, 40, 44, 48, 52, 56, 60, 64, 72, 80, 88, 96, 104, 112, 120);
711
746
  v128_t magnitude_mask_u8x16 = wasm_u8x16_splat(0x1F);
712
747
  v128_t nibble_mask_u8x16 = wasm_u8x16_splat(0x0F);
713
748
  v128_t half_select_u8x16 = wasm_u8x16_splat(0x10);
@@ -719,17 +754,17 @@ NK_INTERNAL void nk_dot_e2m3x16_update_v128relaxed(nk_dot_e2m3x16_state_v128rela
719
754
 
720
755
  // Dual swizzle LUT for a
721
756
  v128_t a_idx_u8x16 = wasm_v128_and(a_mag_u8x16, nibble_mask_u8x16);
722
- v128_t a_lo_u8x16 = wasm_i8x16_relaxed_swizzle(lut_lower_u8x16, a_idx_u8x16);
723
- v128_t a_hi_u8x16 = wasm_i8x16_relaxed_swizzle(lut_upper_u8x16, a_idx_u8x16);
757
+ v128_t a_low_u8x16 = wasm_i8x16_relaxed_swizzle(lut_low_u8x16, a_idx_u8x16);
758
+ v128_t a_high_u8x16 = wasm_i8x16_relaxed_swizzle(lut_high_u8x16, a_idx_u8x16);
724
759
  v128_t a_sel_u8x16 = wasm_i8x16_eq(wasm_v128_and(a_mag_u8x16, half_select_u8x16), half_select_u8x16);
725
- v128_t a_unsigned_u8x16 = wasm_i8x16_relaxed_laneselect(a_hi_u8x16, a_lo_u8x16, a_sel_u8x16);
760
+ v128_t a_unsigned_u8x16 = wasm_i8x16_relaxed_laneselect(a_high_u8x16, a_low_u8x16, a_sel_u8x16);
726
761
 
727
762
  // Dual swizzle LUT for b
728
763
  v128_t b_idx_u8x16 = wasm_v128_and(b_mag_u8x16, nibble_mask_u8x16);
729
- v128_t b_lo_u8x16 = wasm_i8x16_relaxed_swizzle(lut_lower_u8x16, b_idx_u8x16);
730
- v128_t b_hi_u8x16 = wasm_i8x16_relaxed_swizzle(lut_upper_u8x16, b_idx_u8x16);
764
+ v128_t b_low_u8x16 = wasm_i8x16_relaxed_swizzle(lut_low_u8x16, b_idx_u8x16);
765
+ v128_t b_high_u8x16 = wasm_i8x16_relaxed_swizzle(lut_high_u8x16, b_idx_u8x16);
731
766
  v128_t b_sel_u8x16 = wasm_i8x16_eq(wasm_v128_and(b_mag_u8x16, half_select_u8x16), half_select_u8x16);
732
- v128_t b_unsigned_u8x16 = wasm_i8x16_relaxed_laneselect(b_hi_u8x16, b_lo_u8x16, b_sel_u8x16);
767
+ v128_t b_unsigned_u8x16 = wasm_i8x16_relaxed_laneselect(b_high_u8x16, b_low_u8x16, b_sel_u8x16);
733
768
 
734
769
  // Combined sign → apply to a (relaxed_dot wants i8 × u7)
735
770
  v128_t sign_u8x16 = wasm_v128_and(wasm_v128_xor(a.v128, b.v128), sign_mask_u8x16);
@@ -770,8 +805,8 @@ NK_INTERNAL void nk_dot_e3m2x16_update_v128relaxed(nk_dot_e3m2x16_state_v128rela
770
805
  // ×4 scaled LUT — all values ≤ 112, fits u7 for relaxed_dot
771
806
  // Indices 0-11 rounded to nearest integer (max error ±0.5 in ×4 domain = ±0.125 in value)
772
807
  // Indices 12-31 exact
773
- v128_t lut_lower_u8x16 = wasm_i8x16_const(0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 4, 4, 5, 6, 7);
774
- v128_t lut_upper_u8x16 = wasm_i8x16_const(8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112);
808
+ v128_t lut_low_u8x16 = wasm_i8x16_const(0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 4, 4, 5, 6, 7);
809
+ v128_t lut_high_u8x16 = wasm_i8x16_const(8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112);
775
810
  v128_t magnitude_mask_u8x16 = wasm_u8x16_splat(0x1F);
776
811
  v128_t nibble_mask_u8x16 = wasm_u8x16_splat(0x0F);
777
812
  v128_t half_select_u8x16 = wasm_u8x16_splat(0x10);
@@ -782,17 +817,17 @@ NK_INTERNAL void nk_dot_e3m2x16_update_v128relaxed(nk_dot_e3m2x16_state_v128rela
782
817
 
783
818
  // Dual swizzle LUT for a
784
819
  v128_t a_idx_u8x16 = wasm_v128_and(a_mag_u8x16, nibble_mask_u8x16);
785
- v128_t a_lo_u8x16 = wasm_i8x16_relaxed_swizzle(lut_lower_u8x16, a_idx_u8x16);
786
- v128_t a_hi_u8x16 = wasm_i8x16_relaxed_swizzle(lut_upper_u8x16, a_idx_u8x16);
820
+ v128_t a_low_u8x16 = wasm_i8x16_relaxed_swizzle(lut_low_u8x16, a_idx_u8x16);
821
+ v128_t a_high_u8x16 = wasm_i8x16_relaxed_swizzle(lut_high_u8x16, a_idx_u8x16);
787
822
  v128_t a_sel_u8x16 = wasm_i8x16_eq(wasm_v128_and(a_mag_u8x16, half_select_u8x16), half_select_u8x16);
788
- v128_t a_unsigned_u8x16 = wasm_i8x16_relaxed_laneselect(a_hi_u8x16, a_lo_u8x16, a_sel_u8x16);
823
+ v128_t a_unsigned_u8x16 = wasm_i8x16_relaxed_laneselect(a_high_u8x16, a_low_u8x16, a_sel_u8x16);
789
824
 
790
825
  // Dual swizzle LUT for b
791
826
  v128_t b_idx_u8x16 = wasm_v128_and(b_mag_u8x16, nibble_mask_u8x16);
792
- v128_t b_lo_u8x16 = wasm_i8x16_relaxed_swizzle(lut_lower_u8x16, b_idx_u8x16);
793
- v128_t b_hi_u8x16 = wasm_i8x16_relaxed_swizzle(lut_upper_u8x16, b_idx_u8x16);
827
+ v128_t b_low_u8x16 = wasm_i8x16_relaxed_swizzle(lut_low_u8x16, b_idx_u8x16);
828
+ v128_t b_high_u8x16 = wasm_i8x16_relaxed_swizzle(lut_high_u8x16, b_idx_u8x16);
794
829
  v128_t b_sel_u8x16 = wasm_i8x16_eq(wasm_v128_and(b_mag_u8x16, half_select_u8x16), half_select_u8x16);
795
- v128_t b_unsigned_u8x16 = wasm_i8x16_relaxed_laneselect(b_hi_u8x16, b_lo_u8x16, b_sel_u8x16);
830
+ v128_t b_unsigned_u8x16 = wasm_i8x16_relaxed_laneselect(b_high_u8x16, b_low_u8x16, b_sel_u8x16);
796
831
 
797
832
  // Combined sign → apply to a (relaxed_dot wants i8 × u7)
798
833
  v128_t sign_u8x16 = wasm_v128_and(wasm_v128_xor(a.v128, b.v128), sign_mask_u8x16);
@@ -1233,13 +1268,13 @@ NK_INTERNAL void nk_dot_u1x128_finalize_v128relaxed(
1233
1268
  v128_t a_u32x4 = state_a->dot_count_u32x4, b_u32x4 = state_b->dot_count_u32x4;
1234
1269
  v128_t c_u32x4 = state_c->dot_count_u32x4, d_u32x4 = state_d->dot_count_u32x4;
1235
1270
  // Step 1: interleave pairs
1236
- v128_t ab_lo_u32x4 = wasm_i32x4_shuffle(a_u32x4, b_u32x4, 0, 4, 1, 5); // a0 b0 a1 b1
1237
- v128_t ab_hi_u32x4 = wasm_i32x4_shuffle(a_u32x4, b_u32x4, 2, 6, 3, 7); // a2 b2 a3 b3
1238
- v128_t cd_lo_u32x4 = wasm_i32x4_shuffle(c_u32x4, d_u32x4, 0, 4, 1, 5); // c0 d0 c1 d1
1239
- v128_t cd_hi_u32x4 = wasm_i32x4_shuffle(c_u32x4, d_u32x4, 2, 6, 3, 7); // c2 d2 c3 d3
1271
+ v128_t ab_low_u32x4 = wasm_i32x4_shuffle(a_u32x4, b_u32x4, 0, 4, 1, 5); // a0 b0 a1 b1
1272
+ v128_t ab_high_u32x4 = wasm_i32x4_shuffle(a_u32x4, b_u32x4, 2, 6, 3, 7); // a2 b2 a3 b3
1273
+ v128_t cd_low_u32x4 = wasm_i32x4_shuffle(c_u32x4, d_u32x4, 0, 4, 1, 5); // c0 d0 c1 d1
1274
+ v128_t cd_high_u32x4 = wasm_i32x4_shuffle(c_u32x4, d_u32x4, 2, 6, 3, 7); // c2 d2 c3 d3
1240
1275
  // Step 2: pairwise add
1241
- v128_t sum_02_u32x4 = wasm_i32x4_add(ab_lo_u32x4, ab_hi_u32x4); // a02 b02 a13 b13
1242
- v128_t sum_13_u32x4 = wasm_i32x4_add(cd_lo_u32x4, cd_hi_u32x4); // c02 d02 c13 d13
1276
+ v128_t sum_02_u32x4 = wasm_i32x4_add(ab_low_u32x4, ab_high_u32x4); // a02 b02 a13 b13
1277
+ v128_t sum_13_u32x4 = wasm_i32x4_add(cd_low_u32x4, cd_high_u32x4); // c02 d02 c13 d13
1243
1278
  // Step 3: final interleave
1244
1279
  v128_t even_u32x4 = wasm_i32x4_shuffle(sum_02_u32x4, sum_13_u32x4, 0, 1, 4, 5);
1245
1280
  v128_t odd_u32x4 = wasm_i32x4_shuffle(sum_02_u32x4, sum_13_u32x4, 2, 3, 6, 7);