numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315)
  1. package/README.md +197 -124
  2. package/binding.gyp +34 -484
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -8,12 +8,12 @@
8
8
  *
9
9
  * @section dot_haswell_instructions Key AVX2/FMA Dot Product Instructions
10
10
  *
11
- * Intrinsic Instruction Latency Throughput Ports
12
- * _mm256_fmadd_ps/pd VFMADD (YMM, YMM, YMM) 5cy 0.5/cy p01
13
- * _mm256_mul_ps/pd VMULPS/PD (YMM, YMM, YMM) 5cy 0.5/cy p01
14
- * _mm256_add_ps/pd VADDPS/PD (YMM, YMM, YMM) 3cy 1/cy p01
15
- * _mm256_cvtph_ps VCVTPH2PS (YMM, XMM) 5cy 1/cy p01
16
- * _mm256_cvtps_pd VCVTPS2PD (YMM, XMM) 2cy 1/cy p01
11
+ * Intrinsic Instruction Haswell Genoa
12
+ * _mm256_fmadd_ps/pd VFMADD (YMM, YMM, YMM) 5cy @ p01 4cy @ p01
13
+ * _mm256_mul_ps/pd VMULPS/PD (YMM, YMM, YMM) 5cy @ p01 3cy @ p01
14
+ * _mm256_add_ps/pd VADDPS/PD (YMM, YMM, YMM) 3cy @ p01 3cy @ p23
15
+ * _mm256_cvtph_ps VCVTPH2PS (YMM, XMM) 5cy @ p01 4cy @ p12+p23
16
+ * _mm256_cvtps_pd VCVTPS2PD (YMM, XMM) 2cy @ p01 4cy @ p12+p23
17
17
  *
18
18
  * For small numeric types (F16, BF16, E4M3, E5M2) we use F32 accumulators. For F32 dot products,
19
19
  * upcasting to F64 and downcasting back is faster than stable summation algorithms. For F64 we
@@ -141,7 +141,7 @@ NK_INTERNAL nk_f64_t nk_dot_stable_sum_f64x4_haswell_(__m256d sum_f64x4, __m256d
141
141
  return tentative_sum + (lower_error + upper_error + rounding_error);
142
142
  }
143
143
 
144
- #pragma region - Traditional Floats
144
+ #pragma region F32 and F64 Floats
145
145
 
146
146
  NK_PUBLIC void nk_dot_f32_haswell(nk_f32_t const *a_scalars, nk_f32_t const *b_scalars, nk_size_t count_scalars,
147
147
  nk_f64_t *result) {
@@ -479,30 +479,35 @@ NK_INTERNAL void nk_dot_f32x4_finalize_haswell(
479
479
  result->ymm_pd = sum_abcd_f64x4;
480
480
  }
481
481
 
482
- #pragma endregion - Traditional Floats
482
+ #pragma endregion F32 and F64 Floats
483
483
 
484
- #pragma region - Smaller Floats
484
+ #pragma region F16 and BF16 Floats
485
485
 
486
486
  NK_PUBLIC void nk_dot_bf16_haswell(nk_bf16_t const *a_scalars, nk_bf16_t const *b_scalars, nk_size_t count_scalars,
487
487
  nk_f32_t *result) {
488
- __m128i a_bf16x8, b_bf16x8;
488
+ __m256i a_bf16_i16x16, b_bf16_i16x16;
489
489
  __m256 sum_f32x8 = _mm256_setzero_ps();
490
+ __m256i mask_high_u32x8 = _mm256_set1_epi32((int)0xFFFF0000);
490
491
  nk_dot_bf16_haswell_cycle:
491
- if (count_scalars < 8) {
492
+ if (count_scalars < 16) {
492
493
  nk_b256_vec_t a_vec, b_vec;
493
494
  nk_partial_load_b16x16_serial_(a_scalars, &a_vec, count_scalars);
494
495
  nk_partial_load_b16x16_serial_(b_scalars, &b_vec, count_scalars);
495
- a_bf16x8 = a_vec.xmms[0];
496
- b_bf16x8 = b_vec.xmms[0];
496
+ a_bf16_i16x16 = a_vec.ymm;
497
+ b_bf16_i16x16 = b_vec.ymm;
497
498
  count_scalars = 0;
498
499
  }
499
500
  else {
500
- a_bf16x8 = _mm_loadu_si128((__m128i const *)a_scalars);
501
- b_bf16x8 = _mm_loadu_si128((__m128i const *)b_scalars);
502
- a_scalars += 8, b_scalars += 8, count_scalars -= 8;
501
+ a_bf16_i16x16 = _mm256_loadu_si256((__m256i const *)a_scalars);
502
+ b_bf16_i16x16 = _mm256_loadu_si256((__m256i const *)b_scalars);
503
+ a_scalars += 16, b_scalars += 16, count_scalars -= 16;
503
504
  }
504
- sum_f32x8 = _mm256_fmadd_ps(nk_bf16x8_to_f32x8_haswell_(a_bf16x8), nk_bf16x8_to_f32x8_haswell_(b_bf16x8),
505
- sum_f32x8);
505
+ __m256 a_even_f32x8 = _mm256_castsi256_ps(_mm256_slli_epi32(a_bf16_i16x16, 16));
506
+ __m256 b_even_f32x8 = _mm256_castsi256_ps(_mm256_slli_epi32(b_bf16_i16x16, 16));
507
+ sum_f32x8 = _mm256_fmadd_ps(a_even_f32x8, b_even_f32x8, sum_f32x8);
508
+ __m256 a_odd_f32x8 = _mm256_castsi256_ps(_mm256_and_si256(a_bf16_i16x16, mask_high_u32x8));
509
+ __m256 b_odd_f32x8 = _mm256_castsi256_ps(_mm256_and_si256(b_bf16_i16x16, mask_high_u32x8));
510
+ sum_f32x8 = _mm256_fmadd_ps(a_odd_f32x8, b_odd_f32x8, sum_f32x8);
506
511
  if (count_scalars) goto nk_dot_bf16_haswell_cycle;
507
512
  *result = (nk_f32_t)nk_reduce_add_f32x8_haswell_(sum_f32x8);
508
513
  }
@@ -534,7 +539,7 @@ NK_PUBLIC void nk_dot_bf16c_haswell(nk_bf16c_t const *a_pairs, nk_bf16c_t const
534
539
  nk_f32c_t *result) {
535
540
  // Convert BF16 to F32, then use F32 complex dot product with sign-flipping optimization.
536
541
  // Uses same XOR trick as f32c to double throughput by deferring sign flips until after loop.
537
- __m128i a_bf16x8, b_bf16x8;
542
+ __m128i a_bf16_i16x8, b_bf16_i16x8;
538
543
  __m256 sum_real_f32x8 = _mm256_setzero_ps();
539
544
  __m256 sum_imag_f32x8 = _mm256_setzero_ps();
540
545
  __m256i const sign_flip_i64x4 = _mm256_set1_epi64x(0x8000000000000000);
@@ -547,19 +552,19 @@ nk_dot_bf16c_haswell_cycle:
547
552
  nk_b256_vec_t a_vec, b_vec;
548
553
  nk_partial_load_b16x16_serial_(a_pairs, &a_vec, count_pairs * 2);
549
554
  nk_partial_load_b16x16_serial_(b_pairs, &b_vec, count_pairs * 2);
550
- a_bf16x8 = a_vec.xmms[0];
551
- b_bf16x8 = b_vec.xmms[0];
555
+ a_bf16_i16x8 = a_vec.xmms[0];
556
+ b_bf16_i16x8 = b_vec.xmms[0];
552
557
  count_pairs = 0;
553
558
  }
554
559
  else {
555
- a_bf16x8 = _mm_loadu_si128((__m128i const *)a_pairs);
556
- b_bf16x8 = _mm_loadu_si128((__m128i const *)b_pairs);
560
+ a_bf16_i16x8 = _mm_loadu_si128((__m128i const *)a_pairs);
561
+ b_bf16_i16x8 = _mm_loadu_si128((__m128i const *)b_pairs);
557
562
  a_pairs += 4, b_pairs += 4, count_pairs -= 4;
558
563
  }
559
564
 
560
565
  // Convert BF16 to F32
561
- __m256 a_f32x8 = nk_bf16x8_to_f32x8_haswell_(a_bf16x8);
562
- __m256 b_f32x8 = nk_bf16x8_to_f32x8_haswell_(b_bf16x8);
566
+ __m256 a_f32x8 = nk_bf16x8_to_f32x8_haswell_(a_bf16_i16x8);
567
+ __m256 b_f32x8 = nk_bf16x8_to_f32x8_haswell_(b_bf16_i16x8);
563
568
 
564
569
  // Complex multiply-accumulate: swap b for imaginary part
565
570
  __m256 b_swapped_f32x8 = _mm256_castsi256_ps(
@@ -579,7 +584,7 @@ nk_dot_bf16c_haswell_cycle:
579
584
  NK_PUBLIC void nk_vdot_bf16c_haswell(nk_bf16c_t const *a_pairs, nk_bf16c_t const *b_pairs, nk_size_t count_pairs,
580
585
  nk_f32c_t *result) {
581
586
  // Conjugate complex dot product: conj(a) * b
582
- __m128i a_bf16x8, b_bf16x8;
587
+ __m128i a_bf16_i16x8, b_bf16_i16x8;
583
588
  __m256 sum_real_f32x8 = _mm256_setzero_ps();
584
589
  __m256 sum_imag_f32x8 = _mm256_setzero_ps();
585
590
  __m256i const sign_flip_i64x4 = _mm256_set1_epi64x(0x8000000000000000);
@@ -592,19 +597,19 @@ nk_vdot_bf16c_haswell_cycle:
592
597
  nk_b256_vec_t a_vec, b_vec;
593
598
  nk_partial_load_b16x16_serial_(a_pairs, &a_vec, count_pairs * 2);
594
599
  nk_partial_load_b16x16_serial_(b_pairs, &b_vec, count_pairs * 2);
595
- a_bf16x8 = a_vec.xmms[0];
596
- b_bf16x8 = b_vec.xmms[0];
600
+ a_bf16_i16x8 = a_vec.xmms[0];
601
+ b_bf16_i16x8 = b_vec.xmms[0];
597
602
  count_pairs = 0;
598
603
  }
599
604
  else {
600
- a_bf16x8 = _mm_loadu_si128((__m128i const *)a_pairs);
601
- b_bf16x8 = _mm_loadu_si128((__m128i const *)b_pairs);
605
+ a_bf16_i16x8 = _mm_loadu_si128((__m128i const *)a_pairs);
606
+ b_bf16_i16x8 = _mm_loadu_si128((__m128i const *)b_pairs);
602
607
  a_pairs += 4, b_pairs += 4, count_pairs -= 4;
603
608
  }
604
609
 
605
610
  // Convert BF16 to F32
606
- __m256 a_f32x8 = nk_bf16x8_to_f32x8_haswell_(a_bf16x8);
607
- __m256 b_f32x8 = nk_bf16x8_to_f32x8_haswell_(b_bf16x8);
611
+ __m256 a_f32x8 = nk_bf16x8_to_f32x8_haswell_(a_bf16_i16x8);
612
+ __m256 b_f32x8 = nk_bf16x8_to_f32x8_haswell_(b_bf16_i16x8);
608
613
 
609
614
  // Conjugate complex multiply-accumulate
610
615
  sum_real_f32x8 = _mm256_fmadd_ps(a_f32x8, b_f32x8, sum_real_f32x8);
@@ -724,10 +729,10 @@ NK_PUBLIC void nk_dot_e2m3_haswell(nk_e2m3_t const *a_scalars, nk_e2m3_t const *
724
729
  // lut_lower[0..15]: {0,2,4,6,8,10,12,14, 16,18,20,22,24,26,28,30}
725
730
  // lut_upper[0..15]: {32,36,40,44,48,52,56,60, 64,72,80,88,96,104,112,120}
726
731
  //
727
- __m256i const lut_lower_u8x32 = _mm256_set_epi8(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28,
728
- 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
729
- __m256i const lut_upper_u8x32 = _mm256_set_epi8(120, 112, 104, 96, 88, 80, 72, 64, 60, 56, 52, 48, 44, 40, 36, 32,
730
- 120, 112, 104, 96, 88, 80, 72, 64, 60, 56, 52, 48, 44, 40, 36, 32);
732
+ __m256i const lut_low_u8x32 = _mm256_set_epi8(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26,
733
+ 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
734
+ __m256i const lut_high_u8x32 = _mm256_set_epi8(120, 112, 104, 96, 88, 80, 72, 64, 60, 56, 52, 48, 44, 40, 36, 32,
735
+ 120, 112, 104, 96, 88, 80, 72, 64, 60, 56, 52, 48, 44, 40, 36, 32);
731
736
  __m256i const nibble_mask_u8x32 = _mm256_set1_epi8(0x0F);
732
737
  __m256i const magnitude_mask_u8x32 = _mm256_set1_epi8(0x1F);
733
738
  __m256i const half_select_u8x32 = _mm256_set1_epi8(0x10);
@@ -756,18 +761,18 @@ nk_dot_e2m3_haswell_cycle:
756
761
  __m256i b_magnitude_u8x32 = _mm256_and_si256(b_e2m3_u8x32, magnitude_mask_u8x32);
757
762
  __m256i a_shuffle_index_u8x32 = _mm256_and_si256(a_magnitude_u8x32, nibble_mask_u8x32);
758
763
  __m256i b_shuffle_index_u8x32 = _mm256_and_si256(b_magnitude_u8x32, nibble_mask_u8x32);
759
- __m256i a_upper_select_u8x32 = _mm256_cmpeq_epi8(_mm256_and_si256(a_magnitude_u8x32, half_select_u8x32),
760
- half_select_u8x32);
761
- __m256i b_upper_select_u8x32 = _mm256_cmpeq_epi8(_mm256_and_si256(b_magnitude_u8x32, half_select_u8x32),
762
- half_select_u8x32);
764
+ __m256i a_high_select_u8x32 = _mm256_cmpeq_epi8(_mm256_and_si256(a_magnitude_u8x32, half_select_u8x32),
765
+ half_select_u8x32);
766
+ __m256i b_high_select_u8x32 = _mm256_cmpeq_epi8(_mm256_and_si256(b_magnitude_u8x32, half_select_u8x32),
767
+ half_select_u8x32);
763
768
 
764
769
  // Dual VPSHUFB: lookup in both halves, blend based on bit 4
765
- __m256i a_unsigned_u8x32 = _mm256_blendv_epi8(_mm256_shuffle_epi8(lut_lower_u8x32, a_shuffle_index_u8x32),
766
- _mm256_shuffle_epi8(lut_upper_u8x32, a_shuffle_index_u8x32),
767
- a_upper_select_u8x32);
768
- __m256i b_unsigned_u8x32 = _mm256_blendv_epi8(_mm256_shuffle_epi8(lut_lower_u8x32, b_shuffle_index_u8x32),
769
- _mm256_shuffle_epi8(lut_upper_u8x32, b_shuffle_index_u8x32),
770
- b_upper_select_u8x32);
770
+ __m256i a_unsigned_u8x32 = _mm256_blendv_epi8(_mm256_shuffle_epi8(lut_low_u8x32, a_shuffle_index_u8x32),
771
+ _mm256_shuffle_epi8(lut_high_u8x32, a_shuffle_index_u8x32),
772
+ a_high_select_u8x32);
773
+ __m256i b_unsigned_u8x32 = _mm256_blendv_epi8(_mm256_shuffle_epi8(lut_low_u8x32, b_shuffle_index_u8x32),
774
+ _mm256_shuffle_epi8(lut_high_u8x32, b_shuffle_index_u8x32),
775
+ b_high_select_u8x32);
771
776
 
772
777
  // Combined sign: (a ^ b) & 0x20, negate b where signs differ
773
778
  __m256i sign_combined_u8x32 = _mm256_and_si256(_mm256_xor_si256(a_e2m3_u8x32, b_e2m3_u8x32), sign_mask_u8x32);
@@ -795,10 +800,10 @@ NK_PUBLIC void nk_dot_e3m2_haswell(nk_e3m2_t const *a_scalars, nk_e3m2_t const *
795
800
  // lut_upper[0..15]: low bytes of {32,40,48,56,64,80,96,112,128,160,192,224,256,320,384,448}
796
801
  // High byte is 1 iff magnitude index >= 28 (values 256-448), else 0.
797
802
  //
798
- __m256i const lut_lo_lower_u8x32 = _mm256_set_epi8( //
803
+ __m256i const lut_low_byte_first_u8x32 = _mm256_set_epi8( //
799
804
  28, 24, 20, 16, 14, 12, 10, 8, 7, 6, 5, 4, 3, 2, 1, 0, //
800
805
  28, 24, 20, 16, 14, 12, 10, 8, 7, 6, 5, 4, 3, 2, 1, 0);
801
- __m256i const lut_lo_upper_u8x32 = _mm256_set_epi8( //
806
+ __m256i const lut_low_byte_second_u8x32 = _mm256_set_epi8( //
802
807
  (char)192, (char)128, 64, 0, (char)224, (char)192, (char)160, (char)128, 112, 96, 80, 64, 56, 48, 40, 32, //
803
808
  (char)192, (char)128, 64, 0, (char)224, (char)192, (char)160, (char)128, 112, 96, 80, 64, 56, 48, 40, 32);
804
809
  __m256i const nibble_mask_u8x32 = _mm256_set1_epi8(0x0F);
@@ -831,42 +836,44 @@ nk_dot_e3m2_haswell_cycle:
831
836
  __m256i b_magnitude_u8x32 = _mm256_and_si256(b_e3m2_u8x32, magnitude_mask_u8x32);
832
837
  __m256i a_shuffle_index_u8x32 = _mm256_and_si256(a_magnitude_u8x32, nibble_mask_u8x32);
833
838
  __m256i b_shuffle_index_u8x32 = _mm256_and_si256(b_magnitude_u8x32, nibble_mask_u8x32);
834
- __m256i a_upper_select_u8x32 = _mm256_cmpeq_epi8(_mm256_and_si256(a_magnitude_u8x32, half_select_u8x32),
835
- half_select_u8x32);
836
- __m256i b_upper_select_u8x32 = _mm256_cmpeq_epi8(_mm256_and_si256(b_magnitude_u8x32, half_select_u8x32),
837
- half_select_u8x32);
839
+ __m256i a_high_select_u8x32 = _mm256_cmpeq_epi8(_mm256_and_si256(a_magnitude_u8x32, half_select_u8x32),
840
+ half_select_u8x32);
841
+ __m256i b_high_select_u8x32 = _mm256_cmpeq_epi8(_mm256_and_si256(b_magnitude_u8x32, half_select_u8x32),
842
+ half_select_u8x32);
838
843
 
839
844
  // Dual VPSHUFB: lookup low bytes in both halves, blend based on bit 4
840
- __m256i a_lo_bytes_u8x32 = _mm256_blendv_epi8(_mm256_shuffle_epi8(lut_lo_lower_u8x32, a_shuffle_index_u8x32),
841
- _mm256_shuffle_epi8(lut_lo_upper_u8x32, a_shuffle_index_u8x32),
842
- a_upper_select_u8x32);
843
- __m256i b_lo_bytes_u8x32 = _mm256_blendv_epi8(_mm256_shuffle_epi8(lut_lo_lower_u8x32, b_shuffle_index_u8x32),
844
- _mm256_shuffle_epi8(lut_lo_upper_u8x32, b_shuffle_index_u8x32),
845
- b_upper_select_u8x32);
845
+ __m256i a_low_byte_u8x32 = _mm256_blendv_epi8(_mm256_shuffle_epi8(lut_low_byte_first_u8x32, a_shuffle_index_u8x32),
846
+ _mm256_shuffle_epi8(lut_low_byte_second_u8x32, a_shuffle_index_u8x32),
847
+ a_high_select_u8x32);
848
+ __m256i b_low_byte_u8x32 = _mm256_blendv_epi8(_mm256_shuffle_epi8(lut_low_byte_first_u8x32, b_shuffle_index_u8x32),
849
+ _mm256_shuffle_epi8(lut_low_byte_second_u8x32, b_shuffle_index_u8x32),
850
+ b_high_select_u8x32);
846
851
 
847
852
  // High byte: 1 iff magnitude >= 28 (signed compare safe: 27 < 128)
848
- __m256i a_hi_bytes_u8x32 = _mm256_and_si256(_mm256_cmpgt_epi8(a_magnitude_u8x32, high_threshold_u8x32), ones_u8x32);
849
- __m256i b_hi_bytes_u8x32 = _mm256_and_si256(_mm256_cmpgt_epi8(b_magnitude_u8x32, high_threshold_u8x32), ones_u8x32);
853
+ __m256i a_high_byte_u8x32 = _mm256_and_si256(_mm256_cmpgt_epi8(a_magnitude_u8x32, high_threshold_u8x32),
854
+ ones_u8x32);
855
+ __m256i b_high_byte_u8x32 = _mm256_and_si256(_mm256_cmpgt_epi8(b_magnitude_u8x32, high_threshold_u8x32),
856
+ ones_u8x32);
850
857
 
851
858
  // Interleave low and high bytes into i16 (little-endian: low byte first)
852
- __m256i a_lo_i16x16 = _mm256_unpacklo_epi8(a_lo_bytes_u8x32, a_hi_bytes_u8x32);
853
- __m256i a_hi_i16x16 = _mm256_unpackhi_epi8(a_lo_bytes_u8x32, a_hi_bytes_u8x32);
854
- __m256i b_lo_i16x16 = _mm256_unpacklo_epi8(b_lo_bytes_u8x32, b_hi_bytes_u8x32);
855
- __m256i b_hi_i16x16 = _mm256_unpackhi_epi8(b_lo_bytes_u8x32, b_hi_bytes_u8x32);
859
+ __m256i a_low_i16x16 = _mm256_unpacklo_epi8(a_low_byte_u8x32, a_high_byte_u8x32);
860
+ __m256i a_high_i16x16 = _mm256_unpackhi_epi8(a_low_byte_u8x32, a_high_byte_u8x32);
861
+ __m256i b_low_i16x16 = _mm256_unpacklo_epi8(b_low_byte_u8x32, b_high_byte_u8x32);
862
+ __m256i b_high_i16x16 = _mm256_unpackhi_epi8(b_low_byte_u8x32, b_high_byte_u8x32);
856
863
 
857
864
  // Combined sign: (a ^ b) & 0x20, widen to i16 via unpack, create +1/-1 sign vector
858
865
  __m256i sign_combined_u8x32 = _mm256_and_si256(_mm256_xor_si256(a_e3m2_u8x32, b_e3m2_u8x32), sign_mask_u8x32);
859
866
  __m256i negate_mask_u8x32 = _mm256_cmpeq_epi8(sign_combined_u8x32, sign_mask_u8x32);
860
- __m256i negate_lo_i16x16 = _mm256_unpacklo_epi8(negate_mask_u8x32, negate_mask_u8x32);
861
- __m256i negate_hi_i16x16 = _mm256_unpackhi_epi8(negate_mask_u8x32, negate_mask_u8x32);
862
- __m256i sign_lo_i16x16 = _mm256_or_si256(negate_lo_i16x16, ones_i16x16);
863
- __m256i sign_hi_i16x16 = _mm256_or_si256(negate_hi_i16x16, ones_i16x16);
864
- __m256i b_signed_lo_i16x16 = _mm256_sign_epi16(b_lo_i16x16, sign_lo_i16x16);
865
- __m256i b_signed_hi_i16x16 = _mm256_sign_epi16(b_hi_i16x16, sign_hi_i16x16);
867
+ __m256i negate_low_i16x16 = _mm256_unpacklo_epi8(negate_mask_u8x32, negate_mask_u8x32);
868
+ __m256i negate_high_i16x16 = _mm256_unpackhi_epi8(negate_mask_u8x32, negate_mask_u8x32);
869
+ __m256i sign_low_i16x16 = _mm256_or_si256(negate_low_i16x16, ones_i16x16);
870
+ __m256i sign_high_i16x16 = _mm256_or_si256(negate_high_i16x16, ones_i16x16);
871
+ __m256i b_signed_low_i16x16 = _mm256_sign_epi16(b_low_i16x16, sign_low_i16x16);
872
+ __m256i b_signed_high_i16x16 = _mm256_sign_epi16(b_high_i16x16, sign_high_i16x16);
866
873
 
867
874
  // VPMADDWD: a_unsigned_i16 × b_signed_i16 → i32 accumulator
868
- sum_i32x8 = _mm256_add_epi32(sum_i32x8, _mm256_madd_epi16(a_lo_i16x16, b_signed_lo_i16x16));
869
- sum_i32x8 = _mm256_add_epi32(sum_i32x8, _mm256_madd_epi16(a_hi_i16x16, b_signed_hi_i16x16));
875
+ sum_i32x8 = _mm256_add_epi32(sum_i32x8, _mm256_madd_epi16(a_low_i16x16, b_signed_low_i16x16));
876
+ sum_i32x8 = _mm256_add_epi32(sum_i32x8, _mm256_madd_epi16(a_high_i16x16, b_signed_high_i16x16));
870
877
 
871
878
  if (count_scalars) goto nk_dot_e3m2_haswell_cycle;
872
879
  *result = (nk_f32_t)nk_reduce_add_i32x8_haswell_(sum_i32x8) / 256.0f;
@@ -946,10 +953,34 @@ NK_INTERNAL void nk_dot_through_f32_finalize_haswell_(
946
953
  typedef struct nk_dot_through_f32_state_haswell_t_ nk_dot_f16x8_state_haswell_t;
947
954
 
948
955
  /**
949
- * @brief Running state for 128-bit dot accumulation over bf16 scalars on Haswell.
950
- * @note Alias of nk_dot_through_f32_state_haswell_t_
956
+ * @brief Running state for 256-bit dot accumulation over bf16 scalars on Haswell.
957
+ * @note Processes 16 bf16 per tile step via unpack(zero, bf16) → 2×8 f32 FMA.
951
958
  */
952
- typedef struct nk_dot_through_f32_state_haswell_t_ nk_dot_bf16x8_state_haswell_t;
959
+ typedef struct nk_dot_through_f32_state_haswell_t_ nk_dot_bf16x16_state_haswell_t;
960
+
961
+ NK_INTERNAL void nk_dot_bf16x16_init_haswell(nk_dot_bf16x16_state_haswell_t *state) {
962
+ nk_dot_through_f32_init_haswell_(state);
963
+ }
964
+
965
+ NK_INTERNAL void nk_dot_bf16x16_update_haswell(nk_dot_bf16x16_state_haswell_t *state, nk_b256_vec_t a, nk_b256_vec_t b,
966
+ nk_size_t depth_offset, nk_size_t active_dimensions) {
967
+ nk_unused_(depth_offset);
968
+ nk_unused_(active_dimensions);
969
+ __m256i mask_high_u32x8 = _mm256_set1_epi32((int)0xFFFF0000);
970
+ __m256 a_even_f32x8 = _mm256_castsi256_ps(_mm256_slli_epi32(a.ymm, 16));
971
+ __m256 b_even_f32x8 = _mm256_castsi256_ps(_mm256_slli_epi32(b.ymm, 16));
972
+ state->sum_f32x8 = _mm256_fmadd_ps(a_even_f32x8, b_even_f32x8, state->sum_f32x8);
973
+ __m256 a_odd_f32x8 = _mm256_castsi256_ps(_mm256_and_si256(a.ymm, mask_high_u32x8));
974
+ __m256 b_odd_f32x8 = _mm256_castsi256_ps(_mm256_and_si256(b.ymm, mask_high_u32x8));
975
+ state->sum_f32x8 = _mm256_fmadd_ps(a_odd_f32x8, b_odd_f32x8, state->sum_f32x8);
976
+ }
977
+
978
+ NK_INTERNAL void nk_dot_bf16x16_finalize_haswell( //
979
+ nk_dot_bf16x16_state_haswell_t const *state_a, nk_dot_bf16x16_state_haswell_t const *state_b, //
980
+ nk_dot_bf16x16_state_haswell_t const *state_c, nk_dot_bf16x16_state_haswell_t const *state_d, //
981
+ nk_size_t total_dimensions, nk_b128_vec_t *result) {
982
+ nk_dot_through_f32_finalize_haswell_(state_a, state_b, state_c, state_d, total_dimensions, result);
983
+ }
953
984
 
954
985
  /**
955
986
  * @brief Running state for 128-bit dot accumulation over e4m3 scalars on Haswell.
@@ -991,10 +1022,10 @@ NK_INTERNAL void nk_dot_e2m3x32_update_haswell(nk_dot_e2m3x32_state_haswell_t *s
991
1022
  nk_size_t depth_offset, nk_size_t active_dimensions) {
992
1023
  nk_unused_(depth_offset);
993
1024
  nk_unused_(active_dimensions);
994
- __m256i const lut_lower_u8x32 = _mm256_set_epi8( //
1025
+ __m256i const lut_low_u8x32 = _mm256_set_epi8( //
995
1026
  30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, //
996
1027
  30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
997
- __m256i const lut_upper_u8x32 = _mm256_set_epi8( //
1028
+ __m256i const lut_high_u8x32 = _mm256_set_epi8( //
998
1029
  120, 112, 104, 96, 88, 80, 72, 64, 60, 56, 52, 48, 44, 40, 36, 32, //
999
1030
  120, 112, 104, 96, 88, 80, 72, 64, 60, 56, 52, 48, 44, 40, 36, 32);
1000
1031
  __m256i const nibble_mask_u8x32 = _mm256_set1_epi8(0x0F);
@@ -1011,18 +1042,18 @@ NK_INTERNAL void nk_dot_e2m3x32_update_haswell(nk_dot_e2m3x32_state_haswell_t *s
1011
1042
  __m256i b_magnitude_u8x32 = _mm256_and_si256(b_e2m3_u8x32, magnitude_mask_u8x32);
1012
1043
  __m256i a_shuffle_index_u8x32 = _mm256_and_si256(a_magnitude_u8x32, nibble_mask_u8x32);
1013
1044
  __m256i b_shuffle_index_u8x32 = _mm256_and_si256(b_magnitude_u8x32, nibble_mask_u8x32);
1014
- __m256i a_upper_select_u8x32 = _mm256_cmpeq_epi8(_mm256_and_si256(a_magnitude_u8x32, half_select_u8x32),
1015
- half_select_u8x32);
1016
- __m256i b_upper_select_u8x32 = _mm256_cmpeq_epi8(_mm256_and_si256(b_magnitude_u8x32, half_select_u8x32),
1017
- half_select_u8x32);
1045
+ __m256i a_high_select_u8x32 = _mm256_cmpeq_epi8(_mm256_and_si256(a_magnitude_u8x32, half_select_u8x32),
1046
+ half_select_u8x32);
1047
+ __m256i b_high_select_u8x32 = _mm256_cmpeq_epi8(_mm256_and_si256(b_magnitude_u8x32, half_select_u8x32),
1048
+ half_select_u8x32);
1018
1049
 
1019
1050
  // Dual VPSHUFB + blend
1020
- __m256i a_unsigned_u8x32 = _mm256_blendv_epi8(_mm256_shuffle_epi8(lut_lower_u8x32, a_shuffle_index_u8x32),
1021
- _mm256_shuffle_epi8(lut_upper_u8x32, a_shuffle_index_u8x32),
1022
- a_upper_select_u8x32);
1023
- __m256i b_unsigned_u8x32 = _mm256_blendv_epi8(_mm256_shuffle_epi8(lut_lower_u8x32, b_shuffle_index_u8x32),
1024
- _mm256_shuffle_epi8(lut_upper_u8x32, b_shuffle_index_u8x32),
1025
- b_upper_select_u8x32);
1051
+ __m256i a_unsigned_u8x32 = _mm256_blendv_epi8(_mm256_shuffle_epi8(lut_low_u8x32, a_shuffle_index_u8x32),
1052
+ _mm256_shuffle_epi8(lut_high_u8x32, a_shuffle_index_u8x32),
1053
+ a_high_select_u8x32);
1054
+ __m256i b_unsigned_u8x32 = _mm256_blendv_epi8(_mm256_shuffle_epi8(lut_low_u8x32, b_shuffle_index_u8x32),
1055
+ _mm256_shuffle_epi8(lut_high_u8x32, b_shuffle_index_u8x32),
1056
+ b_high_select_u8x32);
1026
1057
 
1027
1058
  // Combined sign + conditional negate
1028
1059
  __m256i sign_combined_u8x32 = _mm256_and_si256(_mm256_xor_si256(a_e2m3_u8x32, b_e2m3_u8x32), sign_mask_u8x32);
@@ -1086,9 +1117,9 @@ NK_INTERNAL void nk_dot_e3m2x32_update_haswell(nk_dot_e3m2x32_state_haswell_t *s
1086
1117
  nk_size_t depth_offset, nk_size_t active_dimensions) {
1087
1118
  nk_unused_(depth_offset);
1088
1119
  nk_unused_(active_dimensions);
1089
- __m256i const lut_lo_lower_u8x32 = _mm256_set_epi8( //
1120
+ __m256i const lut_low_byte_first_u8x32 = _mm256_set_epi8( //
1090
1121
  28, 24, 20, 16, 14, 12, 10, 8, 7, 6, 5, 4, 3, 2, 1, 0, 28, 24, 20, 16, 14, 12, 10, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1091
- __m256i const lut_lo_upper_u8x32 = _mm256_set_epi8( //
1122
+ __m256i const lut_low_byte_second_u8x32 = _mm256_set_epi8( //
1092
1123
  (char)192, (char)128, 64, 0, (char)224, (char)192, (char)160, (char)128, 112, 96, 80, 64, 56, 48, 40, 32, //
1093
1124
  (char)192, (char)128, 64, 0, (char)224, (char)192, (char)160, (char)128, 112, 96, 80, 64, 56, 48, 40, 32);
1094
1125
  __m256i const nibble_mask_u8x32 = _mm256_set1_epi8(0x0F);
@@ -1107,42 +1138,44 @@ NK_INTERNAL void nk_dot_e3m2x32_update_haswell(nk_dot_e3m2x32_state_haswell_t *s
1107
1138
  __m256i b_magnitude_u8x32 = _mm256_and_si256(b_e3m2_u8x32, magnitude_mask_u8x32);
1108
1139
  __m256i a_shuffle_index_u8x32 = _mm256_and_si256(a_magnitude_u8x32, nibble_mask_u8x32);
1109
1140
  __m256i b_shuffle_index_u8x32 = _mm256_and_si256(b_magnitude_u8x32, nibble_mask_u8x32);
1110
- __m256i a_upper_select_u8x32 = _mm256_cmpeq_epi8(_mm256_and_si256(a_magnitude_u8x32, half_select_u8x32),
1111
- half_select_u8x32);
1112
- __m256i b_upper_select_u8x32 = _mm256_cmpeq_epi8(_mm256_and_si256(b_magnitude_u8x32, half_select_u8x32),
1113
- half_select_u8x32);
1141
+ __m256i a_high_select_u8x32 = _mm256_cmpeq_epi8(_mm256_and_si256(a_magnitude_u8x32, half_select_u8x32),
1142
+ half_select_u8x32);
1143
+ __m256i b_high_select_u8x32 = _mm256_cmpeq_epi8(_mm256_and_si256(b_magnitude_u8x32, half_select_u8x32),
1144
+ half_select_u8x32);
1114
1145
 
1115
1146
  // Dual VPSHUFB for low bytes
1116
- __m256i a_lo_bytes_u8x32 = _mm256_blendv_epi8(_mm256_shuffle_epi8(lut_lo_lower_u8x32, a_shuffle_index_u8x32),
1117
- _mm256_shuffle_epi8(lut_lo_upper_u8x32, a_shuffle_index_u8x32),
1118
- a_upper_select_u8x32);
1119
- __m256i b_lo_bytes_u8x32 = _mm256_blendv_epi8(_mm256_shuffle_epi8(lut_lo_lower_u8x32, b_shuffle_index_u8x32),
1120
- _mm256_shuffle_epi8(lut_lo_upper_u8x32, b_shuffle_index_u8x32),
1121
- b_upper_select_u8x32);
1147
+ __m256i a_low_byte_u8x32 = _mm256_blendv_epi8(_mm256_shuffle_epi8(lut_low_byte_first_u8x32, a_shuffle_index_u8x32),
1148
+ _mm256_shuffle_epi8(lut_low_byte_second_u8x32, a_shuffle_index_u8x32),
1149
+ a_high_select_u8x32);
1150
+ __m256i b_low_byte_u8x32 = _mm256_blendv_epi8(_mm256_shuffle_epi8(lut_low_byte_first_u8x32, b_shuffle_index_u8x32),
1151
+ _mm256_shuffle_epi8(lut_low_byte_second_u8x32, b_shuffle_index_u8x32),
1152
+ b_high_select_u8x32);
1122
1153
 
1123
1154
  // High byte: 1 iff magnitude >= 28
1124
- __m256i a_hi_bytes_u8x32 = _mm256_and_si256(_mm256_cmpgt_epi8(a_magnitude_u8x32, high_threshold_u8x32), ones_u8x32);
1125
- __m256i b_hi_bytes_u8x32 = _mm256_and_si256(_mm256_cmpgt_epi8(b_magnitude_u8x32, high_threshold_u8x32), ones_u8x32);
1155
+ __m256i a_high_byte_u8x32 = _mm256_and_si256(_mm256_cmpgt_epi8(a_magnitude_u8x32, high_threshold_u8x32),
1156
+ ones_u8x32);
1157
+ __m256i b_high_byte_u8x32 = _mm256_and_si256(_mm256_cmpgt_epi8(b_magnitude_u8x32, high_threshold_u8x32),
1158
+ ones_u8x32);
1126
1159
 
1127
1160
  // Interleave low and high bytes into i16
1128
- __m256i a_lo_i16x16 = _mm256_unpacklo_epi8(a_lo_bytes_u8x32, a_hi_bytes_u8x32);
1129
- __m256i a_hi_i16x16 = _mm256_unpackhi_epi8(a_lo_bytes_u8x32, a_hi_bytes_u8x32);
1130
- __m256i b_lo_i16x16 = _mm256_unpacklo_epi8(b_lo_bytes_u8x32, b_hi_bytes_u8x32);
1131
- __m256i b_hi_i16x16 = _mm256_unpackhi_epi8(b_lo_bytes_u8x32, b_hi_bytes_u8x32);
1161
+ __m256i a_low_i16x16 = _mm256_unpacklo_epi8(a_low_byte_u8x32, a_high_byte_u8x32);
1162
+ __m256i a_high_i16x16 = _mm256_unpackhi_epi8(a_low_byte_u8x32, a_high_byte_u8x32);
1163
+ __m256i b_low_i16x16 = _mm256_unpacklo_epi8(b_low_byte_u8x32, b_high_byte_u8x32);
1164
+ __m256i b_high_i16x16 = _mm256_unpackhi_epi8(b_low_byte_u8x32, b_high_byte_u8x32);
1132
1165
 
1133
1166
  // Combined sign: (a ^ b) & 0x20, widen to i16, create +1/-1 sign vector via VPSIGNW
1134
1167
  __m256i sign_combined_u8x32 = _mm256_and_si256(_mm256_xor_si256(a_e3m2_u8x32, b_e3m2_u8x32), sign_mask_u8x32);
1135
1168
  __m256i negate_mask_u8x32 = _mm256_cmpeq_epi8(sign_combined_u8x32, sign_mask_u8x32);
1136
- __m256i negate_lo_i16x16 = _mm256_unpacklo_epi8(negate_mask_u8x32, negate_mask_u8x32);
1137
- __m256i negate_hi_i16x16 = _mm256_unpackhi_epi8(negate_mask_u8x32, negate_mask_u8x32);
1138
- __m256i sign_lo_i16x16 = _mm256_or_si256(negate_lo_i16x16, ones_i16x16);
1139
- __m256i sign_hi_i16x16 = _mm256_or_si256(negate_hi_i16x16, ones_i16x16);
1140
- __m256i b_signed_lo_i16x16 = _mm256_sign_epi16(b_lo_i16x16, sign_lo_i16x16);
1141
- __m256i b_signed_hi_i16x16 = _mm256_sign_epi16(b_hi_i16x16, sign_hi_i16x16);
1169
+ __m256i negate_low_i16x16 = _mm256_unpacklo_epi8(negate_mask_u8x32, negate_mask_u8x32);
1170
+ __m256i negate_high_i16x16 = _mm256_unpackhi_epi8(negate_mask_u8x32, negate_mask_u8x32);
1171
+ __m256i sign_low_i16x16 = _mm256_or_si256(negate_low_i16x16, ones_i16x16);
1172
+ __m256i sign_high_i16x16 = _mm256_or_si256(negate_high_i16x16, ones_i16x16);
1173
+ __m256i b_signed_low_i16x16 = _mm256_sign_epi16(b_low_i16x16, sign_low_i16x16);
1174
+ __m256i b_signed_high_i16x16 = _mm256_sign_epi16(b_high_i16x16, sign_high_i16x16);
1142
1175
 
1143
1176
  // VPMADDWD: a_unsigned_i16 × b_signed_i16 → i32 (two halves → two accumulators)
1144
- state->sum_a_i32x8 = _mm256_add_epi32(state->sum_a_i32x8, _mm256_madd_epi16(a_lo_i16x16, b_signed_lo_i16x16));
1145
- state->sum_b_i32x8 = _mm256_add_epi32(state->sum_b_i32x8, _mm256_madd_epi16(a_hi_i16x16, b_signed_hi_i16x16));
1177
+ state->sum_a_i32x8 = _mm256_add_epi32(state->sum_a_i32x8, _mm256_madd_epi16(a_low_i16x16, b_signed_low_i16x16));
1178
+ state->sum_b_i32x8 = _mm256_add_epi32(state->sum_b_i32x8, _mm256_madd_epi16(a_high_i16x16, b_signed_high_i16x16));
1146
1179
  }
1147
1180
 
1148
1181
  NK_INTERNAL void nk_dot_e3m2x32_finalize_haswell( //
@@ -1176,9 +1209,9 @@ NK_INTERNAL void nk_dot_e3m2x32_finalize_haswell(
1176
1209
  results->xmm = _mm_castps_si128(sum_f32x4);
1177
1210
  }
1178
1211
 
1179
- #pragma endregion - Smaller Floats
1212
+ #pragma endregion F16 and BF16 Floats
1180
1213
 
1181
- #pragma region - Small Integers
1214
+ #pragma region I8 and U8 Integers
1182
1215
 
1183
1216
  NK_PUBLIC void nk_dot_i8_haswell(nk_i8_t const *a_scalars, nk_i8_t const *b_scalars, nk_size_t count_scalars,
1184
1217
  nk_i32_t *result) {
@@ -1275,33 +1308,33 @@ nk_dot_i4_haswell_cycle:
1275
1308
  }
1276
1309
 
1277
1310
  // Extract low and high nibbles
1278
- __m128i a_lo_u8x16 = _mm_and_si128(a_i4x32, nibble_mask_u8x16);
1279
- __m128i a_hi_u8x16 = _mm_and_si128(_mm_srli_epi16(a_i4x32, 4), nibble_mask_u8x16);
1280
- __m128i b_lo_u8x16 = _mm_and_si128(b_i4x32, nibble_mask_u8x16);
1281
- __m128i b_hi_u8x16 = _mm_and_si128(_mm_srli_epi16(b_i4x32, 4), nibble_mask_u8x16);
1311
+ __m128i a_low_u8x16 = _mm_and_si128(a_i4x32, nibble_mask_u8x16);
1312
+ __m128i a_high_u8x16 = _mm_and_si128(_mm_srli_epi16(a_i4x32, 4), nibble_mask_u8x16);
1313
+ __m128i b_low_u8x16 = _mm_and_si128(b_i4x32, nibble_mask_u8x16);
1314
+ __m128i b_high_u8x16 = _mm_and_si128(_mm_srli_epi16(b_i4x32, 4), nibble_mask_u8x16);
1282
1315
 
1283
1316
  // XOR with 8 to get cx, dx values for the algebraic transformation
1284
- __m128i c_lo_u8x16 = _mm_xor_si128(a_lo_u8x16, xor_mask_u8x16);
1285
- __m128i c_hi_u8x16 = _mm_xor_si128(a_hi_u8x16, xor_mask_u8x16);
1286
- __m128i d_lo_u8x16 = _mm_xor_si128(b_lo_u8x16, xor_mask_u8x16);
1287
- __m128i d_hi_u8x16 = _mm_xor_si128(b_hi_u8x16, xor_mask_u8x16);
1317
+ __m128i c_low_u8x16 = _mm_xor_si128(a_low_u8x16, xor_mask_u8x16);
1318
+ __m128i c_high_u8x16 = _mm_xor_si128(a_high_u8x16, xor_mask_u8x16);
1319
+ __m128i d_low_u8x16 = _mm_xor_si128(b_low_u8x16, xor_mask_u8x16);
1320
+ __m128i d_high_u8x16 = _mm_xor_si128(b_high_u8x16, xor_mask_u8x16);
1288
1321
 
1289
1322
  // Widen u8 to i16 and multiply using MADD (2× instead of 4×)
1290
- __m256i c_lo_i16x16 = _mm256_cvtepu8_epi16(c_lo_u8x16);
1291
- __m256i c_hi_i16x16 = _mm256_cvtepu8_epi16(c_hi_u8x16);
1292
- __m256i d_lo_i16x16 = _mm256_cvtepu8_epi16(d_lo_u8x16);
1293
- __m256i d_hi_i16x16 = _mm256_cvtepu8_epi16(d_hi_u8x16);
1323
+ __m256i c_low_i16x16 = _mm256_cvtepu8_epi16(c_low_u8x16);
1324
+ __m256i c_high_i16x16 = _mm256_cvtepu8_epi16(c_high_u8x16);
1325
+ __m256i d_low_i16x16 = _mm256_cvtepu8_epi16(d_low_u8x16);
1326
+ __m256i d_high_i16x16 = _mm256_cvtepu8_epi16(d_high_u8x16);
1294
1327
 
1295
1328
  // Multiply i16×i16 and accumulate to i32 using MADD
1296
- sum_cd_i32x8 = _mm256_add_epi32(sum_cd_i32x8, _mm256_madd_epi16(c_lo_i16x16, d_lo_i16x16));
1297
- sum_cd_i32x8 = _mm256_add_epi32(sum_cd_i32x8, _mm256_madd_epi16(c_hi_i16x16, d_hi_i16x16));
1329
+ sum_cd_i32x8 = _mm256_add_epi32(sum_cd_i32x8, _mm256_madd_epi16(c_low_i16x16, d_low_i16x16));
1330
+ sum_cd_i32x8 = _mm256_add_epi32(sum_cd_i32x8, _mm256_madd_epi16(c_high_i16x16, d_high_i16x16));
1298
1331
 
1299
1332
  // Optimization: Use SAD for correction sums (5cy vs 24cy for 8× widenings)
1300
1333
  // PSADBW sums 8× u8 values to a single i64 in each 64-bit lane
1301
- sum_cx_i64x2 = _mm_add_epi64(sum_cx_i64x2, _mm_sad_epu8(c_lo_u8x16, zeros_u8x16));
1302
- sum_cx_i64x2 = _mm_add_epi64(sum_cx_i64x2, _mm_sad_epu8(c_hi_u8x16, zeros_u8x16));
1303
- sum_dx_i64x2 = _mm_add_epi64(sum_dx_i64x2, _mm_sad_epu8(d_lo_u8x16, zeros_u8x16));
1304
- sum_dx_i64x2 = _mm_add_epi64(sum_dx_i64x2, _mm_sad_epu8(d_hi_u8x16, zeros_u8x16));
1334
+ sum_cx_i64x2 = _mm_add_epi64(sum_cx_i64x2, _mm_sad_epu8(c_low_u8x16, zeros_u8x16));
1335
+ sum_cx_i64x2 = _mm_add_epi64(sum_cx_i64x2, _mm_sad_epu8(c_high_u8x16, zeros_u8x16));
1336
+ sum_dx_i64x2 = _mm_add_epi64(sum_dx_i64x2, _mm_sad_epu8(d_low_u8x16, zeros_u8x16));
1337
+ sum_dx_i64x2 = _mm_add_epi64(sum_dx_i64x2, _mm_sad_epu8(d_high_u8x16, zeros_u8x16));
1305
1338
 
1306
1339
  if (n_bytes) goto nk_dot_i4_haswell_cycle;
1307
1340
 
@@ -1347,20 +1380,20 @@ nk_dot_u4_haswell_cycle:
1347
1380
  }
1348
1381
 
1349
1382
  // Extract low and high nibbles
1350
- __m128i a_lo_u8x16 = _mm_and_si128(a_u4x32, nibble_mask_u8x16);
1351
- __m128i a_hi_u8x16 = _mm_and_si128(_mm_srli_epi16(a_u4x32, 4), nibble_mask_u8x16);
1352
- __m128i b_lo_u8x16 = _mm_and_si128(b_u4x32, nibble_mask_u8x16);
1353
- __m128i b_hi_u8x16 = _mm_and_si128(_mm_srli_epi16(b_u4x32, 4), nibble_mask_u8x16);
1383
+ __m128i a_low_u8x16 = _mm_and_si128(a_u4x32, nibble_mask_u8x16);
1384
+ __m128i a_high_u8x16 = _mm_and_si128(_mm_srli_epi16(a_u4x32, 4), nibble_mask_u8x16);
1385
+ __m128i b_low_u8x16 = _mm_and_si128(b_u4x32, nibble_mask_u8x16);
1386
+ __m128i b_high_u8x16 = _mm_and_si128(_mm_srli_epi16(b_u4x32, 4), nibble_mask_u8x16);
1354
1387
 
1355
1388
  // Widen u8 to i16
1356
- __m256i a_lo_i16x16 = _mm256_cvtepu8_epi16(a_lo_u8x16);
1357
- __m256i a_hi_i16x16 = _mm256_cvtepu8_epi16(a_hi_u8x16);
1358
- __m256i b_lo_i16x16 = _mm256_cvtepu8_epi16(b_lo_u8x16);
1359
- __m256i b_hi_i16x16 = _mm256_cvtepu8_epi16(b_hi_u8x16);
1389
+ __m256i a_low_i16x16 = _mm256_cvtepu8_epi16(a_low_u8x16);
1390
+ __m256i a_high_i16x16 = _mm256_cvtepu8_epi16(a_high_u8x16);
1391
+ __m256i b_low_i16x16 = _mm256_cvtepu8_epi16(b_low_u8x16);
1392
+ __m256i b_high_i16x16 = _mm256_cvtepu8_epi16(b_high_u8x16);
1360
1393
 
1361
1394
  // Multiply i16×i16 and accumulate to i32 using MADD
1362
- sum_i32x8 = _mm256_add_epi32(sum_i32x8, _mm256_madd_epi16(a_lo_i16x16, b_lo_i16x16));
1363
- sum_i32x8 = _mm256_add_epi32(sum_i32x8, _mm256_madd_epi16(a_hi_i16x16, b_hi_i16x16));
1395
+ sum_i32x8 = _mm256_add_epi32(sum_i32x8, _mm256_madd_epi16(a_low_i16x16, b_low_i16x16));
1396
+ sum_i32x8 = _mm256_add_epi32(sum_i32x8, _mm256_madd_epi16(a_high_i16x16, b_high_i16x16));
1364
1397
 
1365
1398
  if (n_bytes) goto nk_dot_u4_haswell_cycle;
1366
1399
 
@@ -1496,28 +1529,28 @@ NK_INTERNAL void nk_dot_i4x32_update_haswell(nk_dot_i4x32_state_haswell_t *state
1496
1529
  __m128i b_i4x32 = b.xmm;
1497
1530
 
1498
1531
  // Extract low and high nibbles
1499
- __m128i a_lo_u8x16 = _mm_and_si128(a_i4x32, nibble_mask_u8x16);
1500
- __m128i a_hi_u8x16 = _mm_and_si128(_mm_srli_epi16(a_i4x32, 4), nibble_mask_u8x16);
1501
- __m128i b_lo_u8x16 = _mm_and_si128(b_i4x32, nibble_mask_u8x16);
1502
- __m128i b_hi_u8x16 = _mm_and_si128(_mm_srli_epi16(b_i4x32, 4), nibble_mask_u8x16);
1532
+ __m128i a_low_u8x16 = _mm_and_si128(a_i4x32, nibble_mask_u8x16);
1533
+ __m128i a_high_u8x16 = _mm_and_si128(_mm_srli_epi16(a_i4x32, 4), nibble_mask_u8x16);
1534
+ __m128i b_low_u8x16 = _mm_and_si128(b_i4x32, nibble_mask_u8x16);
1535
+ __m128i b_high_u8x16 = _mm_and_si128(_mm_srli_epi16(b_i4x32, 4), nibble_mask_u8x16);
1503
1536
 
1504
1537
  // XOR with 8 for algebraic transformation
1505
- __m128i c_lo_u8x16 = _mm_xor_si128(a_lo_u8x16, xor_mask_u8x16);
1506
- __m128i c_hi_u8x16 = _mm_xor_si128(a_hi_u8x16, xor_mask_u8x16);
1507
- __m128i d_lo_u8x16 = _mm_xor_si128(b_lo_u8x16, xor_mask_u8x16);
1508
- __m128i d_hi_u8x16 = _mm_xor_si128(b_hi_u8x16, xor_mask_u8x16);
1538
+ __m128i c_low_u8x16 = _mm_xor_si128(a_low_u8x16, xor_mask_u8x16);
1539
+ __m128i c_high_u8x16 = _mm_xor_si128(a_high_u8x16, xor_mask_u8x16);
1540
+ __m128i d_low_u8x16 = _mm_xor_si128(b_low_u8x16, xor_mask_u8x16);
1541
+ __m128i d_high_u8x16 = _mm_xor_si128(b_high_u8x16, xor_mask_u8x16);
1509
1542
 
1510
1543
  // Widen u8 to i16 and multiply using MADD
1511
- __m256i c_lo_i16x16 = _mm256_cvtepu8_epi16(c_lo_u8x16);
1512
- __m256i c_hi_i16x16 = _mm256_cvtepu8_epi16(c_hi_u8x16);
1513
- __m256i d_lo_i16x16 = _mm256_cvtepu8_epi16(d_lo_u8x16);
1514
- __m256i d_hi_i16x16 = _mm256_cvtepu8_epi16(d_hi_u8x16);
1544
+ __m256i c_low_i16x16 = _mm256_cvtepu8_epi16(c_low_u8x16);
1545
+ __m256i c_high_i16x16 = _mm256_cvtepu8_epi16(c_high_u8x16);
1546
+ __m256i d_low_i16x16 = _mm256_cvtepu8_epi16(d_low_u8x16);
1547
+ __m256i d_high_i16x16 = _mm256_cvtepu8_epi16(d_high_u8x16);
1515
1548
 
1516
1549
  // Multiply and accumulate (no SAD — correction deferred to finalize)
1517
1550
  state->biased_product_sum_i32x8 = _mm256_add_epi32(state->biased_product_sum_i32x8,
1518
- _mm256_madd_epi16(c_lo_i16x16, d_lo_i16x16));
1551
+ _mm256_madd_epi16(c_low_i16x16, d_low_i16x16));
1519
1552
  state->biased_product_sum_i32x8 = _mm256_add_epi32(state->biased_product_sum_i32x8,
1520
- _mm256_madd_epi16(c_hi_i16x16, d_hi_i16x16));
1553
+ _mm256_madd_epi16(c_high_i16x16, d_high_i16x16));
1521
1554
  }
1522
1555
 
1523
1556
  NK_INTERNAL void nk_dot_i4x32_finalize_haswell( //
@@ -1585,20 +1618,22 @@ NK_INTERNAL void nk_dot_u4x32_update_haswell(nk_dot_u4x32_state_haswell_t *state
1585
1618
  __m128i b_u4x32 = b.xmm;
1586
1619
 
1587
1620
  // Extract low and high nibbles
1588
- __m128i a_lo_u8x16 = _mm_and_si128(a_u4x32, nibble_mask_u8x16);
1589
- __m128i a_hi_u8x16 = _mm_and_si128(_mm_srli_epi16(a_u4x32, 4), nibble_mask_u8x16);
1590
- __m128i b_lo_u8x16 = _mm_and_si128(b_u4x32, nibble_mask_u8x16);
1591
- __m128i b_hi_u8x16 = _mm_and_si128(_mm_srli_epi16(b_u4x32, 4), nibble_mask_u8x16);
1621
+ __m128i a_low_u8x16 = _mm_and_si128(a_u4x32, nibble_mask_u8x16);
1622
+ __m128i a_high_u8x16 = _mm_and_si128(_mm_srli_epi16(a_u4x32, 4), nibble_mask_u8x16);
1623
+ __m128i b_low_u8x16 = _mm_and_si128(b_u4x32, nibble_mask_u8x16);
1624
+ __m128i b_high_u8x16 = _mm_and_si128(_mm_srli_epi16(b_u4x32, 4), nibble_mask_u8x16);
1592
1625
 
1593
1626
  // Widen u8 to i16
1594
- __m256i a_lo_i16x16 = _mm256_cvtepu8_epi16(a_lo_u8x16);
1595
- __m256i a_hi_i16x16 = _mm256_cvtepu8_epi16(a_hi_u8x16);
1596
- __m256i b_lo_i16x16 = _mm256_cvtepu8_epi16(b_lo_u8x16);
1597
- __m256i b_hi_i16x16 = _mm256_cvtepu8_epi16(b_hi_u8x16);
1627
+ __m256i a_low_i16x16 = _mm256_cvtepu8_epi16(a_low_u8x16);
1628
+ __m256i a_high_i16x16 = _mm256_cvtepu8_epi16(a_high_u8x16);
1629
+ __m256i b_low_i16x16 = _mm256_cvtepu8_epi16(b_low_u8x16);
1630
+ __m256i b_high_i16x16 = _mm256_cvtepu8_epi16(b_high_u8x16);
1598
1631
 
1599
1632
  // Multiply and accumulate
1600
- state->product_sum_i32x8 = _mm256_add_epi32(state->product_sum_i32x8, _mm256_madd_epi16(a_lo_i16x16, b_lo_i16x16));
1601
- state->product_sum_i32x8 = _mm256_add_epi32(state->product_sum_i32x8, _mm256_madd_epi16(a_hi_i16x16, b_hi_i16x16));
1633
+ state->product_sum_i32x8 = _mm256_add_epi32(state->product_sum_i32x8,
1634
+ _mm256_madd_epi16(a_low_i16x16, b_low_i16x16));
1635
+ state->product_sum_i32x8 = _mm256_add_epi32(state->product_sum_i32x8,
1636
+ _mm256_madd_epi16(a_high_i16x16, b_high_i16x16));
1602
1637
  }
1603
1638
 
1604
1639
  NK_INTERNAL void nk_dot_u4x32_finalize_haswell( //
@@ -1619,23 +1654,23 @@ NK_INTERNAL void nk_dot_u4x32_finalize_haswell(
1619
1654
  _mm256_extracti128_si256(state_d->product_sum_i32x8, 1));
1620
1655
 
1621
1656
  // 4-way transpose to get [a,b,c,d] in lanes
1622
- __m128i transpose_ab_low = _mm_unpacklo_epi32(product_a_i32x4, product_b_i32x4);
1623
- __m128i transpose_cd_low = _mm_unpacklo_epi32(product_c_i32x4, product_d_i32x4);
1624
- __m128i transpose_ab_high = _mm_unpackhi_epi32(product_a_i32x4, product_b_i32x4);
1625
- __m128i transpose_cd_high = _mm_unpackhi_epi32(product_c_i32x4, product_d_i32x4);
1626
- __m128i product_lane0 = _mm_unpacklo_epi64(transpose_ab_low, transpose_cd_low);
1627
- __m128i product_lane1 = _mm_unpackhi_epi64(transpose_ab_low, transpose_cd_low);
1628
- __m128i product_lane2 = _mm_unpacklo_epi64(transpose_ab_high, transpose_cd_high);
1629
- __m128i product_lane3 = _mm_unpackhi_epi64(transpose_ab_high, transpose_cd_high);
1657
+ __m128i transpose_ab_low_i32x4 = _mm_unpacklo_epi32(product_a_i32x4, product_b_i32x4);
1658
+ __m128i transpose_cd_low_i32x4 = _mm_unpacklo_epi32(product_c_i32x4, product_d_i32x4);
1659
+ __m128i transpose_ab_high_i32x4 = _mm_unpackhi_epi32(product_a_i32x4, product_b_i32x4);
1660
+ __m128i transpose_cd_high_i32x4 = _mm_unpackhi_epi32(product_c_i32x4, product_d_i32x4);
1661
+ __m128i product_lane0_i32x4 = _mm_unpacklo_epi64(transpose_ab_low_i32x4, transpose_cd_low_i32x4);
1662
+ __m128i product_lane1_i32x4 = _mm_unpackhi_epi64(transpose_ab_low_i32x4, transpose_cd_low_i32x4);
1663
+ __m128i product_lane2_i32x4 = _mm_unpacklo_epi64(transpose_ab_high_i32x4, transpose_cd_high_i32x4);
1664
+ __m128i product_lane3_i32x4 = _mm_unpackhi_epi64(transpose_ab_high_i32x4, transpose_cd_high_i32x4);
1630
1665
 
1631
1666
  // Sum product lanes
1632
- result->xmm = _mm_add_epi32(_mm_add_epi32(product_lane0, product_lane1),
1633
- _mm_add_epi32(product_lane2, product_lane3));
1667
+ result->xmm = _mm_add_epi32(_mm_add_epi32(product_lane0_i32x4, product_lane1_i32x4),
1668
+ _mm_add_epi32(product_lane2_i32x4, product_lane3_i32x4));
1634
1669
  }
1635
1670
 
1636
- #pragma endregion - Small Integers
1671
+ #pragma endregion I8 and U8 Integers
1637
1672
 
1638
- #pragma region - Binary
1673
+ #pragma region Binary
1639
1674
 
1640
1675
  NK_PUBLIC void nk_dot_u1_haswell(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n_bits, nk_u32_t *result) {
1641
1676
  nk_size_t n_bytes = nk_size_divide_round_up_(n_bits, NK_BITS_PER_BYTE);
@@ -1671,7 +1706,7 @@ NK_INTERNAL void nk_dot_u1x128_finalize_haswell( //
1671
1706
  result->u32s[3] = state_d->dot_count;
1672
1707
  }
1673
1708
 
1674
- #pragma endregion - Binary
1709
+ #pragma endregion Binary
1675
1710
 
1676
1711
  #if defined(__clang__)
1677
1712
  #pragma clang attribute pop