numkong 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +495 -0
- package/binding.gyp +540 -0
- package/c/dispatch.h +512 -0
- package/c/dispatch_bf16.c +389 -0
- package/c/dispatch_bf16c.c +52 -0
- package/c/dispatch_e2m3.c +263 -0
- package/c/dispatch_e3m2.c +243 -0
- package/c/dispatch_e4m3.c +276 -0
- package/c/dispatch_e5m2.c +272 -0
- package/c/dispatch_f16.c +376 -0
- package/c/dispatch_f16c.c +58 -0
- package/c/dispatch_f32.c +378 -0
- package/c/dispatch_f32c.c +99 -0
- package/c/dispatch_f64.c +296 -0
- package/c/dispatch_f64c.c +98 -0
- package/c/dispatch_i16.c +96 -0
- package/c/dispatch_i32.c +89 -0
- package/c/dispatch_i4.c +150 -0
- package/c/dispatch_i64.c +86 -0
- package/c/dispatch_i8.c +289 -0
- package/c/dispatch_other.c +330 -0
- package/c/dispatch_u1.c +148 -0
- package/c/dispatch_u16.c +124 -0
- package/c/dispatch_u32.c +118 -0
- package/c/dispatch_u4.c +150 -0
- package/c/dispatch_u64.c +102 -0
- package/c/dispatch_u8.c +303 -0
- package/c/numkong.c +950 -0
- package/include/README.md +573 -0
- package/include/module.modulemap +129 -0
- package/include/numkong/attention/sapphireamx.h +1361 -0
- package/include/numkong/attention/sme.h +2066 -0
- package/include/numkong/attention.h +49 -0
- package/include/numkong/capabilities.h +748 -0
- package/include/numkong/cast/README.md +262 -0
- package/include/numkong/cast/haswell.h +975 -0
- package/include/numkong/cast/icelake.h +470 -0
- package/include/numkong/cast/neon.h +1192 -0
- package/include/numkong/cast/rvv.h +1021 -0
- package/include/numkong/cast/sapphire.h +262 -0
- package/include/numkong/cast/serial.h +2262 -0
- package/include/numkong/cast/skylake.h +856 -0
- package/include/numkong/cast/v128relaxed.h +180 -0
- package/include/numkong/cast.h +230 -0
- package/include/numkong/curved/README.md +223 -0
- package/include/numkong/curved/genoa.h +182 -0
- package/include/numkong/curved/haswell.h +276 -0
- package/include/numkong/curved/neon.h +205 -0
- package/include/numkong/curved/neonbfdot.h +212 -0
- package/include/numkong/curved/neonhalf.h +212 -0
- package/include/numkong/curved/rvv.h +305 -0
- package/include/numkong/curved/serial.h +207 -0
- package/include/numkong/curved/skylake.h +457 -0
- package/include/numkong/curved/smef64.h +506 -0
- package/include/numkong/curved.h +517 -0
- package/include/numkong/curved.hpp +144 -0
- package/include/numkong/dot/README.md +425 -0
- package/include/numkong/dot/alder.h +563 -0
- package/include/numkong/dot/genoa.h +315 -0
- package/include/numkong/dot/haswell.h +1688 -0
- package/include/numkong/dot/icelake.h +883 -0
- package/include/numkong/dot/neon.h +818 -0
- package/include/numkong/dot/neonbfdot.h +244 -0
- package/include/numkong/dot/neonfhm.h +360 -0
- package/include/numkong/dot/neonhalf.h +198 -0
- package/include/numkong/dot/neonsdot.h +508 -0
- package/include/numkong/dot/rvv.h +714 -0
- package/include/numkong/dot/rvvbb.h +72 -0
- package/include/numkong/dot/rvvbf16.h +123 -0
- package/include/numkong/dot/rvvhalf.h +129 -0
- package/include/numkong/dot/sapphire.h +141 -0
- package/include/numkong/dot/serial.h +838 -0
- package/include/numkong/dot/sierra.h +405 -0
- package/include/numkong/dot/skylake.h +1084 -0
- package/include/numkong/dot/sve.h +379 -0
- package/include/numkong/dot/svebfdot.h +74 -0
- package/include/numkong/dot/svehalf.h +123 -0
- package/include/numkong/dot/v128relaxed.h +1258 -0
- package/include/numkong/dot.h +1070 -0
- package/include/numkong/dot.hpp +94 -0
- package/include/numkong/dots/README.md +496 -0
- package/include/numkong/dots/alder.h +114 -0
- package/include/numkong/dots/genoa.h +94 -0
- package/include/numkong/dots/haswell.h +295 -0
- package/include/numkong/dots/icelake.h +171 -0
- package/include/numkong/dots/neon.h +120 -0
- package/include/numkong/dots/neonbfdot.h +58 -0
- package/include/numkong/dots/neonfhm.h +94 -0
- package/include/numkong/dots/neonhalf.h +57 -0
- package/include/numkong/dots/neonsdot.h +108 -0
- package/include/numkong/dots/rvv.h +2486 -0
- package/include/numkong/dots/sapphireamx.h +3973 -0
- package/include/numkong/dots/serial.h +2844 -0
- package/include/numkong/dots/sierra.h +97 -0
- package/include/numkong/dots/skylake.h +196 -0
- package/include/numkong/dots/sme.h +5372 -0
- package/include/numkong/dots/smebi32.h +461 -0
- package/include/numkong/dots/smef64.h +1318 -0
- package/include/numkong/dots/smehalf.h +47 -0
- package/include/numkong/dots/v128relaxed.h +294 -0
- package/include/numkong/dots.h +2804 -0
- package/include/numkong/dots.hpp +639 -0
- package/include/numkong/each/README.md +469 -0
- package/include/numkong/each/haswell.h +1658 -0
- package/include/numkong/each/icelake.h +272 -0
- package/include/numkong/each/neon.h +1104 -0
- package/include/numkong/each/neonbfdot.h +212 -0
- package/include/numkong/each/neonhalf.h +410 -0
- package/include/numkong/each/rvv.h +1121 -0
- package/include/numkong/each/sapphire.h +477 -0
- package/include/numkong/each/serial.h +260 -0
- package/include/numkong/each/skylake.h +1562 -0
- package/include/numkong/each.h +2146 -0
- package/include/numkong/each.hpp +434 -0
- package/include/numkong/geospatial/README.md +147 -0
- package/include/numkong/geospatial/haswell.h +593 -0
- package/include/numkong/geospatial/neon.h +571 -0
- package/include/numkong/geospatial/rvv.h +701 -0
- package/include/numkong/geospatial/serial.h +309 -0
- package/include/numkong/geospatial/skylake.h +577 -0
- package/include/numkong/geospatial/v128relaxed.h +613 -0
- package/include/numkong/geospatial.h +453 -0
- package/include/numkong/geospatial.hpp +235 -0
- package/include/numkong/matrix.hpp +336 -0
- package/include/numkong/maxsim/README.md +187 -0
- package/include/numkong/maxsim/alder.h +511 -0
- package/include/numkong/maxsim/genoa.h +115 -0
- package/include/numkong/maxsim/haswell.h +553 -0
- package/include/numkong/maxsim/icelake.h +480 -0
- package/include/numkong/maxsim/neonsdot.h +394 -0
- package/include/numkong/maxsim/sapphireamx.h +877 -0
- package/include/numkong/maxsim/serial.h +490 -0
- package/include/numkong/maxsim/sme.h +929 -0
- package/include/numkong/maxsim/v128relaxed.h +280 -0
- package/include/numkong/maxsim.h +571 -0
- package/include/numkong/maxsim.hpp +133 -0
- package/include/numkong/mesh/README.md +227 -0
- package/include/numkong/mesh/haswell.h +2235 -0
- package/include/numkong/mesh/neon.h +1329 -0
- package/include/numkong/mesh/neonbfdot.h +842 -0
- package/include/numkong/mesh/neonhalf.h +616 -0
- package/include/numkong/mesh/rvv.h +916 -0
- package/include/numkong/mesh/serial.h +742 -0
- package/include/numkong/mesh/skylake.h +1135 -0
- package/include/numkong/mesh/v128relaxed.h +1052 -0
- package/include/numkong/mesh.h +652 -0
- package/include/numkong/mesh.hpp +762 -0
- package/include/numkong/numkong.h +78 -0
- package/include/numkong/numkong.hpp +57 -0
- package/include/numkong/probability/README.md +173 -0
- package/include/numkong/probability/haswell.h +267 -0
- package/include/numkong/probability/neon.h +225 -0
- package/include/numkong/probability/rvv.h +409 -0
- package/include/numkong/probability/serial.h +169 -0
- package/include/numkong/probability/skylake.h +324 -0
- package/include/numkong/probability.h +383 -0
- package/include/numkong/probability.hpp +120 -0
- package/include/numkong/random.h +50 -0
- package/include/numkong/random.hpp +285 -0
- package/include/numkong/reduce/README.md +547 -0
- package/include/numkong/reduce/alder.h +632 -0
- package/include/numkong/reduce/genoa.h +201 -0
- package/include/numkong/reduce/haswell.h +3783 -0
- package/include/numkong/reduce/icelake.h +549 -0
- package/include/numkong/reduce/neon.h +3841 -0
- package/include/numkong/reduce/neonbfdot.h +353 -0
- package/include/numkong/reduce/neonfhm.h +665 -0
- package/include/numkong/reduce/neonhalf.h +157 -0
- package/include/numkong/reduce/neonsdot.h +357 -0
- package/include/numkong/reduce/rvv.h +3407 -0
- package/include/numkong/reduce/serial.h +757 -0
- package/include/numkong/reduce/sierra.h +338 -0
- package/include/numkong/reduce/skylake.h +3792 -0
- package/include/numkong/reduce/v128relaxed.h +2302 -0
- package/include/numkong/reduce.h +1597 -0
- package/include/numkong/reduce.hpp +633 -0
- package/include/numkong/scalar/README.md +89 -0
- package/include/numkong/scalar/haswell.h +113 -0
- package/include/numkong/scalar/neon.h +122 -0
- package/include/numkong/scalar/neonhalf.h +70 -0
- package/include/numkong/scalar/rvv.h +211 -0
- package/include/numkong/scalar/sapphire.h +63 -0
- package/include/numkong/scalar/serial.h +332 -0
- package/include/numkong/scalar/v128relaxed.h +56 -0
- package/include/numkong/scalar.h +683 -0
- package/include/numkong/set/README.md +179 -0
- package/include/numkong/set/haswell.h +334 -0
- package/include/numkong/set/icelake.h +485 -0
- package/include/numkong/set/neon.h +364 -0
- package/include/numkong/set/rvv.h +226 -0
- package/include/numkong/set/rvvbb.h +117 -0
- package/include/numkong/set/serial.h +174 -0
- package/include/numkong/set/sve.h +185 -0
- package/include/numkong/set/v128relaxed.h +240 -0
- package/include/numkong/set.h +457 -0
- package/include/numkong/set.hpp +114 -0
- package/include/numkong/sets/README.md +149 -0
- package/include/numkong/sets/haswell.h +63 -0
- package/include/numkong/sets/icelake.h +66 -0
- package/include/numkong/sets/neon.h +61 -0
- package/include/numkong/sets/serial.h +43 -0
- package/include/numkong/sets/smebi32.h +1099 -0
- package/include/numkong/sets/v128relaxed.h +58 -0
- package/include/numkong/sets.h +339 -0
- package/include/numkong/sparse/README.md +156 -0
- package/include/numkong/sparse/icelake.h +463 -0
- package/include/numkong/sparse/neon.h +288 -0
- package/include/numkong/sparse/serial.h +117 -0
- package/include/numkong/sparse/sve2.h +507 -0
- package/include/numkong/sparse/turin.h +322 -0
- package/include/numkong/sparse.h +363 -0
- package/include/numkong/sparse.hpp +113 -0
- package/include/numkong/spatial/README.md +435 -0
- package/include/numkong/spatial/alder.h +607 -0
- package/include/numkong/spatial/genoa.h +290 -0
- package/include/numkong/spatial/haswell.h +960 -0
- package/include/numkong/spatial/icelake.h +586 -0
- package/include/numkong/spatial/neon.h +773 -0
- package/include/numkong/spatial/neonbfdot.h +165 -0
- package/include/numkong/spatial/neonhalf.h +118 -0
- package/include/numkong/spatial/neonsdot.h +261 -0
- package/include/numkong/spatial/rvv.h +984 -0
- package/include/numkong/spatial/rvvbf16.h +123 -0
- package/include/numkong/spatial/rvvhalf.h +117 -0
- package/include/numkong/spatial/sapphire.h +343 -0
- package/include/numkong/spatial/serial.h +346 -0
- package/include/numkong/spatial/sierra.h +323 -0
- package/include/numkong/spatial/skylake.h +606 -0
- package/include/numkong/spatial/sve.h +224 -0
- package/include/numkong/spatial/svebfdot.h +122 -0
- package/include/numkong/spatial/svehalf.h +109 -0
- package/include/numkong/spatial/v128relaxed.h +717 -0
- package/include/numkong/spatial.h +1425 -0
- package/include/numkong/spatial.hpp +183 -0
- package/include/numkong/spatials/README.md +580 -0
- package/include/numkong/spatials/alder.h +94 -0
- package/include/numkong/spatials/genoa.h +94 -0
- package/include/numkong/spatials/haswell.h +219 -0
- package/include/numkong/spatials/icelake.h +113 -0
- package/include/numkong/spatials/neon.h +109 -0
- package/include/numkong/spatials/neonbfdot.h +60 -0
- package/include/numkong/spatials/neonfhm.h +92 -0
- package/include/numkong/spatials/neonhalf.h +58 -0
- package/include/numkong/spatials/neonsdot.h +109 -0
- package/include/numkong/spatials/rvv.h +1960 -0
- package/include/numkong/spatials/sapphireamx.h +1149 -0
- package/include/numkong/spatials/serial.h +226 -0
- package/include/numkong/spatials/sierra.h +96 -0
- package/include/numkong/spatials/skylake.h +184 -0
- package/include/numkong/spatials/sme.h +1901 -0
- package/include/numkong/spatials/smef64.h +465 -0
- package/include/numkong/spatials/v128relaxed.h +240 -0
- package/include/numkong/spatials.h +3021 -0
- package/include/numkong/spatials.hpp +508 -0
- package/include/numkong/tensor.hpp +1592 -0
- package/include/numkong/trigonometry/README.md +184 -0
- package/include/numkong/trigonometry/haswell.h +652 -0
- package/include/numkong/trigonometry/neon.h +639 -0
- package/include/numkong/trigonometry/rvv.h +699 -0
- package/include/numkong/trigonometry/serial.h +703 -0
- package/include/numkong/trigonometry/skylake.h +721 -0
- package/include/numkong/trigonometry/v128relaxed.h +666 -0
- package/include/numkong/trigonometry.h +467 -0
- package/include/numkong/trigonometry.hpp +166 -0
- package/include/numkong/types.h +1384 -0
- package/include/numkong/types.hpp +5603 -0
- package/include/numkong/vector.hpp +698 -0
- package/javascript/README.md +246 -0
- package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
- package/javascript/dist/cjs/numkong-wasm.js +617 -0
- package/javascript/dist/cjs/numkong.d.ts +343 -0
- package/javascript/dist/cjs/numkong.js +523 -0
- package/javascript/dist/cjs/package.json +3 -0
- package/javascript/dist/cjs/types.d.ts +284 -0
- package/javascript/dist/cjs/types.js +653 -0
- package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
- package/javascript/dist/esm/numkong-wasm.js +595 -0
- package/javascript/dist/esm/numkong.d.ts +343 -0
- package/javascript/dist/esm/numkong.js +452 -0
- package/javascript/dist/esm/package.json +3 -0
- package/javascript/dist/esm/types.d.ts +284 -0
- package/javascript/dist/esm/types.js +630 -0
- package/javascript/dist-package-cjs.json +3 -0
- package/javascript/dist-package-esm.json +3 -0
- package/javascript/node-gyp-build.d.ts +1 -0
- package/javascript/numkong-wasm.ts +756 -0
- package/javascript/numkong.c +689 -0
- package/javascript/numkong.ts +575 -0
- package/javascript/tsconfig-base.json +39 -0
- package/javascript/tsconfig-cjs.json +8 -0
- package/javascript/tsconfig-esm.json +8 -0
- package/javascript/types.ts +674 -0
- package/package.json +87 -0
|
@@ -0,0 +1,1084 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief SIMD-accelerated Dot Products for Skylake.
|
|
3
|
+
* @file include/numkong/dot/skylake.h
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date December 27, 2025
|
|
6
|
+
*
|
|
7
|
+
* @sa include/numkong/dot.h
|
|
8
|
+
*
|
|
9
|
+
* @section dot_skylake_instructions Key AVX-512 Instructions
|
|
10
|
+
*
|
|
11
|
+
* Intrinsic Instruction Latency Throughput Ports
|
|
12
|
+
* _mm512_madd_epi16 VPMADDWD (ZMM, ZMM, ZMM) 5cy 0.5/cy p05
|
|
13
|
+
* _mm512_add_epi32 VPADDD (ZMM, ZMM, ZMM) 1cy 0.5/cy p05
|
|
14
|
+
* _mm512_fmadd_ps VFMADD132PS (ZMM, ZMM, ZMM) 4cy 0.5/cy p05
|
|
15
|
+
* _mm512_cvtepi8_epi16 VPMOVSXBW (ZMM, YMM) 3cy 1/cy p5
|
|
16
|
+
*
|
|
17
|
+
* Skylake-X server chips feature dual 512-bit FMA units on ports 0 and 5, enabling 0.5cy throughput for
|
|
18
|
+
* VFMADD and arithmetic operations. Client Skylake variants have only one FMA unit with 1cy throughput.
|
|
19
|
+
* Without VNNI support, integer dot products use VPMADDWD for i16 pair multiplication with i32 accumulation.
|
|
20
|
+
*
|
|
21
|
+
* @section dot_skylake_stateful Stateful Streaming Logic
|
|
22
|
+
*
|
|
23
|
+
 * To build memory-optimal tiled algorithms, this file defines the following structures and force-inlined
|
|
24
|
+
* `NK_INTERNAL` functions:
|
|
25
|
+
*
|
|
26
|
+
* - nk_dot_f64x8 state with Dot2 stable dot-products,
|
|
27
|
+
* - nk_dot_f32x8 state with double-precision numerics,
|
|
28
|
+
* - nk_dot_through_f32 state for 16-bit float inputs with single-precision numerics.
|
|
29
|
+
*
|
|
30
|
+
* @code{c}
|
|
31
|
+
* nk_dot_f64x8_state_skylake_t state_first, state_second, state_third, state_fourth;
|
|
32
|
+
* nk_b512_vec_t query_f64x8, target_first_f64x8, target_second_f64x8, target_third_f64x8, target_fourth_f64x8;
|
|
33
|
+
* nk_dot_f64x8_init_skylake(&state_first);
|
|
34
|
+
* nk_dot_f64x8_init_skylake(&state_second);
|
|
35
|
+
* nk_dot_f64x8_init_skylake(&state_third);
|
|
36
|
+
* nk_dot_f64x8_init_skylake(&state_fourth);
|
|
37
|
+
* for (nk_size_t idx = 0; idx + 8 <= depth; idx += 8) {
|
|
38
|
+
* query_f64x8.zmm_pd = _mm512_loadu_pd(query_ptr + idx);
|
|
39
|
+
* target_first_f64x8.zmm_pd = _mm512_loadu_pd(target_first_ptr + idx);
|
|
40
|
+
* target_second_f64x8.zmm_pd = _mm512_loadu_pd(target_second_ptr + idx);
|
|
41
|
+
* target_third_f64x8.zmm_pd = _mm512_loadu_pd(target_third_ptr + idx);
|
|
42
|
+
* target_fourth_f64x8.zmm_pd = _mm512_loadu_pd(target_fourth_ptr + idx);
|
|
43
|
+
* nk_dot_f64x8_update_skylake(&state_first, query_f64x8, target_first_f64x8, idx, 8);
|
|
44
|
+
* nk_dot_f64x8_update_skylake(&state_second, query_f64x8, target_second_f64x8, idx, 8);
|
|
45
|
+
* nk_dot_f64x8_update_skylake(&state_third, query_f64x8, target_third_f64x8, idx, 8);
|
|
46
|
+
* nk_dot_f64x8_update_skylake(&state_fourth, query_f64x8, target_fourth_f64x8, idx, 8);
|
|
47
|
+
* }
|
|
48
|
+
* nk_b256_vec_t results_f64x4;
|
|
49
|
+
* nk_dot_f64x8_finalize_skylake(&state_first, &state_second, &state_third, &state_fourth, depth, &results_f64x4);
|
|
50
|
+
* @endcode
|
|
51
|
+
*
|
|
52
|
+
* Smaller float types like f16 and bf16 on Skylake use ISA-specific upcasting to f32 combined with native
|
|
53
|
+
* FMA instructions, sharing the `nk_dot_through_f32` accumulation logic:
|
|
54
|
+
*
|
|
55
|
+
* @code{c}
|
|
56
|
+
* nk_dot_f16x16_state_skylake_t state_first, state_second, state_third, state_fourth;
|
|
57
|
+
* nk_b512_vec_t query_f32x16, target_first_f32x16, target_second_f32x16, target_third_f32x16, target_fourth_f32x16;
|
|
58
|
+
* nk_dot_through_f32_init_skylake_(&state_first);
|
|
59
|
+
* nk_dot_through_f32_init_skylake_(&state_second);
|
|
60
|
+
* nk_dot_through_f32_init_skylake_(&state_third);
|
|
61
|
+
* nk_dot_through_f32_init_skylake_(&state_fourth);
|
|
62
|
+
* for (nk_size_t idx = 0; idx + 16 <= depth; idx += 16) {
|
|
63
|
+
* nk_load_f16x16_to_f32x16_skylake_(query_ptr + idx, &query_f32x16);
|
|
64
|
+
* nk_load_f16x16_to_f32x16_skylake_(target_first_ptr + idx, &target_first_f32x16);
|
|
65
|
+
* nk_load_f16x16_to_f32x16_skylake_(target_second_ptr + idx, &target_second_f32x16);
|
|
66
|
+
* nk_load_f16x16_to_f32x16_skylake_(target_third_ptr + idx, &target_third_f32x16);
|
|
67
|
+
* nk_load_f16x16_to_f32x16_skylake_(target_fourth_ptr + idx, &target_fourth_f32x16);
|
|
68
|
+
* nk_dot_through_f32_update_skylake_(&state_first, query_f32x16, target_first_f32x16, idx, 16);
|
|
69
|
+
* nk_dot_through_f32_update_skylake_(&state_second, query_f32x16, target_second_f32x16, idx, 16);
|
|
70
|
+
* nk_dot_through_f32_update_skylake_(&state_third, query_f32x16, target_third_f32x16, idx, 16);
|
|
71
|
+
* nk_dot_through_f32_update_skylake_(&state_fourth, query_f32x16, target_fourth_f32x16, idx, 16);
|
|
72
|
+
* }
|
|
73
|
+
* nk_b128_vec_t results_f32x4;
|
|
74
|
+
* nk_dot_through_f32_finalize_skylake_(&state_first, &state_second, &state_third, &state_fourth,
|
|
75
|
+
* depth, &results_f32x4);
|
|
76
|
+
* @endcode
|
|
77
|
+
*/
|
|
78
|
+
#ifndef NK_DOT_SKYLAKE_H
|
|
79
|
+
#define NK_DOT_SKYLAKE_H
|
|
80
|
+
|
|
81
|
+
#if NK_TARGET_X86_
|
|
82
|
+
#if NK_TARGET_SKYLAKE
|
|
83
|
+
|
|
84
|
+
#include "numkong/cast/skylake.h" // `nk_bf16x16_to_f32x16_skylake_`
|
|
85
|
+
#include "numkong/reduce/skylake.h" // `nk_reduce_add_f32x16_skylake_`
|
|
86
|
+
#include "numkong/dot/haswell.h" // `nk_dot_stable_sum_f64x4_haswell_`
|
|
87
|
+
|
|
88
|
+
#if defined(__cplusplus)
|
|
89
|
+
extern "C" {
|
|
90
|
+
#endif
|
|
91
|
+
|
|
92
|
+
#if defined(__clang__)
|
|
93
|
+
#pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,avx512bw,avx512dq,f16c,fma,bmi,bmi2"))), \
|
|
94
|
+
apply_to = function)
|
|
95
|
+
#elif defined(__GNUC__)
|
|
96
|
+
#pragma GCC push_options
|
|
97
|
+
#pragma GCC target("avx2", "avx512f", "avx512vl", "avx512bw", "avx512dq", "f16c", "fma", "bmi", "bmi2")
|
|
98
|
+
#endif
|
|
99
|
+
|
|
100
|
+
/**
 *  @brief Compensated horizontal sum of 8 f64 lanes via TwoSum tree reduction.
 *
 *  Merges a running sum vector with its error-compensation vector using Knuth's
 *  branch-free TwoSum (each add also recovers its exact rounding error), halves
 *  the width from 8 to 4 lanes the same way, and hands the final 4-to-1 stage to
 *  the Haswell helper. The exact order of add/sub operations below is what makes
 *  the error recovery exact — do not reassociate.
 *
 *  @param sum_f64x8           Per-lane partial sums.
 *  @param compensation_f64x8  Per-lane accumulated rounding errors (e.g. from Dot2).
 *  @return Scalar sum with the compensation folded in.
 */
NK_INTERNAL nk_f64_t nk_dot_stable_sum_f64x8_skylake_(__m512d sum_f64x8, __m512d compensation_f64x8) {
    // Stage 0: TwoSum merge of sum + compensation (8-wide).
    // `tentative` is the rounded sum; the next two lines reconstruct the exact
    // per-lane rounding error via the classic (a - (s - v)) + (b - v) identity.
    __m512d tentative_sum_f64x8 = _mm512_add_pd(sum_f64x8, compensation_f64x8);
    __m512d virtual_addend_f64x8 = _mm512_sub_pd(tentative_sum_f64x8, sum_f64x8);
    __m512d rounding_error_f64x8 = _mm512_add_pd(
        _mm512_sub_pd(sum_f64x8, _mm512_sub_pd(tentative_sum_f64x8, virtual_addend_f64x8)),
        _mm512_sub_pd(compensation_f64x8, virtual_addend_f64x8));

    // Stage 1: TwoSum halving 8→4 — fold the upper 256-bit half onto the lower,
    // again capturing the rounding error of the fold.
    __m256d lower_sum_f64x4 = _mm512_castpd512_pd256(tentative_sum_f64x8);
    __m256d upper_sum_f64x4 = _mm512_extractf64x4_pd(tentative_sum_f64x8, 1);
    __m256d tentative_sum_f64x4 = _mm256_add_pd(lower_sum_f64x4, upper_sum_f64x4);
    __m256d virtual_addend_f64x4 = _mm256_sub_pd(tentative_sum_f64x4, lower_sum_f64x4);
    __m256d rounding_error_f64x4 = _mm256_add_pd(
        _mm256_sub_pd(lower_sum_f64x4, _mm256_sub_pd(tentative_sum_f64x4, virtual_addend_f64x4)),
        _mm256_sub_pd(upper_sum_f64x4, virtual_addend_f64x4));
    // Carry the stage-0 errors down to 4 lanes with plain adds (errors are tiny;
    // second-order error of the error term is negligible).
    __m256d lower_error_f64x4 = _mm512_castpd512_pd256(rounding_error_f64x8);
    __m256d upper_error_f64x4 = _mm512_extractf64x4_pd(rounding_error_f64x8, 1);
    __m256d accumulated_error_f64x4 = _mm256_add_pd(_mm256_add_pd(lower_error_f64x4, upper_error_f64x4),
                                                    rounding_error_f64x4);

    // Stages 2-3: Delegate to Haswell for 4→2→1 reduction.
    return nk_dot_stable_sum_f64x4_haswell_(tentative_sum_f64x4, accumulated_error_f64x4);
}
|
|
125
|
+
|
|
126
|
+
#pragma region - Traditional Floats
|
|
127
|
+
|
|
128
|
+
/**
 *  @brief Internal helper state for dot-products of low-precision types, where 32-bit accumulation is enough.
 *  @sa nk_dot_f16x16_state_skylake_t, nk_dot_bf16x16_state_skylake_t
 *  @sa nk_dot_e4m3x16_state_skylake_t, nk_dot_e5m2x16_state_skylake_t
 */
typedef struct nk_dot_through_f32_state_skylake_t_ {
    __m512 sum_f32x16; // 16 running f32 partial sums, one per ZMM lane; reduced at finalize.
} nk_dot_through_f32_state_skylake_t_;
|
|
136
|
+
|
|
137
|
+
/**
 *  @brief Initializes 32-bit accumulators for low-precision dot-products.
 *  @param state Accumulator state to zero; must be non-NULL.
 *  @sa nk_dot_f16x16_init_skylake, nk_dot_bf16x16_init_skylake
 *  @sa nk_dot_e4m3x16_init_skylake, nk_dot_e5m2x16_init_skylake
 */
NK_INTERNAL void nk_dot_through_f32_init_skylake_(nk_dot_through_f32_state_skylake_t_ *state) {
    state->sum_f32x16 = _mm512_setzero_ps();
}
|
|
145
|
+
|
|
146
|
+
/**
 *  @brief Fuses 32-bit multiplication and accumulation for low-precision dot-products.
 *
 *  Single VFMADD per call; lane-wise partial sums stay in the state until finalize.
 *  The depth offset and active-dimension count are accepted for interface parity with
 *  stateful kernels that need them, but are unused here — inactive tail lanes are
 *  expected to be zeroed by the caller's masked loads.
 *
 *  @sa nk_dot_f16x16_update_skylake, nk_dot_bf16x16_update_skylake
 *  @sa nk_dot_e4m3x16_update_skylake, nk_dot_e5m2x16_update_skylake
 */
NK_INTERNAL void nk_dot_through_f32_update_skylake_(nk_dot_through_f32_state_skylake_t_ *state, nk_b512_vec_t a,
                                                    nk_b512_vec_t b, nk_size_t depth_offset,
                                                    nk_size_t active_dimensions) {
    nk_unused_(depth_offset);
    nk_unused_(active_dimensions);
    state->sum_f32x16 = _mm512_fmadd_ps(a.zmm_ps, b.zmm_ps, state->sum_f32x16);
}
|
|
158
|
+
|
|
159
|
+
/**
 *  @brief Finalizes 4x low-precision dot-products placing them into 4x consecutive 32-bit slots.
 *  @sa nk_dot_f16x16_update_skylake, nk_dot_bf16x16_update_skylake
 *  @sa nk_dot_e4m3x16_update_skylake, nk_dot_e5m2x16_update_skylake
 *
 *  The goal of this kernel is simple - compute 4x horizontal reductions, each involving 16x floats.
 *  The lack of a vectorized horizontal instruction implies many consecutive shuffles producing a tree-like
 *  reduction. This kernel allows combining some of those operations between different dot products:
 *  instead of 4 independent 16→1 reductions, it narrows each state to 4 lanes, then a single 4x4
 *  transpose plus three vector adds yields all four scalars at once.
 */
NK_INTERNAL void nk_dot_through_f32_finalize_skylake_(                                                        //
    nk_dot_through_f32_state_skylake_t_ const *state_a, nk_dot_through_f32_state_skylake_t_ const *state_b,   //
    nk_dot_through_f32_state_skylake_t_ const *state_c, nk_dot_through_f32_state_skylake_t_ const *state_d,   //
    nk_size_t total_dimensions, nk_b128_vec_t *result) {
    nk_unused_(total_dimensions); // Accepted for interface parity; reduction doesn't need the count.

    __m512 const sum_a_f32x16 = state_a->sum_f32x16, sum_b_f32x16 = state_b->sum_f32x16,
                 sum_c_f32x16 = state_c->sum_f32x16, sum_d_f32x16 = state_d->sum_f32x16;

    // ILP-optimized 4-way horizontal reduction for f32x16 in AVX-512.
    // The four states are reduced in lock-step so independent shuffles/adds can overlap.
    // Step 1: 16 → 8 for all 4 states (extract high 256-bit half and add to low half)
    __m256 sum_a_f32x8 = _mm256_add_ps(_mm512_castps512_ps256(sum_a_f32x16), _mm512_extractf32x8_ps(sum_a_f32x16, 1));
    __m256 sum_b_f32x8 = _mm256_add_ps(_mm512_castps512_ps256(sum_b_f32x16), _mm512_extractf32x8_ps(sum_b_f32x16, 1));
    __m256 sum_c_f32x8 = _mm256_add_ps(_mm512_castps512_ps256(sum_c_f32x16), _mm512_extractf32x8_ps(sum_c_f32x16, 1));
    __m256 sum_d_f32x8 = _mm256_add_ps(_mm512_castps512_ps256(sum_d_f32x16), _mm512_extractf32x8_ps(sum_d_f32x16, 1));
    // Step 2: 8 → 4 for all 4 states (extract high 128-bit half and add to low half)
    __m128 sum_a_f32x4 = _mm_add_ps(_mm256_castps256_ps128(sum_a_f32x8), _mm256_extractf128_ps(sum_a_f32x8, 1));
    __m128 sum_b_f32x4 = _mm_add_ps(_mm256_castps256_ps128(sum_b_f32x8), _mm256_extractf128_ps(sum_b_f32x8, 1));
    __m128 sum_c_f32x4 = _mm_add_ps(_mm256_castps256_ps128(sum_c_f32x8), _mm256_extractf128_ps(sum_c_f32x8, 1));
    __m128 sum_d_f32x4 = _mm_add_ps(_mm256_castps256_ps128(sum_d_f32x8), _mm256_extractf128_ps(sum_d_f32x8, 1));
    // Step 3: Transpose 4x4 and reduce to get final 4 scalars.
    // After the transpose, lane i of every `sum_laneK` vector belongs to state i,
    // so three vertical adds finish all four reductions simultaneously.
    __m128 transpose_ab_low_f32x4 = _mm_unpacklo_ps(sum_a_f32x4, sum_b_f32x4);
    __m128 transpose_cd_low_f32x4 = _mm_unpacklo_ps(sum_c_f32x4, sum_d_f32x4);
    __m128 transpose_ab_high_f32x4 = _mm_unpackhi_ps(sum_a_f32x4, sum_b_f32x4);
    __m128 transpose_cd_high_f32x4 = _mm_unpackhi_ps(sum_c_f32x4, sum_d_f32x4);
    __m128 sum_lane0_f32x4 = _mm_movelh_ps(transpose_ab_low_f32x4, transpose_cd_low_f32x4);
    __m128 sum_lane1_f32x4 = _mm_movehl_ps(transpose_cd_low_f32x4, transpose_ab_low_f32x4);
    __m128 sum_lane2_f32x4 = _mm_movelh_ps(transpose_ab_high_f32x4, transpose_cd_high_f32x4);
    __m128 sum_lane3_f32x4 = _mm_movehl_ps(transpose_cd_high_f32x4, transpose_ab_high_f32x4);
    __m128 final_sum_f32x4 = _mm_add_ps(_mm_add_ps(sum_lane0_f32x4, sum_lane1_f32x4),
                                        _mm_add_ps(sum_lane2_f32x4, sum_lane3_f32x4));
    result->xmm = _mm_castps_si128(final_sum_f32x4);
}
|
|
201
|
+
|
|
202
|
+
NK_PUBLIC void nk_dot_f32_skylake(nk_f32_t const *a_scalars, nk_f32_t const *b_scalars, nk_size_t count_scalars,
|
|
203
|
+
nk_f64_t *result) {
|
|
204
|
+
__m256 a_f32x8, b_f32x8;
|
|
205
|
+
__m512d sum_f64x8 = _mm512_setzero_pd();
|
|
206
|
+
|
|
207
|
+
nk_dot_f32_skylake_cycle:
|
|
208
|
+
if (count_scalars < 8) {
|
|
209
|
+
__mmask8 mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, count_scalars);
|
|
210
|
+
a_f32x8 = _mm256_maskz_loadu_ps(mask, a_scalars);
|
|
211
|
+
b_f32x8 = _mm256_maskz_loadu_ps(mask, b_scalars);
|
|
212
|
+
count_scalars = 0;
|
|
213
|
+
}
|
|
214
|
+
else {
|
|
215
|
+
a_f32x8 = _mm256_loadu_ps(a_scalars);
|
|
216
|
+
b_f32x8 = _mm256_loadu_ps(b_scalars);
|
|
217
|
+
a_scalars += 8, b_scalars += 8, count_scalars -= 8;
|
|
218
|
+
}
|
|
219
|
+
sum_f64x8 = _mm512_fmadd_pd(_mm512_cvtps_pd(a_f32x8), _mm512_cvtps_pd(b_f32x8), sum_f64x8);
|
|
220
|
+
if (count_scalars) goto nk_dot_f32_skylake_cycle;
|
|
221
|
+
|
|
222
|
+
*result = _mm512_reduce_add_pd(sum_f64x8);
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
/**
 *  @brief Dot product of two f64 vectors with compensated (Dot2) accumulation.
 *
 *  Implements the Dot2 algorithm (Ogita-Rump-Oishi 2005): each product is split
 *  into a rounded value and its exact FMA-recovered error (TwoProd), each
 *  accumulation recovers its own rounding error (TwoSum), and all errors are
 *  gathered in a separate compensation vector. The exact op order in the TwoSum
 *  sequence below is what makes the error terms exact — do not reassociate.
 *
 *  @param a_scalars      First vector, `count_scalars` f64 values.
 *  @param b_scalars      Second vector, `count_scalars` f64 values.
 *  @param count_scalars  Number of scalars in each vector.
 *  @param result         Output: the compensated dot product.
 */
NK_PUBLIC void nk_dot_f64_skylake(nk_f64_t const *a_scalars, nk_f64_t const *b_scalars, nk_size_t count_scalars,
                                  nk_f64_t *result) {
    // Dot2 algorithm (Ogita-Rump-Oishi 2005) for compensated dot product
    __m512d a_f64x8, b_f64x8;
    __m512d sum_f64x8 = _mm512_setzero_pd();
    __m512d compensation_f64x8 = _mm512_setzero_pd();

nk_dot_f64_skylake_cycle:
    if (count_scalars < 8) {
        // Tail: zero-masked loads so inactive lanes contribute nothing.
        __mmask8 mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, count_scalars);
        a_f64x8 = _mm512_maskz_loadu_pd(mask, a_scalars);
        b_f64x8 = _mm512_maskz_loadu_pd(mask, b_scalars);
        count_scalars = 0;
    }
    else {
        a_f64x8 = _mm512_loadu_pd(a_scalars);
        b_f64x8 = _mm512_loadu_pd(b_scalars);
        a_scalars += 8, b_scalars += 8, count_scalars -= 8;
    }
    // TwoProd: h = a * b, r = fma(a, b, -h) captures the rounding error
    __m512d product_f64x8 = _mm512_mul_pd(a_f64x8, b_f64x8);
    __m512d product_error_f64x8 = _mm512_fmsub_pd(a_f64x8, b_f64x8, product_f64x8);
    // TwoSum: (t, q) = TwoSum(sum, h) where t = sum + h rounded, q = error
    __m512d tentative_sum_f64x8 = _mm512_add_pd(sum_f64x8, product_f64x8);
    __m512d virtual_addend_f64x8 = _mm512_sub_pd(tentative_sum_f64x8, sum_f64x8);
    __m512d sum_error_f64x8 = _mm512_add_pd(
        _mm512_sub_pd(sum_f64x8, _mm512_sub_pd(tentative_sum_f64x8, virtual_addend_f64x8)),
        _mm512_sub_pd(product_f64x8, virtual_addend_f64x8));
    // Update: sum = t, compensation += q + r
    sum_f64x8 = tentative_sum_f64x8;
    compensation_f64x8 = _mm512_add_pd(compensation_f64x8, _mm512_add_pd(sum_error_f64x8, product_error_f64x8));
    if (count_scalars) goto nk_dot_f64_skylake_cycle;

    // Compensated horizontal reduction preserving Dot2 error tracking
    *result = nk_dot_stable_sum_f64x8_skylake_(sum_f64x8, compensation_f64x8);
}
|
|
261
|
+
|
|
262
|
+
NK_PUBLIC void nk_dot_f32c_skylake(nk_f32c_t const *a_pairs, nk_f32c_t const *b_pairs, nk_size_t count_pairs,
|
|
263
|
+
nk_f64c_t *result) {
|
|
264
|
+
__m256 a_f32x8, b_f32x8;
|
|
265
|
+
__m512d sum_real_f64x8 = _mm512_setzero_pd();
|
|
266
|
+
__m512d sum_imag_f64x8 = _mm512_setzero_pd();
|
|
267
|
+
|
|
268
|
+
// We take into account, that FMS is the same as FMA with a negative multiplier.
|
|
269
|
+
// To multiply a floating-point value by -1, we can use the `XOR` instruction to flip the sign bit.
|
|
270
|
+
// This way we can avoid the shuffling and the need for separate real and imaginary parts.
|
|
271
|
+
// For the imaginary part of the product, we would need to swap the real and imaginary parts of
|
|
272
|
+
// one of the vectors.
|
|
273
|
+
__m512i const sign_flip_f64x8 = _mm512_set_epi64(0x8000000000000000, 0, 0x8000000000000000, 0, 0x8000000000000000,
|
|
274
|
+
0, 0x8000000000000000, 0);
|
|
275
|
+
nk_dot_f32c_skylake_cycle:
|
|
276
|
+
if (count_pairs < 4) {
|
|
277
|
+
__mmask8 mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, count_pairs * 2);
|
|
278
|
+
a_f32x8 = _mm256_maskz_loadu_ps(mask, (nk_f32_t const *)a_pairs);
|
|
279
|
+
b_f32x8 = _mm256_maskz_loadu_ps(mask, (nk_f32_t const *)b_pairs);
|
|
280
|
+
count_pairs = 0;
|
|
281
|
+
}
|
|
282
|
+
else {
|
|
283
|
+
a_f32x8 = _mm256_loadu_ps((nk_f32_t const *)a_pairs);
|
|
284
|
+
b_f32x8 = _mm256_loadu_ps((nk_f32_t const *)b_pairs);
|
|
285
|
+
a_pairs += 4, b_pairs += 4, count_pairs -= 4;
|
|
286
|
+
}
|
|
287
|
+
__m512d a_f64x8 = _mm512_cvtps_pd(a_f32x8);
|
|
288
|
+
__m512d b_f64x8 = _mm512_cvtps_pd(b_f32x8);
|
|
289
|
+
__m512d b_swapped_f64x8 = _mm512_permute_pd(b_f64x8, 0x55);
|
|
290
|
+
sum_real_f64x8 = _mm512_fmadd_pd(a_f64x8, b_f64x8, sum_real_f64x8);
|
|
291
|
+
sum_imag_f64x8 = _mm512_fmadd_pd(a_f64x8, b_swapped_f64x8, sum_imag_f64x8);
|
|
292
|
+
if (count_pairs) goto nk_dot_f32c_skylake_cycle;
|
|
293
|
+
|
|
294
|
+
// Flip the sign bit in every second f64 before accumulation:
|
|
295
|
+
sum_real_f64x8 = _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(sum_real_f64x8), sign_flip_f64x8));
|
|
296
|
+
|
|
297
|
+
// Reduce horizontal sums:
|
|
298
|
+
result->real = _mm512_reduce_add_pd(sum_real_f64x8);
|
|
299
|
+
result->imag = _mm512_reduce_add_pd(sum_imag_f64x8);
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
NK_PUBLIC void nk_vdot_f32c_skylake(nk_f32c_t const *a_pairs, nk_f32c_t const *b_pairs, nk_size_t count_pairs,
|
|
303
|
+
nk_f64c_t *result) {
|
|
304
|
+
__m256 a_f32x8, b_f32x8;
|
|
305
|
+
__m512d sum_real_f64x8 = _mm512_setzero_pd();
|
|
306
|
+
__m512d sum_imag_f64x8 = _mm512_setzero_pd();
|
|
307
|
+
|
|
308
|
+
// We take into account, that FMS is the same as FMA with a negative multiplier.
|
|
309
|
+
// To multiply a floating-point value by -1, we can use the `XOR` instruction to flip the sign bit.
|
|
310
|
+
// This way we can avoid the shuffling and the need for separate real and imaginary parts.
|
|
311
|
+
// For the imaginary part of the product, we would need to swap the real and imaginary parts of
|
|
312
|
+
// one of the vectors.
|
|
313
|
+
__m512i const sign_flip_f64x8 = _mm512_set_epi64(0x8000000000000000, 0, 0x8000000000000000, 0, 0x8000000000000000,
|
|
314
|
+
0, 0x8000000000000000, 0);
|
|
315
|
+
nk_vdot_f32c_skylake_cycle:
|
|
316
|
+
if (count_pairs < 4) {
|
|
317
|
+
__mmask8 mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, count_pairs * 2);
|
|
318
|
+
a_f32x8 = _mm256_maskz_loadu_ps(mask, (nk_f32_t const *)a_pairs);
|
|
319
|
+
b_f32x8 = _mm256_maskz_loadu_ps(mask, (nk_f32_t const *)b_pairs);
|
|
320
|
+
count_pairs = 0;
|
|
321
|
+
}
|
|
322
|
+
else {
|
|
323
|
+
a_f32x8 = _mm256_loadu_ps((nk_f32_t const *)a_pairs);
|
|
324
|
+
b_f32x8 = _mm256_loadu_ps((nk_f32_t const *)b_pairs);
|
|
325
|
+
a_pairs += 4, b_pairs += 4, count_pairs -= 4;
|
|
326
|
+
}
|
|
327
|
+
__m512d a_f64x8 = _mm512_cvtps_pd(a_f32x8);
|
|
328
|
+
__m512d b_f64x8 = _mm512_cvtps_pd(b_f32x8);
|
|
329
|
+
sum_real_f64x8 = _mm512_fmadd_pd(a_f64x8, b_f64x8, sum_real_f64x8);
|
|
330
|
+
__m512d b_swapped_f64x8 = _mm512_permute_pd(b_f64x8, 0x55);
|
|
331
|
+
sum_imag_f64x8 = _mm512_fmadd_pd(a_f64x8, b_swapped_f64x8, sum_imag_f64x8);
|
|
332
|
+
if (count_pairs) goto nk_vdot_f32c_skylake_cycle;
|
|
333
|
+
|
|
334
|
+
// Flip the sign bit in every second f64 before accumulation:
|
|
335
|
+
sum_imag_f64x8 = _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(sum_imag_f64x8), sign_flip_f64x8));
|
|
336
|
+
|
|
337
|
+
// Reduce horizontal sums:
|
|
338
|
+
result->real = _mm512_reduce_add_pd(sum_real_f64x8);
|
|
339
|
+
result->imag = _mm512_reduce_add_pd(sum_imag_f64x8);
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
NK_PUBLIC void nk_dot_f64c_skylake(nk_f64c_t const *a_pairs, nk_f64c_t const *b_pairs, nk_size_t count_pairs,
                                   nk_f64c_t *result) {
    // Complex `f64` dot product with compensated (error-free transformation) accumulation.
    // Dot2 algorithm (Ogita-Rump-Oishi 2005) for compensated complex dot product
    __m512d a_f64x8, b_f64x8;
    __m512d sum_real_f64x8 = _mm512_setzero_pd();         // interleaved (a_r*b_r, a_i*b_i) partial sums
    __m512d sum_imag_f64x8 = _mm512_setzero_pd();         // interleaved (a_r*b_i, a_i*b_r) partial sums
    __m512d compensation_real_f64x8 = _mm512_setzero_pd(); // accumulated rounding errors for the real sums
    __m512d compensation_imag_f64x8 = _mm512_setzero_pd(); // accumulated rounding errors for the imag sums

    // We take into account, that FMS is the same as FMA with a negative multiplier.
    // To multiply a floating-point value by -1, we can use the `XOR` instruction to flip the sign bit.
    // This way we can avoid the shuffling and the need for separate real and imaginary parts.
    // For the imaginary part of the product, we would need to swap the real and imaginary parts of
    // one of the vectors.
    __m512i const sign_flip_f64x8 = _mm512_set_epi64( //
        0x8000000000000000, 0x0000000000000000, 0x8000000000000000, 0x0000000000000000, //
        0x8000000000000000, 0x0000000000000000, 0x8000000000000000, 0x0000000000000000 //
    );
nk_dot_f64c_skylake_cycle:
    if (count_pairs < 4) {
        // Tail: each pair is two `f64` lanes, so the mask covers `count_pairs * 2` lanes.
        __mmask8 mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, count_pairs * 2);
        a_f64x8 = _mm512_maskz_loadu_pd(mask, a_pairs);
        b_f64x8 = _mm512_maskz_loadu_pd(mask, b_pairs);
        count_pairs = 0;
    }
    else {
        a_f64x8 = _mm512_loadu_pd(a_pairs);
        b_f64x8 = _mm512_loadu_pd(b_pairs);
        a_pairs += 4, b_pairs += 4, count_pairs -= 4;
    }
    __m512d b_swapped_f64x8 = _mm512_permute_pd(b_f64x8, 0x55); //? Same as 0b01010101.

    // TwoProd for real part: a * b
    __m512d product_real_f64x8 = _mm512_mul_pd(a_f64x8, b_f64x8);
    __m512d product_real_error_f64x8 = _mm512_fmsub_pd(a_f64x8, b_f64x8, product_real_f64x8);
    // TwoSum for real part
    __m512d tentative_sum_real_f64x8 = _mm512_add_pd(sum_real_f64x8, product_real_f64x8);
    __m512d virtual_addend_real_f64x8 = _mm512_sub_pd(tentative_sum_real_f64x8, sum_real_f64x8);
    __m512d sum_real_error_f64x8 = _mm512_add_pd(
        _mm512_sub_pd(sum_real_f64x8, _mm512_sub_pd(tentative_sum_real_f64x8, virtual_addend_real_f64x8)),
        _mm512_sub_pd(product_real_f64x8, virtual_addend_real_f64x8));
    sum_real_f64x8 = tentative_sum_real_f64x8;
    compensation_real_f64x8 = _mm512_add_pd(compensation_real_f64x8,
                                            _mm512_add_pd(sum_real_error_f64x8, product_real_error_f64x8));

    // TwoProd for imag part: a * b_swapped
    __m512d product_imag_f64x8 = _mm512_mul_pd(a_f64x8, b_swapped_f64x8);
    __m512d product_imag_error_f64x8 = _mm512_fmsub_pd(a_f64x8, b_swapped_f64x8, product_imag_f64x8);
    // TwoSum for imag part
    __m512d tentative_sum_imag_f64x8 = _mm512_add_pd(sum_imag_f64x8, product_imag_f64x8);
    __m512d virtual_addend_imag_f64x8 = _mm512_sub_pd(tentative_sum_imag_f64x8, sum_imag_f64x8);
    __m512d sum_imag_error_f64x8 = _mm512_add_pd(
        _mm512_sub_pd(sum_imag_f64x8, _mm512_sub_pd(tentative_sum_imag_f64x8, virtual_addend_imag_f64x8)),
        _mm512_sub_pd(product_imag_f64x8, virtual_addend_imag_f64x8));
    sum_imag_f64x8 = tentative_sum_imag_f64x8;
    compensation_imag_f64x8 = _mm512_add_pd(compensation_imag_f64x8,
                                            _mm512_add_pd(sum_imag_error_f64x8, product_imag_error_f64x8));

    if (count_pairs) goto nk_dot_f64c_skylake_cycle;

    // Flip the sign bit in every second scalar before accumulation (to get a_r*b_r - a_i*b_i):
    // The compensation must be flipped with the same mask so the errors stay consistent with the sums.
    sum_real_f64x8 = _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(sum_real_f64x8), sign_flip_f64x8));
    compensation_real_f64x8 = _mm512_castsi512_pd(
        _mm512_xor_si512(_mm512_castpd_si512(compensation_real_f64x8), sign_flip_f64x8));

    // Compensated horizontal reduction preserving Dot2 error tracking
    result->real = nk_dot_stable_sum_f64x8_skylake_(sum_real_f64x8, compensation_real_f64x8);
    result->imag = nk_dot_stable_sum_f64x8_skylake_(sum_imag_f64x8, compensation_imag_f64x8);
}
|
|
411
|
+
|
|
412
|
+
NK_PUBLIC void nk_vdot_f64c_skylake(nk_f64c_t const *a_pairs, nk_f64c_t const *b_pairs, nk_size_t count_pairs,
                                    nk_f64c_t *result) {
    // Conjugate (Hermitian) `f64` complex dot product with compensated accumulation.
    // Dot2 algorithm (Ogita-Rump-Oishi 2005) for compensated conjugate dot product
    __m512d a_f64x8, b_f64x8;
    __m512d sum_real_f64x8 = _mm512_setzero_pd();         // interleaved (a_r*b_r, a_i*b_i) partial sums
    __m512d sum_imag_f64x8 = _mm512_setzero_pd();         // interleaved (a_r*b_i, a_i*b_r) partial sums
    __m512d compensation_real_f64x8 = _mm512_setzero_pd(); // accumulated rounding errors for the real sums
    __m512d compensation_imag_f64x8 = _mm512_setzero_pd(); // accumulated rounding errors for the imag sums

    // We take into account, that FMS is the same as FMA with a negative multiplier.
    // To multiply a floating-point value by -1, we can use the `XOR` instruction to flip the sign bit.
    // This way we can avoid the shuffling and the need for separate real and imaginary parts.
    // For the imaginary part of the product, we would need to swap the real and imaginary parts of
    // one of the vectors.
    __m512i const sign_flip_f64x8 = _mm512_set_epi64( //
        0x8000000000000000, 0x0000000000000000, 0x8000000000000000, 0x0000000000000000, //
        0x8000000000000000, 0x0000000000000000, 0x8000000000000000, 0x0000000000000000 //
    );
nk_vdot_f64c_skylake_cycle:
    if (count_pairs < 4) {
        // Tail: each pair is two `f64` lanes, so the mask covers `count_pairs * 2` lanes.
        __mmask8 mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, count_pairs * 2);
        a_f64x8 = _mm512_maskz_loadu_pd(mask, (nk_f64_t const *)a_pairs);
        b_f64x8 = _mm512_maskz_loadu_pd(mask, (nk_f64_t const *)b_pairs);
        count_pairs = 0;
    }
    else {
        a_f64x8 = _mm512_loadu_pd((nk_f64_t const *)a_pairs);
        b_f64x8 = _mm512_loadu_pd((nk_f64_t const *)b_pairs);
        a_pairs += 4, b_pairs += 4, count_pairs -= 4;
    }
    __m512d b_swapped_f64x8 = _mm512_permute_pd(b_f64x8, 0x55); //? Same as 0b01010101.

    // TwoProd for real part: a * b
    __m512d product_real_f64x8 = _mm512_mul_pd(a_f64x8, b_f64x8);
    __m512d product_real_error_f64x8 = _mm512_fmsub_pd(a_f64x8, b_f64x8, product_real_f64x8);
    // TwoSum for real part
    __m512d tentative_sum_real_f64x8 = _mm512_add_pd(sum_real_f64x8, product_real_f64x8);
    __m512d virtual_addend_real_f64x8 = _mm512_sub_pd(tentative_sum_real_f64x8, sum_real_f64x8);
    __m512d sum_real_error_f64x8 = _mm512_add_pd(
        _mm512_sub_pd(sum_real_f64x8, _mm512_sub_pd(tentative_sum_real_f64x8, virtual_addend_real_f64x8)),
        _mm512_sub_pd(product_real_f64x8, virtual_addend_real_f64x8));
    sum_real_f64x8 = tentative_sum_real_f64x8;
    compensation_real_f64x8 = _mm512_add_pd(compensation_real_f64x8,
                                            _mm512_add_pd(sum_real_error_f64x8, product_real_error_f64x8));

    // TwoProd for imag part: a * b_swapped
    __m512d product_imag_f64x8 = _mm512_mul_pd(a_f64x8, b_swapped_f64x8);
    __m512d product_imag_error_f64x8 = _mm512_fmsub_pd(a_f64x8, b_swapped_f64x8, product_imag_f64x8);
    // TwoSum for imag part
    __m512d tentative_sum_imag_f64x8 = _mm512_add_pd(sum_imag_f64x8, product_imag_f64x8);
    __m512d virtual_addend_imag_f64x8 = _mm512_sub_pd(tentative_sum_imag_f64x8, sum_imag_f64x8);
    __m512d sum_imag_error_f64x8 = _mm512_add_pd(
        _mm512_sub_pd(sum_imag_f64x8, _mm512_sub_pd(tentative_sum_imag_f64x8, virtual_addend_imag_f64x8)),
        _mm512_sub_pd(product_imag_f64x8, virtual_addend_imag_f64x8));
    sum_imag_f64x8 = tentative_sum_imag_f64x8;
    compensation_imag_f64x8 = _mm512_add_pd(compensation_imag_f64x8,
                                            _mm512_add_pd(sum_imag_error_f64x8, product_imag_error_f64x8));

    if (count_pairs) goto nk_vdot_f64c_skylake_cycle;

    // Flip the sign bit in every second scalar before accumulation (to get a_r*b_i - a_i*b_r):
    // The compensation must be flipped with the same mask so the errors stay consistent with the sums.
    sum_imag_f64x8 = _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(sum_imag_f64x8), sign_flip_f64x8));
    compensation_imag_f64x8 = _mm512_castsi512_pd(
        _mm512_xor_si512(_mm512_castpd_si512(compensation_imag_f64x8), sign_flip_f64x8));

    // Compensated horizontal reduction preserving Dot2 error tracking
    result->real = nk_dot_stable_sum_f64x8_skylake_(sum_real_f64x8, compensation_real_f64x8);
    result->imag = nk_dot_stable_sum_f64x8_skylake_(sum_imag_f64x8, compensation_imag_f64x8);
}
|
|
481
|
+
|
|
482
|
+
#pragma region - Smaller Floats
|
|
483
|
+
|
|
484
|
+
NK_PUBLIC void nk_dot_f16_skylake(nk_f16_t const *a_scalars, nk_f16_t const *b_scalars, nk_size_t count_scalars,
|
|
485
|
+
nk_f32_t *result) {
|
|
486
|
+
__m256i a_f16x16, b_f16x16;
|
|
487
|
+
__m512 sum_f32x16 = _mm512_setzero_ps();
|
|
488
|
+
|
|
489
|
+
nk_dot_f16_skylake_cycle:
|
|
490
|
+
if (count_scalars < 16) {
|
|
491
|
+
__mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, count_scalars);
|
|
492
|
+
a_f16x16 = _mm256_maskz_loadu_epi16(mask, a_scalars);
|
|
493
|
+
b_f16x16 = _mm256_maskz_loadu_epi16(mask, b_scalars);
|
|
494
|
+
count_scalars = 0;
|
|
495
|
+
}
|
|
496
|
+
else {
|
|
497
|
+
a_f16x16 = _mm256_loadu_si256((__m256i const *)a_scalars);
|
|
498
|
+
b_f16x16 = _mm256_loadu_si256((__m256i const *)b_scalars);
|
|
499
|
+
a_scalars += 16, b_scalars += 16, count_scalars -= 16;
|
|
500
|
+
}
|
|
501
|
+
__m512 a_f32x16 = _mm512_cvtph_ps(a_f16x16);
|
|
502
|
+
__m512 b_f32x16 = _mm512_cvtph_ps(b_f16x16);
|
|
503
|
+
sum_f32x16 = _mm512_fmadd_ps(a_f32x16, b_f32x16, sum_f32x16);
|
|
504
|
+
if (count_scalars) goto nk_dot_f16_skylake_cycle;
|
|
505
|
+
|
|
506
|
+
*result = nk_reduce_add_f32x16_skylake_(sum_f32x16);
|
|
507
|
+
}
|
|
508
|
+
|
|
509
|
+
NK_PUBLIC void nk_dot_bf16_skylake(nk_bf16_t const *a_scalars, nk_bf16_t const *b_scalars, nk_size_t count_scalars,
|
|
510
|
+
nk_f32_t *result) {
|
|
511
|
+
__m256i a_bf16x16, b_bf16x16;
|
|
512
|
+
__m512 sum_f32x16 = _mm512_setzero_ps();
|
|
513
|
+
|
|
514
|
+
nk_dot_bf16_skylake_cycle:
|
|
515
|
+
if (count_scalars < 16) {
|
|
516
|
+
__mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, count_scalars);
|
|
517
|
+
a_bf16x16 = _mm256_maskz_loadu_epi16(mask, a_scalars);
|
|
518
|
+
b_bf16x16 = _mm256_maskz_loadu_epi16(mask, b_scalars);
|
|
519
|
+
count_scalars = 0;
|
|
520
|
+
}
|
|
521
|
+
else {
|
|
522
|
+
a_bf16x16 = _mm256_loadu_si256((__m256i const *)a_scalars);
|
|
523
|
+
b_bf16x16 = _mm256_loadu_si256((__m256i const *)b_scalars);
|
|
524
|
+
a_scalars += 16, b_scalars += 16, count_scalars -= 16;
|
|
525
|
+
}
|
|
526
|
+
__m512 a_f32x16 = nk_bf16x16_to_f32x16_skylake_(a_bf16x16);
|
|
527
|
+
__m512 b_f32x16 = nk_bf16x16_to_f32x16_skylake_(b_bf16x16);
|
|
528
|
+
sum_f32x16 = _mm512_fmadd_ps(a_f32x16, b_f32x16, sum_f32x16);
|
|
529
|
+
if (count_scalars) goto nk_dot_bf16_skylake_cycle;
|
|
530
|
+
|
|
531
|
+
*result = nk_reduce_add_f32x16_skylake_(sum_f32x16);
|
|
532
|
+
}
|
|
533
|
+
|
|
534
|
+
NK_PUBLIC void nk_dot_e4m3_skylake(nk_e4m3_t const *a_scalars, nk_e4m3_t const *b_scalars, nk_size_t count_scalars,
|
|
535
|
+
nk_f32_t *result) {
|
|
536
|
+
__m128i a_e4m3x16, b_e4m3x16;
|
|
537
|
+
__m512 sum_f32x16 = _mm512_setzero_ps();
|
|
538
|
+
|
|
539
|
+
nk_dot_e4m3_skylake_cycle:
|
|
540
|
+
if (count_scalars < 16) {
|
|
541
|
+
__mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, count_scalars);
|
|
542
|
+
a_e4m3x16 = _mm_maskz_loadu_epi8(mask, a_scalars);
|
|
543
|
+
b_e4m3x16 = _mm_maskz_loadu_epi8(mask, b_scalars);
|
|
544
|
+
count_scalars = 0;
|
|
545
|
+
}
|
|
546
|
+
else {
|
|
547
|
+
a_e4m3x16 = _mm_loadu_si128((__m128i const *)a_scalars);
|
|
548
|
+
b_e4m3x16 = _mm_loadu_si128((__m128i const *)b_scalars);
|
|
549
|
+
a_scalars += 16, b_scalars += 16, count_scalars -= 16;
|
|
550
|
+
}
|
|
551
|
+
__m512 a_f32x16 = nk_e4m3x16_to_f32x16_skylake_(a_e4m3x16);
|
|
552
|
+
__m512 b_f32x16 = nk_e4m3x16_to_f32x16_skylake_(b_e4m3x16);
|
|
553
|
+
sum_f32x16 = _mm512_fmadd_ps(a_f32x16, b_f32x16, sum_f32x16);
|
|
554
|
+
if (count_scalars) goto nk_dot_e4m3_skylake_cycle;
|
|
555
|
+
|
|
556
|
+
*result = nk_reduce_add_f32x16_skylake_(sum_f32x16);
|
|
557
|
+
}
|
|
558
|
+
|
|
559
|
+
NK_PUBLIC void nk_dot_e5m2_skylake(nk_e5m2_t const *a_scalars, nk_e5m2_t const *b_scalars, nk_size_t count_scalars,
|
|
560
|
+
nk_f32_t *result) {
|
|
561
|
+
__m128i a_e5m2x16, b_e5m2x16;
|
|
562
|
+
__m512 sum_f32x16 = _mm512_setzero_ps();
|
|
563
|
+
|
|
564
|
+
nk_dot_e5m2_skylake_cycle:
|
|
565
|
+
if (count_scalars < 16) {
|
|
566
|
+
__mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, count_scalars);
|
|
567
|
+
a_e5m2x16 = _mm_maskz_loadu_epi8(mask, a_scalars);
|
|
568
|
+
b_e5m2x16 = _mm_maskz_loadu_epi8(mask, b_scalars);
|
|
569
|
+
count_scalars = 0;
|
|
570
|
+
}
|
|
571
|
+
else {
|
|
572
|
+
a_e5m2x16 = _mm_loadu_si128((__m128i const *)a_scalars);
|
|
573
|
+
b_e5m2x16 = _mm_loadu_si128((__m128i const *)b_scalars);
|
|
574
|
+
a_scalars += 16, b_scalars += 16, count_scalars -= 16;
|
|
575
|
+
}
|
|
576
|
+
__m512 a_f32x16 = nk_e5m2x16_to_f32x16_skylake_(a_e5m2x16);
|
|
577
|
+
__m512 b_f32x16 = nk_e5m2x16_to_f32x16_skylake_(b_e5m2x16);
|
|
578
|
+
sum_f32x16 = _mm512_fmadd_ps(a_f32x16, b_f32x16, sum_f32x16);
|
|
579
|
+
if (count_scalars) goto nk_dot_e5m2_skylake_cycle;
|
|
580
|
+
|
|
581
|
+
*result = nk_reduce_add_f32x16_skylake_(sum_f32x16);
|
|
582
|
+
}
|
|
583
|
+
|
|
584
|
+
NK_PUBLIC void nk_dot_e2m3_skylake(nk_e2m3_t const *a_scalars, nk_e2m3_t const *b_scalars, nk_size_t count_scalars,
                                   nk_f32_t *result) {
    // Integer dot product for e2m3 using dual-VPSHUFB (LUT) + VPMADDUBSW (unsigned×signed).
    // 64 elements per iteration using AVX-512BW. Result = i32_dot / 256.0f (exact).
    //
    // LUTs replicated 4× for 512-bit VPSHUFB (operates per 128-bit lane):
    // `lut_lower` covers magnitudes 0..15, `lut_upper` covers 16..31; bit 4 of the
    // magnitude picks between the two shuffles below.
    __m512i const lut_lower_u8x64 = _mm512_set_epi8( //
        30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, //
        30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, //
        30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, //
        30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    __m512i const lut_upper_u8x64 = _mm512_set_epi8( //
        120, 112, 104, 96, 88, 80, 72, 64, 60, 56, 52, 48, 44, 40, 36, 32, //
        120, 112, 104, 96, 88, 80, 72, 64, 60, 56, 52, 48, 44, 40, 36, 32, //
        120, 112, 104, 96, 88, 80, 72, 64, 60, 56, 52, 48, 44, 40, 36, 32, //
        120, 112, 104, 96, 88, 80, 72, 64, 60, 56, 52, 48, 44, 40, 36, 32);
    __m512i const nibble_mask_u8x64 = _mm512_set1_epi8(0x0F);    // low 4 bits: VPSHUFB index
    __m512i const magnitude_mask_u8x64 = _mm512_set1_epi8(0x1F); // bits 0-4: magnitude
    __m512i const half_select_u8x64 = _mm512_set1_epi8(0x10);    // bit 4: lower/upper LUT half
    __m512i const sign_mask_u8x64 = _mm512_set1_epi8(0x20);      // bit 5: sign
    __m512i const ones_i16x32 = _mm512_set1_epi16(1);
    __m512i sum_i32x16 = _mm512_setzero_si512();
    __m512i a_e2m3_u8x64, b_e2m3_u8x64;

nk_dot_e2m3_skylake_cycle:
    if (count_scalars < 64) {
        // Masked tail load zero-fills the inactive bytes; zero maps to a zero product.
        __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, count_scalars);
        a_e2m3_u8x64 = _mm512_maskz_loadu_epi8(mask, a_scalars);
        b_e2m3_u8x64 = _mm512_maskz_loadu_epi8(mask, b_scalars);
        count_scalars = 0;
    }
    else {
        a_e2m3_u8x64 = _mm512_loadu_si512((__m512i const *)a_scalars);
        b_e2m3_u8x64 = _mm512_loadu_si512((__m512i const *)b_scalars);
        a_scalars += 64, b_scalars += 64, count_scalars -= 64;
    }

    // Extract 5-bit magnitude, then split into low 4 bits (VPSHUFB index) and bit 4 (hi/lo select)
    __m512i a_magnitude_u8x64 = _mm512_and_si512(a_e2m3_u8x64, magnitude_mask_u8x64);
    __m512i b_magnitude_u8x64 = _mm512_and_si512(b_e2m3_u8x64, magnitude_mask_u8x64);
    __m512i a_shuffle_index_u8x64 = _mm512_and_si512(a_magnitude_u8x64, nibble_mask_u8x64);
    __m512i b_shuffle_index_u8x64 = _mm512_and_si512(b_magnitude_u8x64, nibble_mask_u8x64);

    // Bit-4 select via kmask (cleaner than Haswell's vector compare)
    __mmask64 a_upper_select = _mm512_test_epi8_mask(a_magnitude_u8x64, half_select_u8x64);
    __mmask64 b_upper_select = _mm512_test_epi8_mask(b_magnitude_u8x64, half_select_u8x64);

    // Dual VPSHUFB + mask-blend for 32-entry LUT
    __m512i a_unsigned_u8x64 = _mm512_mask_blend_epi8(a_upper_select,
                                                      _mm512_shuffle_epi8(lut_lower_u8x64, a_shuffle_index_u8x64),
                                                      _mm512_shuffle_epi8(lut_upper_u8x64, a_shuffle_index_u8x64));
    __m512i b_unsigned_u8x64 = _mm512_mask_blend_epi8(b_upper_select,
                                                      _mm512_shuffle_epi8(lut_lower_u8x64, b_shuffle_index_u8x64),
                                                      _mm512_shuffle_epi8(lut_upper_u8x64, b_shuffle_index_u8x64));

    // Combined sign: (a ^ b) & 0x20, negate b where signs differ using kmask
    __m512i sign_combined_u8x64 = _mm512_and_si512(_mm512_xor_si512(a_e2m3_u8x64, b_e2m3_u8x64), sign_mask_u8x64);
    __mmask64 negate_mask = _mm512_test_epi8_mask(sign_combined_u8x64, sign_combined_u8x64);
    __m512i b_signed_i8x64 = _mm512_mask_sub_epi8(b_unsigned_u8x64, negate_mask, _mm512_setzero_si512(),
                                                  b_unsigned_u8x64);

    // VPMADDUBSW: a_unsigned[u8] × b_signed[i8] → i16 pairs
    __m512i products_i16x32 = _mm512_maddubs_epi16(a_unsigned_u8x64, b_signed_i8x64);
    // VPMADDWD with ones: i16 pairs → i32
    sum_i32x16 = _mm512_add_epi32(sum_i32x16, _mm512_madd_epi16(products_i16x32, ones_i16x32));

    if (count_scalars) goto nk_dot_e2m3_skylake_cycle;
    // Divide by 256 to undo the fixed-point scaling baked into the LUT entries.
    *result = (nk_f32_t)_mm512_reduce_add_epi32(sum_i32x16) / 256.0f;
}
|
|
653
|
+
|
|
654
|
+
NK_PUBLIC void nk_dot_e3m2_skylake(nk_e3m2_t const *a_scalars, nk_e3m2_t const *b_scalars, nk_size_t count_scalars,
                                   nk_f32_t *result) {
    // Integer dot product for e3m2 using dual-VPSHUFB (low-byte LUT) + VPMADDWD (i16×i16→i32).
    // 64 elements per iteration using AVX-512BW. Magnitudes reach 448, requiring i16.
    // Result = i32_dot / 256.0f (exact, no rounding error).
    //
    // Low-byte LUTs, replicated 4× for per-128-bit-lane VPSHUFB; the high byte is
    // synthesized separately below (it is 1 only for magnitudes >= 28).
    __m512i const lut_lo_lower_u8x64 = _mm512_set_epi8( //
        28, 24, 20, 16, 14, 12, 10, 8, 7, 6, 5, 4, 3, 2, 1, 0, //
        28, 24, 20, 16, 14, 12, 10, 8, 7, 6, 5, 4, 3, 2, 1, 0, //
        28, 24, 20, 16, 14, 12, 10, 8, 7, 6, 5, 4, 3, 2, 1, 0, //
        28, 24, 20, 16, 14, 12, 10, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    __m512i const lut_lo_upper_u8x64 = _mm512_set_epi8( //
        (char)192, (char)128, 64, 0, (char)224, (char)192, (char)160, (char)128, 112, 96, 80, 64, 56, 48, 40, 32, //
        (char)192, (char)128, 64, 0, (char)224, (char)192, (char)160, (char)128, 112, 96, 80, 64, 56, 48, 40, 32, //
        (char)192, (char)128, 64, 0, (char)224, (char)192, (char)160, (char)128, 112, 96, 80, 64, 56, 48, 40, 32, //
        (char)192, (char)128, 64, 0, (char)224, (char)192, (char)160, (char)128, 112, 96, 80, 64, 56, 48, 40, 32);
    __m512i const nibble_mask_u8x64 = _mm512_set1_epi8(0x0F);    // low 4 bits: VPSHUFB index
    __m512i const magnitude_mask_u8x64 = _mm512_set1_epi8(0x1F); // bits 0-4: magnitude
    __m512i const half_select_u8x64 = _mm512_set1_epi8(0x10);    // bit 4: lower/upper LUT half
    __m512i const sign_mask_u8x64 = _mm512_set1_epi8(0x20);      // bit 5: sign
    __m512i const ones_u8x64 = _mm512_set1_epi8(1);
    __m512i sum_i32x16 = _mm512_setzero_si512();
    __m512i a_e3m2_u8x64, b_e3m2_u8x64;

nk_dot_e3m2_skylake_cycle:
    if (count_scalars < 64) {
        // Masked tail load zero-fills the inactive bytes; zero maps to a zero product.
        __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, count_scalars);
        a_e3m2_u8x64 = _mm512_maskz_loadu_epi8(mask, a_scalars);
        b_e3m2_u8x64 = _mm512_maskz_loadu_epi8(mask, b_scalars);
        count_scalars = 0;
    }
    else {
        a_e3m2_u8x64 = _mm512_loadu_si512((__m512i const *)a_scalars);
        b_e3m2_u8x64 = _mm512_loadu_si512((__m512i const *)b_scalars);
        a_scalars += 64, b_scalars += 64, count_scalars -= 64;
    }

    // Extract 5-bit magnitude, split into low 4 bits and bit 4
    __m512i a_magnitude_u8x64 = _mm512_and_si512(a_e3m2_u8x64, magnitude_mask_u8x64);
    __m512i b_magnitude_u8x64 = _mm512_and_si512(b_e3m2_u8x64, magnitude_mask_u8x64);
    __m512i a_shuffle_index_u8x64 = _mm512_and_si512(a_magnitude_u8x64, nibble_mask_u8x64);
    __m512i b_shuffle_index_u8x64 = _mm512_and_si512(b_magnitude_u8x64, nibble_mask_u8x64);

    // Bit-4 select via kmask
    __mmask64 a_upper_select = _mm512_test_epi8_mask(a_magnitude_u8x64, half_select_u8x64);
    __mmask64 b_upper_select = _mm512_test_epi8_mask(b_magnitude_u8x64, half_select_u8x64);

    // Dual VPSHUFB + mask-blend for low bytes
    __m512i a_lo_bytes_u8x64 = _mm512_mask_blend_epi8(a_upper_select,
                                                      _mm512_shuffle_epi8(lut_lo_lower_u8x64, a_shuffle_index_u8x64),
                                                      _mm512_shuffle_epi8(lut_lo_upper_u8x64, a_shuffle_index_u8x64));
    __m512i b_lo_bytes_u8x64 = _mm512_mask_blend_epi8(b_upper_select,
                                                      _mm512_shuffle_epi8(lut_lo_lower_u8x64, b_shuffle_index_u8x64),
                                                      _mm512_shuffle_epi8(lut_lo_upper_u8x64, b_shuffle_index_u8x64));

    // High byte: 1 iff magnitude >= 28 (unsigned compare via _mm512_cmpge_epu8_mask)
    __mmask64 a_hi_mask = _mm512_cmpge_epu8_mask(a_magnitude_u8x64, _mm512_set1_epi8(28));
    __mmask64 b_hi_mask = _mm512_cmpge_epu8_mask(b_magnitude_u8x64, _mm512_set1_epi8(28));
    __m512i a_hi_bytes_u8x64 = _mm512_maskz_mov_epi8(a_hi_mask, ones_u8x64);
    __m512i b_hi_bytes_u8x64 = _mm512_maskz_mov_epi8(b_hi_mask, ones_u8x64);

    // Interleave low and high bytes into i16
    __m512i a_lo_i16x32 = _mm512_unpacklo_epi8(a_lo_bytes_u8x64, a_hi_bytes_u8x64);
    __m512i a_hi_i16x32 = _mm512_unpackhi_epi8(a_lo_bytes_u8x64, a_hi_bytes_u8x64);
    __m512i b_lo_i16x32 = _mm512_unpacklo_epi8(b_lo_bytes_u8x64, b_hi_bytes_u8x64);
    __m512i b_hi_i16x32 = _mm512_unpackhi_epi8(b_lo_bytes_u8x64, b_hi_bytes_u8x64);

    // Combined sign: (a ^ b) & 0x20, need to apply at i16 level
    // Compute sign mask at u8 level, widen to match unpacklo/unpackhi ordering via PEXT
    __m512i sign_combined_u8x64 = _mm512_and_si512(_mm512_xor_si512(a_e3m2_u8x64, b_e3m2_u8x64), sign_mask_u8x64);
    __mmask64 negate_u8_mask = _mm512_test_epi8_mask(sign_combined_u8x64, sign_combined_u8x64);
    // Extract bits matching unpacklo element ordering (bytes 0-7,16-23,32-39,48-55 per 64-byte vector)
    __mmask32 negate_lo_i16 = (__mmask32)_pext_u64(negate_u8_mask, 0x00FF00FF00FF00FFULL);
    __mmask32 negate_hi_i16 = (__mmask32)_pext_u64(negate_u8_mask, 0xFF00FF00FF00FF00ULL);
    // Negate b at i16 level using mask_sub
    __m512i b_signed_lo_i16x32 = _mm512_mask_sub_epi16(b_lo_i16x32, negate_lo_i16, _mm512_setzero_si512(), b_lo_i16x32);
    __m512i b_signed_hi_i16x32 = _mm512_mask_sub_epi16(b_hi_i16x32, negate_hi_i16, _mm512_setzero_si512(), b_hi_i16x32);

    // VPMADDWD: a_i16 × b_signed_i16 → i32 accumulator
    sum_i32x16 = _mm512_add_epi32(sum_i32x16, _mm512_madd_epi16(a_lo_i16x32, b_signed_lo_i16x32));
    sum_i32x16 = _mm512_add_epi32(sum_i32x16, _mm512_madd_epi16(a_hi_i16x32, b_signed_hi_i16x32));

    if (count_scalars) goto nk_dot_e3m2_skylake_cycle;
    // Divide by 256 to undo the fixed-point scaling baked into the LUT entries.
    *result = (nk_f32_t)_mm512_reduce_add_epi32(sum_i32x16) / 256.0f;
}
|
|
739
|
+
|
|
740
|
+
#pragma endregion - Smaller Floats
|
|
741
|
+
|
|
742
|
+
#pragma region - Small Integers
|
|
743
|
+
|
|
744
|
+
NK_PUBLIC void nk_dot_i8_skylake(nk_i8_t const *a_scalars, nk_i8_t const *b_scalars, nk_size_t count_scalars,
|
|
745
|
+
nk_i32_t *result) {
|
|
746
|
+
__m512i sum_i32x16 = _mm512_setzero_si512();
|
|
747
|
+
nk_size_t idx_scalars = 0;
|
|
748
|
+
for (; idx_scalars + 32 <= count_scalars; idx_scalars += 32) {
|
|
749
|
+
// Load 32 bytes at a time and widen to i16
|
|
750
|
+
__m256i a_i8x32 = _mm256_loadu_si256((__m256i const *)(a_scalars + idx_scalars));
|
|
751
|
+
__m256i b_i8x32 = _mm256_loadu_si256((__m256i const *)(b_scalars + idx_scalars));
|
|
752
|
+
__m512i a_i16x32 = _mm512_cvtepi8_epi16(a_i8x32);
|
|
753
|
+
__m512i b_i16x32 = _mm512_cvtepi8_epi16(b_i8x32);
|
|
754
|
+
// VPMADDWD: 5cy (0.5/cy) @ p05 - multiply adjacent i16 pairs, add to i32
|
|
755
|
+
sum_i32x16 = _mm512_add_epi32(sum_i32x16, _mm512_madd_epi16(a_i16x32, b_i16x32));
|
|
756
|
+
}
|
|
757
|
+
nk_i32_t sum = _mm512_reduce_add_epi32(sum_i32x16);
|
|
758
|
+
for (; idx_scalars < count_scalars; ++idx_scalars) sum += (nk_i32_t)a_scalars[idx_scalars] * b_scalars[idx_scalars];
|
|
759
|
+
*result = sum;
|
|
760
|
+
}
|
|
761
|
+
|
|
762
|
+
NK_PUBLIC void nk_dot_u8_skylake(nk_u8_t const *a_scalars, nk_u8_t const *b_scalars, nk_size_t count_scalars,
|
|
763
|
+
nk_u32_t *result) {
|
|
764
|
+
__m512i sum_i32x16 = _mm512_setzero_si512();
|
|
765
|
+
nk_size_t idx_scalars = 0;
|
|
766
|
+
for (; idx_scalars + 32 <= count_scalars; idx_scalars += 32) {
|
|
767
|
+
// Load 32 bytes and zero-extend to i16 (u8 → u16 via zero-extension)
|
|
768
|
+
__m256i a_u8x32 = _mm256_loadu_si256((__m256i const *)(a_scalars + idx_scalars));
|
|
769
|
+
__m256i b_u8x32 = _mm256_loadu_si256((__m256i const *)(b_scalars + idx_scalars));
|
|
770
|
+
__m512i a_u16x32 = _mm512_cvtepu8_epi16(a_u8x32);
|
|
771
|
+
__m512i b_u16x32 = _mm512_cvtepu8_epi16(b_u8x32);
|
|
772
|
+
// VPMADDWD: 5cy (0.5/cy) @ p05 - multiply adjacent i16 pairs, add to i32
|
|
773
|
+
sum_i32x16 = _mm512_add_epi32(sum_i32x16, _mm512_madd_epi16(a_u16x32, b_u16x32));
|
|
774
|
+
}
|
|
775
|
+
nk_u32_t sum = (nk_u32_t)_mm512_reduce_add_epi32(sum_i32x16);
|
|
776
|
+
for (; idx_scalars < count_scalars; ++idx_scalars) sum += (nk_u32_t)a_scalars[idx_scalars] * b_scalars[idx_scalars];
|
|
777
|
+
*result = sum;
|
|
778
|
+
}
|
|
779
|
+
|
|
780
|
+
// Per-accumulator state for the compensated (Dot2) `f64` dot-product kernels.
typedef struct nk_dot_f64x8_state_skylake_t {
    __m512d sum_f64x8;          // running lane-wise partial sums
    __m512d compensation_f64x8; // accumulated rounding errors for each lane of `sum_f64x8`
} nk_dot_f64x8_state_skylake_t;
|
|
784
|
+
|
|
785
|
+
NK_INTERNAL void nk_dot_f64x8_init_skylake(nk_dot_f64x8_state_skylake_t *state) {
    // A fresh state starts with zeroed running sums and a zeroed error term.
    __m512d const zeros_f64x8 = _mm512_setzero_pd();
    state->sum_f64x8 = zeros_f64x8;
    state->compensation_f64x8 = zeros_f64x8;
}
|
|
789
|
+
|
|
790
|
+
NK_INTERNAL void nk_dot_f64x8_update_skylake(nk_dot_f64x8_state_skylake_t *state, nk_b512_vec_t a, nk_b512_vec_t b,
                                             nk_size_t depth_offset, nk_size_t active_dimensions) {
    // Folds one 8-lane `f64` slice of each operand into the compensated (Dot2) running state.
    // `depth_offset` and `active_dimensions` are unused here - NOTE(review): presumably inactive
    // lanes arrive zeroed by the caller, which contributes nothing to sum or compensation; confirm.
    nk_unused_(depth_offset);
    nk_unused_(active_dimensions);
    __m512d sum_f64x8 = state->sum_f64x8;
    __m512d compensation_f64x8 = state->compensation_f64x8;
    __m512d a_f64x8 = a.zmm_pd;
    __m512d b_f64x8 = b.zmm_pd;

    // TwoProd: h = a * b, r = fma(a, b, -h) captures the rounding error
    __m512d product_f64x8 = _mm512_mul_pd(a_f64x8, b_f64x8);
    __m512d product_error_f64x8 = _mm512_fmsub_pd(a_f64x8, b_f64x8, product_f64x8);

    // TwoSum: (t, q) = TwoSum(sum, h) where t = sum + h rounded, q = error
    // The exact operation order below is load-bearing; do not re-associate.
    __m512d tentative_sum_f64x8 = _mm512_add_pd(sum_f64x8, product_f64x8);
    __m512d virtual_addend_f64x8 = _mm512_sub_pd(tentative_sum_f64x8, sum_f64x8);
    __m512d sum_error_f64x8 = _mm512_add_pd(
        _mm512_sub_pd(sum_f64x8, _mm512_sub_pd(tentative_sum_f64x8, virtual_addend_f64x8)),
        _mm512_sub_pd(product_f64x8, virtual_addend_f64x8));

    // Update: sum = t, compensation += q + r
    state->sum_f64x8 = tentative_sum_f64x8;
    state->compensation_f64x8 = _mm512_add_pd(compensation_f64x8, _mm512_add_pd(sum_error_f64x8, product_error_f64x8));
}
|
|
814
|
+
|
|
815
|
+
// Reduce four compensated f64x8 dot-product states into four scalar results.
// Each state carries a (sum, compensation) vector pair from the Dot2 update;
// the stable-sum helper folds the compensation into the horizontal reduction
// so the accumulated error tracking is preserved to the very end.
NK_INTERNAL void nk_dot_f64x8_finalize_skylake( //
    nk_dot_f64x8_state_skylake_t const *state_a, nk_dot_f64x8_state_skylake_t const *state_b, //
    nk_dot_f64x8_state_skylake_t const *state_c, nk_dot_f64x8_state_skylake_t const *state_d, //
    nk_size_t total_dimensions, nk_b256_vec_t *result) {
    nk_unused_(total_dimensions); // reduction does not depend on the depth processed
    // Compensated horizontal reduction preserving Dot2 error tracking per state
    result->f64s[0] = nk_dot_stable_sum_f64x8_skylake_(state_a->sum_f64x8, state_a->compensation_f64x8);
    result->f64s[1] = nk_dot_stable_sum_f64x8_skylake_(state_b->sum_f64x8, state_b->compensation_f64x8);
    result->f64s[2] = nk_dot_stable_sum_f64x8_skylake_(state_c->sum_f64x8, state_c->compensation_f64x8);
    result->f64s[3] = nk_dot_stable_sum_f64x8_skylake_(state_d->sum_f64x8, state_d->compensation_f64x8);
}
|
|
826
|
+
|
|
827
|
+
// Running state for an 8-lane f32 dot product. Inputs are widened to f64
// before accumulation (see the update function), so unlike the f64 kernel
// no separate compensation vector is needed.
typedef struct nk_dot_f32x8_state_skylake_t {
    __m512d sum_f64x8; // eight independent f64 partial sums
} nk_dot_f32x8_state_skylake_t;
|
|
830
|
+
|
|
831
|
+
// Zero the f64 accumulator before starting a fresh dot-product pass.
NK_INTERNAL void nk_dot_f32x8_init_skylake(nk_dot_f32x8_state_skylake_t *state) {
    state->sum_f64x8 = _mm512_setzero_pd();
}
|
|
834
|
+
|
|
835
|
+
NK_INTERNAL void nk_dot_f32x8_update_skylake(nk_dot_f32x8_state_skylake_t *state, nk_b256_vec_t a, nk_b256_vec_t b,
                                             nk_size_t depth_offset, nk_size_t active_dimensions) {
    nk_unused_(depth_offset);
    nk_unused_(active_dimensions);
    // Widen both 8-lane f32 inputs to f64, so every f32 product is exact
    // and the accumulation only rounds once per lane per step.
    __m512d widened_a = _mm512_cvtps_pd(a.ymm_ps);
    __m512d widened_b = _mm512_cvtps_pd(b.ymm_ps);
    // Plain fused multiply-add into the running f64 sums.
    state->sum_f64x8 = _mm512_fmadd_pd(widened_a, widened_b, state->sum_f64x8);
}
|
|
845
|
+
|
|
846
|
+
// Horizontally reduce four f32-through-f64 dot-product states into four f64
// scalars, packed as result->ymm_pd = [sum_a, sum_b, sum_c, sum_d].
// All four states are reduced in lock-step at each width to maximize
// instruction-level parallelism.
NK_INTERNAL void nk_dot_f32x8_finalize_skylake( //
    nk_dot_f32x8_state_skylake_t const *state_a, nk_dot_f32x8_state_skylake_t const *state_b, //
    nk_dot_f32x8_state_skylake_t const *state_c, nk_dot_f32x8_state_skylake_t const *state_d, //
    nk_size_t total_dimensions, nk_b256_vec_t *result) {
    nk_unused_(total_dimensions); // plain FMA accumulation needs no depth-based rescaling
    // ILP-optimized 4-way horizontal reduction for f64
    // Step 1: 8->4 for all 4 states (extract high 256-bit half and add to low half)
    __m256d sum_a_f64x4 = _mm256_add_pd(_mm512_castpd512_pd256(state_a->sum_f64x8),
                                        _mm512_extractf64x4_pd(state_a->sum_f64x8, 1));
    __m256d sum_b_f64x4 = _mm256_add_pd(_mm512_castpd512_pd256(state_b->sum_f64x8),
                                        _mm512_extractf64x4_pd(state_b->sum_f64x8, 1));
    __m256d sum_c_f64x4 = _mm256_add_pd(_mm512_castpd512_pd256(state_c->sum_f64x8),
                                        _mm512_extractf64x4_pd(state_c->sum_f64x8, 1));
    __m256d sum_d_f64x4 = _mm256_add_pd(_mm512_castpd512_pd256(state_d->sum_f64x8),
                                        _mm512_extractf64x4_pd(state_d->sum_f64x8, 1));
    // Step 2: 4->2 for all 4 states (extract high 128-bit half and add to low half)
    __m128d sum_a_f64x2 = _mm_add_pd(_mm256_castpd256_pd128(sum_a_f64x4), _mm256_extractf128_pd(sum_a_f64x4, 1));
    __m128d sum_b_f64x2 = _mm_add_pd(_mm256_castpd256_pd128(sum_b_f64x4), _mm256_extractf128_pd(sum_b_f64x4, 1));
    __m128d sum_c_f64x2 = _mm_add_pd(_mm256_castpd256_pd128(sum_c_f64x4), _mm256_extractf128_pd(sum_c_f64x4, 1));
    __m128d sum_d_f64x2 = _mm_add_pd(_mm256_castpd256_pd128(sum_d_f64x4), _mm256_extractf128_pd(sum_d_f64x4, 1));
    // Step 3: Horizontal add pairs: [a0+a1, b0+b1] and [c0+c1, d0+d1]
    __m128d sum_ab_f64x2 = _mm_hadd_pd(sum_a_f64x2, sum_b_f64x2);
    __m128d sum_cd_f64x2 = _mm_hadd_pd(sum_c_f64x2, sum_d_f64x2);
    // Low 128 bits hold (a, b), high 128 bits hold (c, d).
    result->ymm_pd = _mm256_set_m128d(sum_cd_f64x2, sum_ab_f64x2);
}
|
|
871
|
+
|
|
872
|
+
#pragma endregion - Traditional Floats
|
|
873
|
+
|
|
874
|
+
// bf16 and f16 kernels accumulate through f32 and share a single state layout.
typedef nk_dot_through_f32_state_skylake_t_ nk_dot_bf16x16_state_skylake_t;

typedef nk_dot_through_f32_state_skylake_t_ nk_dot_f16x16_state_skylake_t;
|
|
877
|
+
|
|
878
|
+
// Running state for a 64-lane e2m3 mini-float dot product. Operands are
// decoded to small integers (value x16), so a single i32 accumulator suffices;
// the x256 product scale is removed in the finalize step.
typedef struct nk_dot_e2m3x64_state_skylake_t {
    __m512i sum_i32x16; // sixteen i32 partial sums of scaled products
} nk_dot_e2m3x64_state_skylake_t;
|
|
881
|
+
|
|
882
|
+
// Zero the integer accumulator before starting a fresh dot-product pass.
NK_INTERNAL void nk_dot_e2m3x64_init_skylake(nk_dot_e2m3x64_state_skylake_t *state) {
    state->sum_i32x16 = _mm512_setzero_si512();
}
|
|
885
|
+
|
|
886
|
+
// Accumulate 64 e2m3 (sign + 5-bit magnitude) products per call.
// Each magnitude is decoded to its exact integer value x16 via two 16-entry
// byte-shuffle LUTs (magnitudes 0-15 -> lut_lower, 16-31 -> lut_upper), so
// every product carries a x256 scale that finalize divides out.
NK_INTERNAL void nk_dot_e2m3x64_update_skylake(nk_dot_e2m3x64_state_skylake_t *state, nk_b512_vec_t a, nk_b512_vec_t b,
                                               nk_size_t depth_offset, nk_size_t active_dimensions) {
    nk_unused_(depth_offset);
    nk_unused_(active_dimensions);
    // Magnitudes 0-15: subnormals and first normals, value x16 = {0, 2, ..., 30}.
    __m512i const lut_lower_u8x64 = _mm512_set_epi8(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28,
                                                    26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24,
                                                    22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20,
                                                    18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    // Magnitudes 16-31: value x16 = {32, 36, ..., 64, 72, ..., 120} (max 7.5 x 16).
    __m512i const lut_upper_u8x64 = _mm512_set_epi8(120, 112, 104, 96, 88, 80, 72, 64, 60, 56, 52, 48, 44, 40, 36, 32,
                                                    120, 112, 104, 96, 88, 80, 72, 64, 60, 56, 52, 48, 44, 40, 36, 32,
                                                    120, 112, 104, 96, 88, 80, 72, 64, 60, 56, 52, 48, 44, 40, 36, 32,
                                                    120, 112, 104, 96, 88, 80, 72, 64, 60, 56, 52, 48, 44, 40, 36, 32);
    __m512i const nibble_mask_u8x64 = _mm512_set1_epi8(0x0F);    // low 4 bits select the LUT slot
    __m512i const magnitude_mask_u8x64 = _mm512_set1_epi8(0x1F); // bits 0-4: magnitude code
    __m512i const half_select_u8x64 = _mm512_set1_epi8(0x10);    // bit 4: lower vs. upper LUT
    __m512i const sign_mask_u8x64 = _mm512_set1_epi8(0x20);      // bit 5: sign
    __m512i const ones_i16x32 = _mm512_set1_epi16(1);

    __m512i a_u8x64 = a.zmm;
    __m512i b_u8x64 = b.zmm;

    __m512i a_magnitude = _mm512_and_si512(a_u8x64, magnitude_mask_u8x64);
    __m512i b_magnitude = _mm512_and_si512(b_u8x64, magnitude_mask_u8x64);
    __m512i a_shuffle_idx = _mm512_and_si512(a_magnitude, nibble_mask_u8x64);
    __m512i b_shuffle_idx = _mm512_and_si512(b_magnitude, nibble_mask_u8x64);

    __mmask64 a_upper = _mm512_test_epi8_mask(a_magnitude, half_select_u8x64);
    __mmask64 b_upper = _mm512_test_epi8_mask(b_magnitude, half_select_u8x64);

    // Decode both halves and blend per byte according to the half-select bit.
    __m512i a_unsigned = _mm512_mask_blend_epi8(a_upper, _mm512_shuffle_epi8(lut_lower_u8x64, a_shuffle_idx),
                                                _mm512_shuffle_epi8(lut_upper_u8x64, a_shuffle_idx));
    __m512i b_unsigned = _mm512_mask_blend_epi8(b_upper, _mm512_shuffle_epi8(lut_lower_u8x64, b_shuffle_idx),
                                                _mm512_shuffle_epi8(lut_upper_u8x64, b_shuffle_idx));

    // Fold the combined sign (a XOR b) into b only, keeping a unsigned for maddubs.
    __m512i sign_combined = _mm512_and_si512(_mm512_xor_si512(a_u8x64, b_u8x64), sign_mask_u8x64);
    __mmask64 negate_mask = _mm512_test_epi8_mask(sign_combined, sign_combined);
    __m512i b_signed = _mm512_mask_sub_epi8(b_unsigned, negate_mask, _mm512_setzero_si512(), b_unsigned);

    // maddubs: unsigned a x signed b, pairwise i16 sums. Max |pair| = 2 * 120 * 120
    // = 28800 < 32767, so the saturating add never saturates here.
    __m512i products_i16x32 = _mm512_maddubs_epi16(a_unsigned, b_signed);
    state->sum_i32x16 = _mm512_add_epi32(state->sum_i32x16, _mm512_madd_epi16(products_i16x32, ones_i16x32));
}
|
|
927
|
+
|
|
928
|
+
// Reduce four e2m3 integer dot-product states into four f32 results, packed
// as f32 bit patterns in results->xmm. The integer sums carry a x256 scale
// (x16 per operand from the decode LUTs), removed by the final 1/256 multiply.
NK_INTERNAL void nk_dot_e2m3x64_finalize_skylake( //
    nk_dot_e2m3x64_state_skylake_t const *state_a, nk_dot_e2m3x64_state_skylake_t const *state_b, //
    nk_dot_e2m3x64_state_skylake_t const *state_c, nk_dot_e2m3x64_state_skylake_t const *state_d, //
    nk_size_t total_dimensions, nk_b128_vec_t *results) {
    nk_unused_(total_dimensions); // exact integer accumulation; no depth-based correction

    // 16→8 for all 4 states (extract high 256-bit half and add to low half)
    __m256i sum_a_i32x8 = _mm256_add_epi32(_mm512_castsi512_si256(state_a->sum_i32x16),
                                           _mm512_extracti32x8_epi32(state_a->sum_i32x16, 1));
    __m256i sum_b_i32x8 = _mm256_add_epi32(_mm512_castsi512_si256(state_b->sum_i32x16),
                                           _mm512_extracti32x8_epi32(state_b->sum_i32x16, 1));
    __m256i sum_c_i32x8 = _mm256_add_epi32(_mm512_castsi512_si256(state_c->sum_i32x16),
                                           _mm512_extracti32x8_epi32(state_c->sum_i32x16, 1));
    __m256i sum_d_i32x8 = _mm256_add_epi32(_mm512_castsi512_si256(state_d->sum_i32x16),
                                           _mm512_extracti32x8_epi32(state_d->sum_i32x16, 1));

    // 8→4: extract high 128-bit half and add to low half
    __m128i sum_a_i32x4 = _mm_add_epi32(_mm256_castsi256_si128(sum_a_i32x8), _mm256_extracti128_si256(sum_a_i32x8, 1));
    __m128i sum_b_i32x4 = _mm_add_epi32(_mm256_castsi256_si128(sum_b_i32x8), _mm256_extracti128_si256(sum_b_i32x8, 1));
    __m128i sum_c_i32x4 = _mm_add_epi32(_mm256_castsi256_si128(sum_c_i32x8), _mm256_extracti128_si256(sum_c_i32x8, 1));
    __m128i sum_d_i32x4 = _mm_add_epi32(_mm256_castsi256_si128(sum_d_i32x8), _mm256_extracti128_si256(sum_d_i32x8, 1));

    // 4×4 transpose and reduce (same as Sierra/Haswell integer finalize):
    // after the transpose, lane k of every vector belongs to one state, so
    // the final vertical adds complete all four horizontal sums at once.
    __m128i transpose_ab_low_i32x4 = _mm_unpacklo_epi32(sum_a_i32x4, sum_b_i32x4);
    __m128i transpose_cd_low_i32x4 = _mm_unpacklo_epi32(sum_c_i32x4, sum_d_i32x4);
    __m128i transpose_ab_high_i32x4 = _mm_unpackhi_epi32(sum_a_i32x4, sum_b_i32x4);
    __m128i transpose_cd_high_i32x4 = _mm_unpackhi_epi32(sum_c_i32x4, sum_d_i32x4);
    __m128i lane0_i32x4 = _mm_unpacklo_epi64(transpose_ab_low_i32x4, transpose_cd_low_i32x4);
    __m128i lane1_i32x4 = _mm_unpackhi_epi64(transpose_ab_low_i32x4, transpose_cd_low_i32x4);
    __m128i lane2_i32x4 = _mm_unpacklo_epi64(transpose_ab_high_i32x4, transpose_cd_high_i32x4);
    __m128i lane3_i32x4 = _mm_unpackhi_epi64(transpose_ab_high_i32x4, transpose_cd_high_i32x4);
    __m128i sum_i32x4 = _mm_add_epi32(_mm_add_epi32(lane0_i32x4, lane1_i32x4), _mm_add_epi32(lane2_i32x4, lane3_i32x4));

    // Convert to f32 and undo the x256 decode scale; store raw f32 bits.
    __m128 sum_f32x4 = _mm_mul_ps(_mm_cvtepi32_ps(sum_i32x4), _mm_set1_ps(1.0f / 256.0f));
    results->xmm = _mm_castps_si128(sum_f32x4);
}
|
|
964
|
+
|
|
965
|
+
// Running state for a 64-lane e3m2 mini-float dot product. The update step
// splits each 64-byte input into low/high interleaved i16 halves, so two
// independent i32 accumulators are kept and merged in finalize.
typedef struct nk_dot_e3m2x64_state_skylake_t {
    __m512i sum_a_i32x16; // partial sums from the unpacklo (low-byte) half
    __m512i sum_b_i32x16; // partial sums from the unpackhi (high-byte) half
} nk_dot_e3m2x64_state_skylake_t;
|
|
969
|
+
|
|
970
|
+
// Zero both integer accumulators before starting a fresh dot-product pass.
NK_INTERNAL void nk_dot_e3m2x64_init_skylake(nk_dot_e3m2x64_state_skylake_t *state) {
    state->sum_a_i32x16 = _mm512_setzero_si512();
    state->sum_b_i32x16 = _mm512_setzero_si512();
}
|
|
974
|
+
|
|
975
|
+
// Accumulate 64 e3m2 (sign + 5-bit magnitude) products per call.
// Magnitudes decode to their exact integer value x16, which for the top codes
// (magnitude >= 28, values 256..448) no longer fits a byte: the LUTs supply
// the low byte and a separate 0/1 high byte is interleaved to form i16 lanes.
// Products therefore carry a x256 scale that finalize divides out.
NK_INTERNAL void nk_dot_e3m2x64_update_skylake(nk_dot_e3m2x64_state_skylake_t *state, nk_b512_vec_t a, nk_b512_vec_t b,
                                               nk_size_t depth_offset, nk_size_t active_dimensions) {
    nk_unused_(depth_offset);
    nk_unused_(active_dimensions);
    // Magnitudes 0-15: value x16 = {0..7, 8, 10, 12, 14, 16, 20, 24, 28}.
    __m512i const lut_lo_lower_u8x64 = _mm512_set_epi8( //
        28, 24, 20, 16, 14, 12, 10, 8, 7, 6, 5, 4, 3, 2, 1, 0, 28, 24, 20, 16, 14, 12, 10, 8, 7, 6, 5, 4, 3, 2, 1, 0, //
        28, 24, 20, 16, 14, 12, 10, 8, 7, 6, 5, 4, 3, 2, 1, 0, 28, 24, 20, 16, 14, 12, 10, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    // Magnitudes 16-31: low byte of value x16 = {32..224, then 0, 64, 128, 192}
    // (the last four pair with a high byte of 1, i.e. 256, 320, 384, 448).
    __m512i const lut_lo_upper_u8x64 = _mm512_set_epi8( //
        (char)192, (char)128, 64, 0, (char)224, (char)192, (char)160, (char)128, 112, 96, 80, 64, 56, 48, 40, 32, //
        (char)192, (char)128, 64, 0, (char)224, (char)192, (char)160, (char)128, 112, 96, 80, 64, 56, 48, 40, 32, //
        (char)192, (char)128, 64, 0, (char)224, (char)192, (char)160, (char)128, 112, 96, 80, 64, 56, 48, 40, 32, //
        (char)192, (char)128, 64, 0, (char)224, (char)192, (char)160, (char)128, 112, 96, 80, 64, 56, 48, 40, 32);
    __m512i const nibble_mask_u8x64 = _mm512_set1_epi8(0x0F);    // low 4 bits select the LUT slot
    __m512i const magnitude_mask_u8x64 = _mm512_set1_epi8(0x1F); // bits 0-4: magnitude code
    __m512i const half_select_u8x64 = _mm512_set1_epi8(0x10);    // bit 4: lower vs. upper LUT
    __m512i const sign_mask_u8x64 = _mm512_set1_epi8(0x20);      // bit 5: sign
    __m512i const ones_u8x64 = _mm512_set1_epi8(1);

    __m512i a_u8x64 = a.zmm;
    __m512i b_u8x64 = b.zmm;

    __m512i a_magnitude = _mm512_and_si512(a_u8x64, magnitude_mask_u8x64);
    __m512i b_magnitude = _mm512_and_si512(b_u8x64, magnitude_mask_u8x64);
    __m512i a_shuffle_idx = _mm512_and_si512(a_magnitude, nibble_mask_u8x64);
    __m512i b_shuffle_idx = _mm512_and_si512(b_magnitude, nibble_mask_u8x64);

    __mmask64 a_upper = _mm512_test_epi8_mask(a_magnitude, half_select_u8x64);
    __mmask64 b_upper = _mm512_test_epi8_mask(b_magnitude, half_select_u8x64);

    // Low byte of the decoded value, blended per byte by the half-select bit.
    __m512i a_lo_bytes = _mm512_mask_blend_epi8(a_upper, _mm512_shuffle_epi8(lut_lo_lower_u8x64, a_shuffle_idx),
                                                _mm512_shuffle_epi8(lut_lo_upper_u8x64, a_shuffle_idx));
    __m512i b_lo_bytes = _mm512_mask_blend_epi8(b_upper, _mm512_shuffle_epi8(lut_lo_lower_u8x64, b_shuffle_idx),
                                                _mm512_shuffle_epi8(lut_lo_upper_u8x64, b_shuffle_idx));

    // High byte is 1 exactly for magnitudes >= 28 (decoded values >= 256).
    __mmask64 a_hi_mask = _mm512_cmpge_epu8_mask(a_magnitude, _mm512_set1_epi8(28));
    __mmask64 b_hi_mask = _mm512_cmpge_epu8_mask(b_magnitude, _mm512_set1_epi8(28));
    __m512i a_hi_bytes = _mm512_maskz_mov_epi8(a_hi_mask, ones_u8x64);
    __m512i b_hi_bytes = _mm512_maskz_mov_epi8(b_hi_mask, ones_u8x64);

    // Interleave low/high bytes into i16 lanes (per 128-bit lane: bytes 0-7
    // feed the *_lo_i16 vectors, bytes 8-15 the *_hi_i16 vectors).
    __m512i a_lo_i16 = _mm512_unpacklo_epi8(a_lo_bytes, a_hi_bytes);
    __m512i a_hi_i16 = _mm512_unpackhi_epi8(a_lo_bytes, a_hi_bytes);
    __m512i b_lo_i16 = _mm512_unpacklo_epi8(b_lo_bytes, b_hi_bytes);
    __m512i b_hi_i16 = _mm512_unpackhi_epi8(b_lo_bytes, b_hi_bytes);

    // Combined sign: negate b at i16 level via PEXT + mask_sub.
    // The PEXT patterns pick the byte-mask bits matching the unpacklo/unpackhi
    // byte selection above (low/high 8 bytes of each 128-bit lane).
    __m512i sign_combined = _mm512_and_si512(_mm512_xor_si512(a_u8x64, b_u8x64), sign_mask_u8x64);
    __mmask64 negate_u8 = _mm512_test_epi8_mask(sign_combined, sign_combined);
    __mmask32 negate_lo = (__mmask32)_pext_u64(negate_u8, 0x00FF00FF00FF00FFULL);
    __mmask32 negate_hi = (__mmask32)_pext_u64(negate_u8, 0xFF00FF00FF00FF00ULL);
    __m512i b_signed_lo = _mm512_mask_sub_epi16(b_lo_i16, negate_lo, _mm512_setzero_si512(), b_lo_i16);
    __m512i b_signed_hi = _mm512_mask_sub_epi16(b_hi_i16, negate_hi, _mm512_setzero_si512(), b_hi_i16);

    // madd: i16 x i16 with pairwise i32 sums — exact for |values| <= 448.
    state->sum_a_i32x16 = _mm512_add_epi32(state->sum_a_i32x16, _mm512_madd_epi16(a_lo_i16, b_signed_lo));
    state->sum_b_i32x16 = _mm512_add_epi32(state->sum_b_i32x16, _mm512_madd_epi16(a_hi_i16, b_signed_hi));
}
|
|
1030
|
+
|
|
1031
|
+
// Reduce four e3m2 integer dot-product states into four f32 results, packed
// as f32 bit patterns in results->xmm. Each state's two accumulators (low and
// high unpack halves) are merged first; the x256 decode scale (x16 per
// operand) is removed by the final 1/256 multiply.
NK_INTERNAL void nk_dot_e3m2x64_finalize_skylake( //
    nk_dot_e3m2x64_state_skylake_t const *state_a, nk_dot_e3m2x64_state_skylake_t const *state_b, //
    nk_dot_e3m2x64_state_skylake_t const *state_c, nk_dot_e3m2x64_state_skylake_t const *state_d, //
    nk_size_t total_dimensions, nk_b128_vec_t *results) {
    nk_unused_(total_dimensions); // exact integer accumulation; no depth-based correction

    // Merge two accumulators per state
    __m512i merged_a = _mm512_add_epi32(state_a->sum_a_i32x16, state_a->sum_b_i32x16);
    __m512i merged_b = _mm512_add_epi32(state_b->sum_a_i32x16, state_b->sum_b_i32x16);
    __m512i merged_c = _mm512_add_epi32(state_c->sum_a_i32x16, state_c->sum_b_i32x16);
    __m512i merged_d = _mm512_add_epi32(state_d->sum_a_i32x16, state_d->sum_b_i32x16);

    // 16→8
    __m256i sum_a_i32x8 = _mm256_add_epi32(_mm512_castsi512_si256(merged_a), _mm512_extracti32x8_epi32(merged_a, 1));
    __m256i sum_b_i32x8 = _mm256_add_epi32(_mm512_castsi512_si256(merged_b), _mm512_extracti32x8_epi32(merged_b, 1));
    __m256i sum_c_i32x8 = _mm256_add_epi32(_mm512_castsi512_si256(merged_c), _mm512_extracti32x8_epi32(merged_c, 1));
    __m256i sum_d_i32x8 = _mm256_add_epi32(_mm512_castsi512_si256(merged_d), _mm512_extracti32x8_epi32(merged_d, 1));

    // 8→4
    __m128i sum_a_i32x4 = _mm_add_epi32(_mm256_castsi256_si128(sum_a_i32x8), _mm256_extracti128_si256(sum_a_i32x8, 1));
    __m128i sum_b_i32x4 = _mm_add_epi32(_mm256_castsi256_si128(sum_b_i32x8), _mm256_extracti128_si256(sum_b_i32x8, 1));
    __m128i sum_c_i32x4 = _mm_add_epi32(_mm256_castsi256_si128(sum_c_i32x8), _mm256_extracti128_si256(sum_c_i32x8, 1));
    __m128i sum_d_i32x4 = _mm_add_epi32(_mm256_castsi256_si128(sum_d_i32x8), _mm256_extracti128_si256(sum_d_i32x8, 1));

    // 4×4 transpose and reduce: after the transpose, lane k of every vector
    // belongs to one state, so the vertical adds finish all four sums at once.
    __m128i transpose_ab_low_i32x4 = _mm_unpacklo_epi32(sum_a_i32x4, sum_b_i32x4);
    __m128i transpose_cd_low_i32x4 = _mm_unpacklo_epi32(sum_c_i32x4, sum_d_i32x4);
    __m128i transpose_ab_high_i32x4 = _mm_unpackhi_epi32(sum_a_i32x4, sum_b_i32x4);
    __m128i transpose_cd_high_i32x4 = _mm_unpackhi_epi32(sum_c_i32x4, sum_d_i32x4);
    __m128i lane0_i32x4 = _mm_unpacklo_epi64(transpose_ab_low_i32x4, transpose_cd_low_i32x4);
    __m128i lane1_i32x4 = _mm_unpackhi_epi64(transpose_ab_low_i32x4, transpose_cd_low_i32x4);
    __m128i lane2_i32x4 = _mm_unpacklo_epi64(transpose_ab_high_i32x4, transpose_cd_high_i32x4);
    __m128i lane3_i32x4 = _mm_unpackhi_epi64(transpose_ab_high_i32x4, transpose_cd_high_i32x4);
    __m128i sum_i32x4 = _mm_add_epi32(_mm_add_epi32(lane0_i32x4, lane1_i32x4), _mm_add_epi32(lane2_i32x4, lane3_i32x4));

    // Convert to f32 and undo the x256 decode scale; store raw f32 bits.
    __m128 sum_f32x4 = _mm_mul_ps(_mm_cvtepi32_ps(sum_i32x4), _mm_set1_ps(1.0f / 256.0f));
    results->xmm = _mm_castps_si128(sum_f32x4);
}
|
|
1069
|
+
|
|
1070
|
+
#pragma endregion - Small Integers
|
|
1071
|
+
|
|
1072
|
+
#if defined(__clang__)
|
|
1073
|
+
#pragma clang attribute pop
|
|
1074
|
+
#elif defined(__GNUC__)
|
|
1075
|
+
#pragma GCC pop_options
|
|
1076
|
+
#endif
|
|
1077
|
+
|
|
1078
|
+
#if defined(__cplusplus)
|
|
1079
|
+
} // extern "C"
|
|
1080
|
+
#endif
|
|
1081
|
+
|
|
1082
|
+
#endif // NK_TARGET_SKYLAKE
|
|
1083
|
+
#endif // NK_TARGET_X86_
|
|
1084
|
+
#endif // NK_DOT_SKYLAKE_H
|