npm - numkong - Versions diffs - 7.4.5 → 7.6.0 - Mend

numkong 7.4.5 → 7.6.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (86)

package/README.md +1 -0
package/binding.gyp +99 -5
package/c/dispatch_e5m2.c +23 -3
package/c/dispatch_f16.c +23 -0
package/c/numkong.c +0 -13
package/include/numkong/attention/sme.h +34 -31
package/include/numkong/capabilities.h +2 -15
package/include/numkong/cast/README.md +3 -0
package/include/numkong/cast/haswell.h +28 -64
package/include/numkong/cast/neon.h +15 -0
package/include/numkong/cast/serial.h +17 -0
package/include/numkong/cast/skylake.h +67 -52
package/include/numkong/cast.h +1 -0
package/include/numkong/curved/smef64.h +82 -62
package/include/numkong/dot/README.md +1 -0
package/include/numkong/dot/haswell.h +92 -13
package/include/numkong/dot/rvvbf16.h +1 -1
package/include/numkong/dot/rvvhalf.h +1 -1
package/include/numkong/dot/serial.h +15 -0
package/include/numkong/dot/skylake.h +61 -14
package/include/numkong/dot/sve.h +6 -5
package/include/numkong/dot/svebfdot.h +2 -1
package/include/numkong/dot/svehalf.h +6 -5
package/include/numkong/dot/svesdot.h +3 -2
package/include/numkong/dots/README.md +2 -0
package/include/numkong/dots/graniteamx.h +1167 -0
package/include/numkong/dots/haswell.h +28 -28
package/include/numkong/dots/sapphireamx.h +1 -1
package/include/numkong/dots/serial.h +33 -11
package/include/numkong/dots/skylake.h +28 -23
package/include/numkong/dots/sme.h +172 -140
package/include/numkong/dots/smebi32.h +14 -11
package/include/numkong/dots/smef64.h +31 -26
package/include/numkong/dots.h +41 -3
package/include/numkong/each/serial.h +39 -0
package/include/numkong/geospatial/haswell.h +1 -1
package/include/numkong/geospatial/neon.h +1 -1
package/include/numkong/geospatial/serial.h +15 -4
package/include/numkong/geospatial/skylake.h +1 -1
package/include/numkong/maxsim/serial.h +15 -0
package/include/numkong/maxsim/sme.h +34 -33
package/include/numkong/mesh/README.md +50 -44
package/include/numkong/mesh/genoa.h +462 -0
package/include/numkong/mesh/haswell.h +806 -933
package/include/numkong/mesh/neon.h +871 -943
package/include/numkong/mesh/neonbfdot.h +382 -522
package/include/numkong/mesh/neonfhm.h +676 -0
package/include/numkong/mesh/rvv.h +404 -319
package/include/numkong/mesh/serial.h +225 -161
package/include/numkong/mesh/skylake.h +1029 -1585
package/include/numkong/mesh/v128relaxed.h +403 -377
package/include/numkong/mesh.h +38 -0
package/include/numkong/reduce/neon.h +29 -0
package/include/numkong/reduce/neonbfdot.h +2 -2
package/include/numkong/reduce/neonfhm.h +4 -4
package/include/numkong/reduce/serial.h +15 -1
package/include/numkong/reduce/sve.h +52 -0
package/include/numkong/reduce.h +4 -0
package/include/numkong/set/sve.h +6 -5
package/include/numkong/sets/smebi32.h +35 -30
package/include/numkong/sparse/serial.h +17 -2
package/include/numkong/sparse/sve2.h +3 -2
package/include/numkong/spatial/genoa.h +0 -68
package/include/numkong/spatial/haswell.h +98 -56
package/include/numkong/spatial/serial.h +15 -0
package/include/numkong/spatial/skylake.h +114 -54
package/include/numkong/spatial/sve.h +7 -6
package/include/numkong/spatial/svebfdot.h +7 -4
package/include/numkong/spatial/svehalf.h +5 -4
package/include/numkong/spatial/svesdot.h +9 -8
package/include/numkong/spatial.h +0 -12
package/include/numkong/spatials/graniteamx.h +301 -0
package/include/numkong/spatials/serial.h +39 -0
package/include/numkong/spatials/skylake.h +2 -2
package/include/numkong/spatials/sme.h +391 -350
package/include/numkong/spatials/smef64.h +79 -70
package/include/numkong/spatials.h +54 -4
package/include/numkong/tensor.hpp +107 -23
package/include/numkong/types.h +59 -0
package/javascript/dist/cjs/numkong.js +13 -0
package/javascript/dist/esm/numkong.js +13 -0
package/javascript/numkong.c +59 -14
package/javascript/numkong.ts +13 -0
package/package.json +7 -7
package/probes/probe.js +2 -2
package/wasm/numkong.wasm +0 -0

package/include/numkong/dots/haswell.h CHANGED Viewed

@@ -115,45 +115,45 @@ nk_define_cross_packed_(dots, bf16, haswell, bf16, bf16, f32, nk_b256_vec_t, nk_
                         nk_partial_store_b32x4_haswell_,
                         /*depth_simd_dimensions=*/16, /*dimensions_per_value=*/1)
-/* E4M3 GEMM: depth_simd_dimensions=8 (8 e4m3s = 8 bytes) → upcasted to 8×f32 (256-bit) */
-nk_define_cross_pack_size_(dots, e4m3, haswell, e4m3, f32, /*norm_value_type=*/f32, /*depth_simd_dimensions=*/8,
+/* E4M3 GEMM: depth_simd_dimensions=32 (byte-level batch; widen inside the update helper) */
+nk_define_cross_pack_size_(dots, e4m3, haswell, e4m3, f32, /*norm_value_type=*/f32, /*depth_simd_dimensions=*/32,
                            /*dimensions_per_value=*/1)
-nk_define_cross_pack_(dots, e4m3, haswell, e4m3, f32, nk_b256_vec_t, nk_load_e4m3x8_to_f32x8_haswell_,
-                      nk_partial_load_e4m3x8_to_f32x8_haswell_, nk_store_b256_haswell_, nk_partial_store_b32x8_serial_,
-                      /*simd_width=*/8, /*norm_value_type=*/f32, nk_dots_reduce_sumsq_e4m3_,
-                      /*depth_simd_dimensions=*/8, /*dimensions_per_value=*/1)
+nk_define_cross_pack_(dots, e4m3, haswell, e4m3, f32, nk_b256_vec_t, nk_load_b256_haswell_,
+                      nk_partial_load_b8x32_serial_, nk_store_b256_haswell_, nk_partial_store_b8x32_serial_,
+                      /*simd_width=*/32, /*norm_value_type=*/f32, nk_dots_reduce_sumsq_e4m3_,
+                      /*depth_simd_dimensions=*/32, /*dimensions_per_value=*/1)
 nk_define_cross_symmetric_(dots, e4m3, haswell, e4m3, f32, nk_b256_vec_t, nk_dot_through_f32_state_haswell_t_,
-                           nk_b128_vec_t, nk_dot_through_f32_init_haswell_, nk_load_e4m3x8_to_f32x8_haswell_,
-                           nk_partial_load_e4m3x8_to_f32x8_haswell_, nk_dot_through_f32_update_haswell_,
+                           nk_b128_vec_t, nk_dot_through_f32_init_haswell_, nk_load_b256_haswell_,
+                           nk_partial_load_b8x32_serial_, nk_dot_e4m3x32_update_haswell_,
                            nk_dot_through_f32_finalize_haswell_, nk_store_b128_haswell_,
                            nk_partial_store_b32x4_haswell_,
-                           /*depth_simd_dimensions=*/8, /*dimensions_per_value=*/1)
+                           /*depth_simd_dimensions=*/32, /*dimensions_per_value=*/1)
 nk_define_cross_packed_(dots, e4m3, haswell, e4m3, f32, f32, nk_b256_vec_t, nk_dot_through_f32_state_haswell_t_,
-                        nk_b128_vec_t, nk_dot_through_f32_init_haswell_, nk_load_e4m3x8_to_f32x8_haswell_,
-                        nk_partial_load_e4m3x8_to_f32x8_haswell_, nk_load_b256_haswell_, nk_partial_load_b32x8_serial_,
-                        nk_dot_through_f32_update_haswell_, nk_dot_through_f32_finalize_haswell_,
-                        nk_store_b128_haswell_, nk_partial_store_b32x4_haswell_,
-                        /*depth_simd_dimensions=*/8, /*dimensions_per_value=*/1)
+                        nk_b128_vec_t, nk_dot_through_f32_init_haswell_, nk_load_b256_haswell_,
+                        nk_partial_load_b8x32_serial_, nk_load_b256_haswell_, nk_partial_load_b8x32_serial_,
+                        nk_dot_e4m3x32_update_haswell_, nk_dot_through_f32_finalize_haswell_, nk_store_b128_haswell_,
+                        nk_partial_store_b32x4_haswell_,
+                        /*depth_simd_dimensions=*/32, /*dimensions_per_value=*/1)
-/* E5M2 GEMM: depth_simd_dimensions=8 (8 e5m2s = 8 bytes) → upcasted to 8×f32 (256-bit) */
-nk_define_cross_pack_size_(dots, e5m2, haswell, e5m2, f32, /*norm_value_type=*/f32, /*depth_simd_dimensions=*/8,
+/* E5M2 GEMM: depth_simd_dimensions=32 (byte-level batch; widen inside the update helper) */
+nk_define_cross_pack_size_(dots, e5m2, haswell, e5m2, f32, /*norm_value_type=*/f32, /*depth_simd_dimensions=*/32,
                            /*dimensions_per_value=*/1)
-nk_define_cross_pack_(dots, e5m2, haswell, e5m2, f32, nk_b256_vec_t, nk_load_e5m2x8_to_f32x8_haswell_,
-                      nk_partial_load_e5m2x8_to_f32x8_haswell_, nk_store_b256_haswell_, nk_partial_store_b32x8_serial_,
-                      /*simd_width=*/8, /*norm_value_type=*/f32, nk_dots_reduce_sumsq_e5m2_,
-                      /*depth_simd_dimensions=*/8, /*dimensions_per_value=*/1)
+nk_define_cross_pack_(dots, e5m2, haswell, e5m2, f32, nk_b256_vec_t, nk_load_b256_haswell_,
+                      nk_partial_load_b8x32_serial_, nk_store_b256_haswell_, nk_partial_store_b8x32_serial_,
+                      /*simd_width=*/32, /*norm_value_type=*/f32, nk_dots_reduce_sumsq_e5m2_,
+                      /*depth_simd_dimensions=*/32, /*dimensions_per_value=*/1)
 nk_define_cross_symmetric_(dots, e5m2, haswell, e5m2, f32, nk_b256_vec_t, nk_dot_through_f32_state_haswell_t_,
-                           nk_b128_vec_t, nk_dot_through_f32_init_haswell_, nk_load_e5m2x8_to_f32x8_haswell_,
-                           nk_partial_load_e5m2x8_to_f32x8_haswell_, nk_dot_through_f32_update_haswell_,
+                           nk_b128_vec_t, nk_dot_through_f32_init_haswell_, nk_load_b256_haswell_,
+                           nk_partial_load_b8x32_serial_, nk_dot_e5m2x32_update_haswell_,
                            nk_dot_through_f32_finalize_haswell_, nk_store_b128_haswell_,
                            nk_partial_store_b32x4_haswell_,
-                           /*depth_simd_dimensions=*/8, /*dimensions_per_value=*/1)
+                           /*depth_simd_dimensions=*/32, /*dimensions_per_value=*/1)
 nk_define_cross_packed_(dots, e5m2, haswell, e5m2, f32, f32, nk_b256_vec_t, nk_dot_through_f32_state_haswell_t_,
-                        nk_b128_vec_t, nk_dot_through_f32_init_haswell_, nk_load_e5m2x8_to_f32x8_haswell_,
-                        nk_partial_load_e5m2x8_to_f32x8_haswell_, nk_load_b256_haswell_, nk_partial_load_b32x8_serial_,
-                        nk_dot_through_f32_update_haswell_, nk_dot_through_f32_finalize_haswell_,
-                        nk_store_b128_haswell_, nk_partial_store_b32x4_haswell_,
-                        /*depth_simd_dimensions=*/8, /*dimensions_per_value=*/1)
+                        nk_b128_vec_t, nk_dot_through_f32_init_haswell_, nk_load_b256_haswell_,
+                        nk_partial_load_b8x32_serial_, nk_load_b256_haswell_, nk_partial_load_b8x32_serial_,
+                        nk_dot_e5m2x32_update_haswell_, nk_dot_through_f32_finalize_haswell_, nk_store_b128_haswell_,
+                        nk_partial_store_b32x4_haswell_,
+                        /*depth_simd_dimensions=*/32, /*dimensions_per_value=*/1)
 /* E2M3 GEMM: integer LUT path, depth_simd_dimensions=32 (32 e2m3s = 32 bytes = AVX2 register width) */
 nk_define_cross_pack_size_(dots, e2m3, haswell, e2m3, e2m3, /*norm_value_type=*/f32, /*depth_simd_dimensions=*/32,

package/include/numkong/dots/sapphireamx.h CHANGED Viewed

@@ -73,7 +73,7 @@
 #if NK_TARGET_SAPPHIREAMX
 #include "numkong/cast/icelake.h" // For FP8 ↔ BF16 conversions
-#include "numkong/dots/serial.h"  // For nk_dots_reduce_sumsq_bf16_
+#include "numkong/dots/serial.h"  // `nk_dots_reduce_sumsq_bf16_`
 #if defined(__cplusplus)
 extern "C" {

package/include/numkong/dots/serial.h CHANGED Viewed

@@ -522,7 +522,7 @@ NK_INTERNAL nk_i32_t nk_dots_reduce_sum_i4_(nk_i4x2_t const *data, nk_size_t cou
                                 load_a_vec_fn, partial_load_a_vec_fn, load_b_vec_fn, partial_load_b_vec_fn,            \
                                 inner_product_fn, reduce_accumulators_fn, store_fn, partial_store_fn,                  \
                                 depth_simd_dimensions, dimensions_per_value)                                           \
-    NK_PUBLIC void nk_##api_name##_packed_##input_type_name##_##isa_suffix##_aligned_(                                 \
+    NK_INTERNAL void nk_##api_name##_packed_##input_type_name##_##isa_suffix##_aligned_(                               \
         nk_##input_value_type##_t const *a_matrix, void const *b_packed_buffer, nk_##result_value_type##_t *c_matrix,  \
         nk_size_t row_count, nk_size_t column_count, nk_size_t depth, nk_size_t a_stride_in_bytes,                     \
         nk_size_t c_stride_in_bytes) {                                                                                 \
@@ -698,7 +698,7 @@ NK_INTERNAL nk_i32_t nk_dots_reduce_sum_i4_(nk_i4x2_t const *data, nk_size_t cou
             }                                                                                                          \
         }                                                                                                              \
     }                                                                                                                  \
-    NK_PUBLIC void nk_##api_name##_packed_##input_type_name##_##isa_suffix##_1x8_aligned_(                             \
+    NK_INTERNAL void nk_##api_name##_packed_##input_type_name##_##isa_suffix##_1x8_aligned_(                           \
         nk_##input_value_type##_t const *a_matrix, void const *b_packed_buffer, nk_##result_value_type##_t *c_matrix,  \
         nk_size_t row_count, nk_size_t column_count, nk_size_t depth, nk_size_t a_stride_in_bytes,                     \
         nk_size_t c_stride_in_bytes) {                                                                                 \
@@ -1090,7 +1090,7 @@ NK_INTERNAL nk_i32_t nk_dots_reduce_sum_i4_(nk_i4x2_t const *data, nk_size_t cou
     norm_value_type, vec_type, state_type, result_vec_type, init_accumulator_fn, load_a_vec_fn, partial_load_a_vec_fn, \
     load_b_vec_fn, partial_load_b_vec_fn, inner_product_fn, compensated_finalize_fn, store_fn, partial_store_fn,       \
     load_sum_fn, partial_load_sum_fn, compute_a_sum_fn, depth_simd_dimensions, dimensions_per_value)                   \
-    NK_PUBLIC void nk_##api_name##_packed_##input_type_name##_##isa_suffix##_aligned_(                                 \
+    NK_INTERNAL void nk_##api_name##_packed_##input_type_name##_##isa_suffix##_aligned_(                               \
         nk_##input_value_type##_t const *a_matrix, void const *b_packed_buffer, nk_##result_value_type##_t *c_matrix,  \
         nk_size_t row_count, nk_size_t column_count, nk_size_t depth, nk_size_t a_stride_in_bytes,                     \
         nk_size_t c_stride_in_bytes) {                                                                                 \
@@ -1200,7 +1200,7 @@ NK_INTERNAL nk_i32_t nk_dots_reduce_sum_i4_(nk_i4x2_t const *data, nk_size_t cou
             }                                                                                                          \
         }                                                                                                              \
     }                                                                                                                  \
-    NK_PUBLIC void nk_##api_name##_packed_##input_type_name##_##isa_suffix##_1x8_aligned_(                             \
+    NK_INTERNAL void nk_##api_name##_packed_##input_type_name##_##isa_suffix##_1x8_aligned_(                           \
         nk_##input_value_type##_t const *a_matrix, void const *b_packed_buffer, nk_##result_value_type##_t *c_matrix,  \
         nk_size_t row_count, nk_size_t column_count, nk_size_t depth, nk_size_t a_stride_in_bytes,                     \
         nk_size_t c_stride_in_bytes) {                                                                                 \
@@ -2431,13 +2431,25 @@ NK_INTERNAL nk_i32_t nk_dots_reduce_sum_i4_(nk_i4x2_t const *data, nk_size_t cou
         }                                                                                                              \
     }
-/* Optimize serial GEMM instantiations for size rather than speed.
- * These fallback kernels are only used when no SIMD backend is available, so aggressive inlining/unrolling from -O3
- * wastes ~1.3 MB of binary space with negligible performance benefit on the serial path. Sadly, a scoped application
- * of `__attribute__((optimize("Os"))` isn't supported on Clang, so this flag only applies to GCC builds.
- */
+/*  Keep the serial instantiations below actually scalar, regardless of build type.
+ *  Without this, -O3 + LTO can vectorize or clone the serial kernels under AVX-512
+ *  callers in dispatch_*.c, which wastes ~1 MB of binary and — more importantly —
+ *  breaks the nk_*_serial-as-scalar-oracle contract that tests and the numerical-
+ *  stability docs in this header rely on. */
+#if defined(__clang__)
+#pragma clang attribute push(__attribute__((noinline)), apply_to = function)
+#elif defined(__GNUC__)
+#pragma GCC push_options
+#pragma GCC optimize("no-tree-vectorize", "no-tree-slp-vectorize", "no-ipa-cp-clone", "no-inline")
+#endif
+/*  Size bias for release. Gated on NDEBUG so Debug builds keep -O0 for stepping. */
 #if defined(NDEBUG)
-#if defined(__GNUC__) && !defined(__clang__)
+#if defined(_MSC_VER)
+#pragma optimize("s", on)
+#elif defined(__clang__)
+#pragma clang attribute push(__attribute__((minsize)), apply_to = function)
+#elif defined(__GNUC__)
 #pragma GCC push_options
 #pragma GCC optimize("Os")
 #endif
@@ -2677,11 +2689,21 @@ nk_define_cross_packed_(dots, u1, serial, u1x8, u1x8, u32, nk_b128_vec_t, nk_dot
                         /*depth_simd_dimensions=*/128, /*dimensions_per_value=*/8)
 #if defined(NDEBUG)
-#if defined(__GNUC__) && !defined(__clang__)
+#if defined(_MSC_VER)
+#pragma optimize("", on)
+#elif defined(__clang__)
+#pragma clang attribute pop
+#elif defined(__GNUC__)
 #pragma GCC pop_options
 #endif
 #endif
+#if defined(__clang__)
+#pragma clang attribute pop
+#elif defined(__GNUC__)
+#pragma GCC pop_options
+#endif
 /*  BF16 compact: truncate F32 → BF16 in-place.
  *  Reads F32 matrix with c_stride_in_bytes, writes BF16 tightly packed (stride_in_bytes = column_count × sizeof(bf16)).
  */

package/include/numkong/dots/skylake.h CHANGED Viewed

@@ -114,45 +114,50 @@ nk_define_cross_packed_(dots, f16, skylake, f16, f32, f32, nk_b512_vec_t, nk_dot
                         nk_dot_through_f32_finalize_skylake_, nk_store_b128_haswell_, nk_partial_store_b32x4_skylake_,
                         /*depth_simd_dimensions=*/16, /*dimensions_per_value=*/1)
-/* E4M3 GEMM: depth_simd_dimensions=16 (16 e4m3s = 16 bytes = quarter cache line), F32 accumulator */
-nk_define_cross_pack_size_(dots, e4m3, skylake, e4m3, f32, /*norm_value_type=*/f32, /*depth_simd_dimensions=*/16,
+/* E4M3 GEMM: F16-pack with asymmetric A/B representations at compute time. Pack converts
+ * E4M3 → F16 once (~10 ops/16 elements, 2 bytes/elt stored). A-stream uses the Giesen E4M3→F32
+ * cast (identical cost to F32-pack path). B-loader widens F16 → F32 inline (1 vcvtph2ps per 16
+ * lanes). Update takes both as F32 → plain fmadd. Saves 2 bytes/elt vs F32-pack; inner loop
+ * adds one cvtph2ps per B-read. Symmetric uses E4M3→F32 for both sides (no pack involved). */
+nk_define_cross_pack_size_(dots, e4m3, skylake, e4m3, f16, /*norm_value_type=*/f32, /*depth_simd_dimensions=*/16,
                            /*dimensions_per_value=*/1)
-nk_define_cross_pack_(dots, e4m3, skylake, e4m3, f32, nk_b512_vec_t, nk_load_e4m3x16_to_f32x16_skylake_,
-                      nk_partial_load_e4m3x16_to_f32x16_skylake_, nk_store_b512_skylake_,
-                      nk_partial_store_b32x16_skylake_, /*simd_width=*/16, /*norm_value_type=*/f32,
-                      nk_dots_reduce_sumsq_e4m3_, /*depth_simd_dimensions=*/16, /*dimensions_per_value=*/1)
+nk_define_cross_pack_(dots, e4m3, skylake, e4m3, f16, nk_b256_vec_t, nk_load_e4m3x16_to_f16x16_skylake_,
+                      nk_partial_load_e4m3x16_to_f16x16_skylake_, nk_store_b256_haswell_,
+                      nk_partial_store_b16x16_serial_,
+                      /*simd_width=*/16, /*norm_value_type=*/f32, nk_dots_reduce_sumsq_e4m3_,
+                      /*depth_simd_dimensions=*/16, /*dimensions_per_value=*/1)
 nk_define_cross_symmetric_(dots, e4m3, skylake, e4m3, f32, nk_b512_vec_t, nk_dot_through_f32_state_skylake_t_,
                            nk_b128_vec_t, nk_dot_through_f32_init_skylake_, nk_load_e4m3x16_to_f32x16_skylake_,
                            nk_partial_load_e4m3x16_to_f32x16_skylake_, nk_dot_through_f32_update_skylake_,
                            nk_dot_through_f32_finalize_skylake_, nk_store_b128_haswell_,
                            nk_partial_store_b32x4_skylake_,
                            /*depth_simd_dimensions=*/16, /*dimensions_per_value=*/1)
-nk_define_cross_packed_(dots, e4m3, skylake, e4m3, f32, f32, nk_b512_vec_t, nk_dot_through_f32_state_skylake_t_,
+nk_define_cross_packed_(dots, e4m3, skylake, e4m3, f16, f32, nk_b512_vec_t, nk_dot_through_f32_state_skylake_t_,
                         nk_b128_vec_t, nk_dot_through_f32_init_skylake_, nk_load_e4m3x16_to_f32x16_skylake_,
-                        nk_partial_load_e4m3x16_to_f32x16_skylake_, nk_load_b512_skylake_,
-                        nk_partial_load_b32x16_skylake_, nk_dot_through_f32_update_skylake_,
+                        nk_partial_load_e4m3x16_to_f32x16_skylake_, nk_load_f16x16_to_f32x16_skylake_,
+                        nk_partial_load_f16x16_to_f32x16_skylake_, nk_dot_through_f32_update_skylake_,
                         nk_dot_through_f32_finalize_skylake_, nk_store_b128_haswell_, nk_partial_store_b32x4_skylake_,
                         /*depth_simd_dimensions=*/16, /*dimensions_per_value=*/1)
-/* E5M2 GEMM: depth_simd_dimensions=16 (16 e5m2s = 16 bytes = quarter cache line), F32 accumulator */
-nk_define_cross_pack_size_(dots, e5m2, skylake, e5m2, f32, /*norm_value_type=*/f32, /*depth_simd_dimensions=*/16,
+/* E5M2 GEMM: depth_simd_dimensions=64 (byte-level batch; widen inside the update helper) */
+nk_define_cross_pack_size_(dots, e5m2, skylake, e5m2, f32, /*norm_value_type=*/f32, /*depth_simd_dimensions=*/64,
                            /*dimensions_per_value=*/1)
-nk_define_cross_pack_(dots, e5m2, skylake, e5m2, f32, nk_b512_vec_t, nk_load_e5m2x16_to_f32x16_skylake_,
-                      nk_partial_load_e5m2x16_to_f32x16_skylake_, nk_store_b512_skylake_,
-                      nk_partial_store_b32x16_skylake_, /*simd_width=*/16, /*norm_value_type=*/f32,
-                      nk_dots_reduce_sumsq_e5m2_, /*depth_simd_dimensions=*/16, /*dimensions_per_value=*/1)
+nk_define_cross_pack_(dots, e5m2, skylake, e5m2, f32, nk_b512_vec_t, nk_load_b512_skylake_,
+                      nk_partial_load_b8x64_skylake_, nk_store_b512_skylake_, nk_partial_store_b8x64_skylake_,
+                      /*simd_width=*/64, /*norm_value_type=*/f32, nk_dots_reduce_sumsq_e5m2_,
+                      /*depth_simd_dimensions=*/64, /*dimensions_per_value=*/1)
 nk_define_cross_symmetric_(dots, e5m2, skylake, e5m2, f32, nk_b512_vec_t, nk_dot_through_f32_state_skylake_t_,
-                           nk_b128_vec_t, nk_dot_through_f32_init_skylake_, nk_load_e5m2x16_to_f32x16_skylake_,
-                           nk_partial_load_e5m2x16_to_f32x16_skylake_, nk_dot_through_f32_update_skylake_,
+                           nk_b128_vec_t, nk_dot_through_f32_init_skylake_, nk_load_b512_skylake_,
+                           nk_partial_load_b8x64_skylake_, nk_dot_e5m2x64_update_skylake_,
                            nk_dot_through_f32_finalize_skylake_, nk_store_b128_haswell_,
                            nk_partial_store_b32x4_skylake_,
-                           /*depth_simd_dimensions=*/16, /*dimensions_per_value=*/1)
+                           /*depth_simd_dimensions=*/64, /*dimensions_per_value=*/1)
 nk_define_cross_packed_(dots, e5m2, skylake, e5m2, f32, f32, nk_b512_vec_t, nk_dot_through_f32_state_skylake_t_,
-                        nk_b128_vec_t, nk_dot_through_f32_init_skylake_, nk_load_e5m2x16_to_f32x16_skylake_,
-                        nk_partial_load_e5m2x16_to_f32x16_skylake_, nk_load_b512_skylake_,
-                        nk_partial_load_b32x16_skylake_, nk_dot_through_f32_update_skylake_,
-                        nk_dot_through_f32_finalize_skylake_, nk_store_b128_haswell_, nk_partial_store_b32x4_skylake_,
-                        /*depth_simd_dimensions=*/16, /*dimensions_per_value=*/1)
+                        nk_b128_vec_t, nk_dot_through_f32_init_skylake_, nk_load_b512_skylake_,
+                        nk_partial_load_b8x64_skylake_, nk_load_b512_skylake_, nk_partial_load_b8x64_skylake_,
+                        nk_dot_e5m2x64_update_skylake_, nk_dot_through_f32_finalize_skylake_, nk_store_b128_haswell_,
+                        nk_partial_store_b32x4_skylake_,
+                        /*depth_simd_dimensions=*/64, /*dimensions_per_value=*/1)
 /* E2M3 GEMM: integer LUT path, depth_simd_dimensions=64 (64 e2m3s = 64 bytes = AVX-512 register width) */
 nk_define_cross_pack_size_(dots, e2m3, skylake, e2m3, e2m3, /*norm_value_type=*/f32, /*depth_simd_dimensions=*/64,