numkong 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +495 -0
- package/binding.gyp +540 -0
- package/c/dispatch.h +512 -0
- package/c/dispatch_bf16.c +389 -0
- package/c/dispatch_bf16c.c +52 -0
- package/c/dispatch_e2m3.c +263 -0
- package/c/dispatch_e3m2.c +243 -0
- package/c/dispatch_e4m3.c +276 -0
- package/c/dispatch_e5m2.c +272 -0
- package/c/dispatch_f16.c +376 -0
- package/c/dispatch_f16c.c +58 -0
- package/c/dispatch_f32.c +378 -0
- package/c/dispatch_f32c.c +99 -0
- package/c/dispatch_f64.c +296 -0
- package/c/dispatch_f64c.c +98 -0
- package/c/dispatch_i16.c +96 -0
- package/c/dispatch_i32.c +89 -0
- package/c/dispatch_i4.c +150 -0
- package/c/dispatch_i64.c +86 -0
- package/c/dispatch_i8.c +289 -0
- package/c/dispatch_other.c +330 -0
- package/c/dispatch_u1.c +148 -0
- package/c/dispatch_u16.c +124 -0
- package/c/dispatch_u32.c +118 -0
- package/c/dispatch_u4.c +150 -0
- package/c/dispatch_u64.c +102 -0
- package/c/dispatch_u8.c +303 -0
- package/c/numkong.c +950 -0
- package/include/README.md +573 -0
- package/include/module.modulemap +129 -0
- package/include/numkong/attention/sapphireamx.h +1361 -0
- package/include/numkong/attention/sme.h +2066 -0
- package/include/numkong/attention.h +49 -0
- package/include/numkong/capabilities.h +748 -0
- package/include/numkong/cast/README.md +262 -0
- package/include/numkong/cast/haswell.h +975 -0
- package/include/numkong/cast/icelake.h +470 -0
- package/include/numkong/cast/neon.h +1192 -0
- package/include/numkong/cast/rvv.h +1021 -0
- package/include/numkong/cast/sapphire.h +262 -0
- package/include/numkong/cast/serial.h +2262 -0
- package/include/numkong/cast/skylake.h +856 -0
- package/include/numkong/cast/v128relaxed.h +180 -0
- package/include/numkong/cast.h +230 -0
- package/include/numkong/curved/README.md +223 -0
- package/include/numkong/curved/genoa.h +182 -0
- package/include/numkong/curved/haswell.h +276 -0
- package/include/numkong/curved/neon.h +205 -0
- package/include/numkong/curved/neonbfdot.h +212 -0
- package/include/numkong/curved/neonhalf.h +212 -0
- package/include/numkong/curved/rvv.h +305 -0
- package/include/numkong/curved/serial.h +207 -0
- package/include/numkong/curved/skylake.h +457 -0
- package/include/numkong/curved/smef64.h +506 -0
- package/include/numkong/curved.h +517 -0
- package/include/numkong/curved.hpp +144 -0
- package/include/numkong/dot/README.md +425 -0
- package/include/numkong/dot/alder.h +563 -0
- package/include/numkong/dot/genoa.h +315 -0
- package/include/numkong/dot/haswell.h +1688 -0
- package/include/numkong/dot/icelake.h +883 -0
- package/include/numkong/dot/neon.h +818 -0
- package/include/numkong/dot/neonbfdot.h +244 -0
- package/include/numkong/dot/neonfhm.h +360 -0
- package/include/numkong/dot/neonhalf.h +198 -0
- package/include/numkong/dot/neonsdot.h +508 -0
- package/include/numkong/dot/rvv.h +714 -0
- package/include/numkong/dot/rvvbb.h +72 -0
- package/include/numkong/dot/rvvbf16.h +123 -0
- package/include/numkong/dot/rvvhalf.h +129 -0
- package/include/numkong/dot/sapphire.h +141 -0
- package/include/numkong/dot/serial.h +838 -0
- package/include/numkong/dot/sierra.h +405 -0
- package/include/numkong/dot/skylake.h +1084 -0
- package/include/numkong/dot/sve.h +379 -0
- package/include/numkong/dot/svebfdot.h +74 -0
- package/include/numkong/dot/svehalf.h +123 -0
- package/include/numkong/dot/v128relaxed.h +1258 -0
- package/include/numkong/dot.h +1070 -0
- package/include/numkong/dot.hpp +94 -0
- package/include/numkong/dots/README.md +496 -0
- package/include/numkong/dots/alder.h +114 -0
- package/include/numkong/dots/genoa.h +94 -0
- package/include/numkong/dots/haswell.h +295 -0
- package/include/numkong/dots/icelake.h +171 -0
- package/include/numkong/dots/neon.h +120 -0
- package/include/numkong/dots/neonbfdot.h +58 -0
- package/include/numkong/dots/neonfhm.h +94 -0
- package/include/numkong/dots/neonhalf.h +57 -0
- package/include/numkong/dots/neonsdot.h +108 -0
- package/include/numkong/dots/rvv.h +2486 -0
- package/include/numkong/dots/sapphireamx.h +3973 -0
- package/include/numkong/dots/serial.h +2844 -0
- package/include/numkong/dots/sierra.h +97 -0
- package/include/numkong/dots/skylake.h +196 -0
- package/include/numkong/dots/sme.h +5372 -0
- package/include/numkong/dots/smebi32.h +461 -0
- package/include/numkong/dots/smef64.h +1318 -0
- package/include/numkong/dots/smehalf.h +47 -0
- package/include/numkong/dots/v128relaxed.h +294 -0
- package/include/numkong/dots.h +2804 -0
- package/include/numkong/dots.hpp +639 -0
- package/include/numkong/each/README.md +469 -0
- package/include/numkong/each/haswell.h +1658 -0
- package/include/numkong/each/icelake.h +272 -0
- package/include/numkong/each/neon.h +1104 -0
- package/include/numkong/each/neonbfdot.h +212 -0
- package/include/numkong/each/neonhalf.h +410 -0
- package/include/numkong/each/rvv.h +1121 -0
- package/include/numkong/each/sapphire.h +477 -0
- package/include/numkong/each/serial.h +260 -0
- package/include/numkong/each/skylake.h +1562 -0
- package/include/numkong/each.h +2146 -0
- package/include/numkong/each.hpp +434 -0
- package/include/numkong/geospatial/README.md +147 -0
- package/include/numkong/geospatial/haswell.h +593 -0
- package/include/numkong/geospatial/neon.h +571 -0
- package/include/numkong/geospatial/rvv.h +701 -0
- package/include/numkong/geospatial/serial.h +309 -0
- package/include/numkong/geospatial/skylake.h +577 -0
- package/include/numkong/geospatial/v128relaxed.h +613 -0
- package/include/numkong/geospatial.h +453 -0
- package/include/numkong/geospatial.hpp +235 -0
- package/include/numkong/matrix.hpp +336 -0
- package/include/numkong/maxsim/README.md +187 -0
- package/include/numkong/maxsim/alder.h +511 -0
- package/include/numkong/maxsim/genoa.h +115 -0
- package/include/numkong/maxsim/haswell.h +553 -0
- package/include/numkong/maxsim/icelake.h +480 -0
- package/include/numkong/maxsim/neonsdot.h +394 -0
- package/include/numkong/maxsim/sapphireamx.h +877 -0
- package/include/numkong/maxsim/serial.h +490 -0
- package/include/numkong/maxsim/sme.h +929 -0
- package/include/numkong/maxsim/v128relaxed.h +280 -0
- package/include/numkong/maxsim.h +571 -0
- package/include/numkong/maxsim.hpp +133 -0
- package/include/numkong/mesh/README.md +227 -0
- package/include/numkong/mesh/haswell.h +2235 -0
- package/include/numkong/mesh/neon.h +1329 -0
- package/include/numkong/mesh/neonbfdot.h +842 -0
- package/include/numkong/mesh/neonhalf.h +616 -0
- package/include/numkong/mesh/rvv.h +916 -0
- package/include/numkong/mesh/serial.h +742 -0
- package/include/numkong/mesh/skylake.h +1135 -0
- package/include/numkong/mesh/v128relaxed.h +1052 -0
- package/include/numkong/mesh.h +652 -0
- package/include/numkong/mesh.hpp +762 -0
- package/include/numkong/numkong.h +78 -0
- package/include/numkong/numkong.hpp +57 -0
- package/include/numkong/probability/README.md +173 -0
- package/include/numkong/probability/haswell.h +267 -0
- package/include/numkong/probability/neon.h +225 -0
- package/include/numkong/probability/rvv.h +409 -0
- package/include/numkong/probability/serial.h +169 -0
- package/include/numkong/probability/skylake.h +324 -0
- package/include/numkong/probability.h +383 -0
- package/include/numkong/probability.hpp +120 -0
- package/include/numkong/random.h +50 -0
- package/include/numkong/random.hpp +285 -0
- package/include/numkong/reduce/README.md +547 -0
- package/include/numkong/reduce/alder.h +632 -0
- package/include/numkong/reduce/genoa.h +201 -0
- package/include/numkong/reduce/haswell.h +3783 -0
- package/include/numkong/reduce/icelake.h +549 -0
- package/include/numkong/reduce/neon.h +3841 -0
- package/include/numkong/reduce/neonbfdot.h +353 -0
- package/include/numkong/reduce/neonfhm.h +665 -0
- package/include/numkong/reduce/neonhalf.h +157 -0
- package/include/numkong/reduce/neonsdot.h +357 -0
- package/include/numkong/reduce/rvv.h +3407 -0
- package/include/numkong/reduce/serial.h +757 -0
- package/include/numkong/reduce/sierra.h +338 -0
- package/include/numkong/reduce/skylake.h +3792 -0
- package/include/numkong/reduce/v128relaxed.h +2302 -0
- package/include/numkong/reduce.h +1597 -0
- package/include/numkong/reduce.hpp +633 -0
- package/include/numkong/scalar/README.md +89 -0
- package/include/numkong/scalar/haswell.h +113 -0
- package/include/numkong/scalar/neon.h +122 -0
- package/include/numkong/scalar/neonhalf.h +70 -0
- package/include/numkong/scalar/rvv.h +211 -0
- package/include/numkong/scalar/sapphire.h +63 -0
- package/include/numkong/scalar/serial.h +332 -0
- package/include/numkong/scalar/v128relaxed.h +56 -0
- package/include/numkong/scalar.h +683 -0
- package/include/numkong/set/README.md +179 -0
- package/include/numkong/set/haswell.h +334 -0
- package/include/numkong/set/icelake.h +485 -0
- package/include/numkong/set/neon.h +364 -0
- package/include/numkong/set/rvv.h +226 -0
- package/include/numkong/set/rvvbb.h +117 -0
- package/include/numkong/set/serial.h +174 -0
- package/include/numkong/set/sve.h +185 -0
- package/include/numkong/set/v128relaxed.h +240 -0
- package/include/numkong/set.h +457 -0
- package/include/numkong/set.hpp +114 -0
- package/include/numkong/sets/README.md +149 -0
- package/include/numkong/sets/haswell.h +63 -0
- package/include/numkong/sets/icelake.h +66 -0
- package/include/numkong/sets/neon.h +61 -0
- package/include/numkong/sets/serial.h +43 -0
- package/include/numkong/sets/smebi32.h +1099 -0
- package/include/numkong/sets/v128relaxed.h +58 -0
- package/include/numkong/sets.h +339 -0
- package/include/numkong/sparse/README.md +156 -0
- package/include/numkong/sparse/icelake.h +463 -0
- package/include/numkong/sparse/neon.h +288 -0
- package/include/numkong/sparse/serial.h +117 -0
- package/include/numkong/sparse/sve2.h +507 -0
- package/include/numkong/sparse/turin.h +322 -0
- package/include/numkong/sparse.h +363 -0
- package/include/numkong/sparse.hpp +113 -0
- package/include/numkong/spatial/README.md +435 -0
- package/include/numkong/spatial/alder.h +607 -0
- package/include/numkong/spatial/genoa.h +290 -0
- package/include/numkong/spatial/haswell.h +960 -0
- package/include/numkong/spatial/icelake.h +586 -0
- package/include/numkong/spatial/neon.h +773 -0
- package/include/numkong/spatial/neonbfdot.h +165 -0
- package/include/numkong/spatial/neonhalf.h +118 -0
- package/include/numkong/spatial/neonsdot.h +261 -0
- package/include/numkong/spatial/rvv.h +984 -0
- package/include/numkong/spatial/rvvbf16.h +123 -0
- package/include/numkong/spatial/rvvhalf.h +117 -0
- package/include/numkong/spatial/sapphire.h +343 -0
- package/include/numkong/spatial/serial.h +346 -0
- package/include/numkong/spatial/sierra.h +323 -0
- package/include/numkong/spatial/skylake.h +606 -0
- package/include/numkong/spatial/sve.h +224 -0
- package/include/numkong/spatial/svebfdot.h +122 -0
- package/include/numkong/spatial/svehalf.h +109 -0
- package/include/numkong/spatial/v128relaxed.h +717 -0
- package/include/numkong/spatial.h +1425 -0
- package/include/numkong/spatial.hpp +183 -0
- package/include/numkong/spatials/README.md +580 -0
- package/include/numkong/spatials/alder.h +94 -0
- package/include/numkong/spatials/genoa.h +94 -0
- package/include/numkong/spatials/haswell.h +219 -0
- package/include/numkong/spatials/icelake.h +113 -0
- package/include/numkong/spatials/neon.h +109 -0
- package/include/numkong/spatials/neonbfdot.h +60 -0
- package/include/numkong/spatials/neonfhm.h +92 -0
- package/include/numkong/spatials/neonhalf.h +58 -0
- package/include/numkong/spatials/neonsdot.h +109 -0
- package/include/numkong/spatials/rvv.h +1960 -0
- package/include/numkong/spatials/sapphireamx.h +1149 -0
- package/include/numkong/spatials/serial.h +226 -0
- package/include/numkong/spatials/sierra.h +96 -0
- package/include/numkong/spatials/skylake.h +184 -0
- package/include/numkong/spatials/sme.h +1901 -0
- package/include/numkong/spatials/smef64.h +465 -0
- package/include/numkong/spatials/v128relaxed.h +240 -0
- package/include/numkong/spatials.h +3021 -0
- package/include/numkong/spatials.hpp +508 -0
- package/include/numkong/tensor.hpp +1592 -0
- package/include/numkong/trigonometry/README.md +184 -0
- package/include/numkong/trigonometry/haswell.h +652 -0
- package/include/numkong/trigonometry/neon.h +639 -0
- package/include/numkong/trigonometry/rvv.h +699 -0
- package/include/numkong/trigonometry/serial.h +703 -0
- package/include/numkong/trigonometry/skylake.h +721 -0
- package/include/numkong/trigonometry/v128relaxed.h +666 -0
- package/include/numkong/trigonometry.h +467 -0
- package/include/numkong/trigonometry.hpp +166 -0
- package/include/numkong/types.h +1384 -0
- package/include/numkong/types.hpp +5603 -0
- package/include/numkong/vector.hpp +698 -0
- package/javascript/README.md +246 -0
- package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
- package/javascript/dist/cjs/numkong-wasm.js +617 -0
- package/javascript/dist/cjs/numkong.d.ts +343 -0
- package/javascript/dist/cjs/numkong.js +523 -0
- package/javascript/dist/cjs/package.json +3 -0
- package/javascript/dist/cjs/types.d.ts +284 -0
- package/javascript/dist/cjs/types.js +653 -0
- package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
- package/javascript/dist/esm/numkong-wasm.js +595 -0
- package/javascript/dist/esm/numkong.d.ts +343 -0
- package/javascript/dist/esm/numkong.js +452 -0
- package/javascript/dist/esm/package.json +3 -0
- package/javascript/dist/esm/types.d.ts +284 -0
- package/javascript/dist/esm/types.js +630 -0
- package/javascript/dist-package-cjs.json +3 -0
- package/javascript/dist-package-esm.json +3 -0
- package/javascript/node-gyp-build.d.ts +1 -0
- package/javascript/numkong-wasm.ts +756 -0
- package/javascript/numkong.c +689 -0
- package/javascript/numkong.ts +575 -0
- package/javascript/tsconfig-base.json +39 -0
- package/javascript/tsconfig-cjs.json +8 -0
- package/javascript/tsconfig-esm.json +8 -0
- package/javascript/types.ts +674 -0
- package/package.json +87 -0
|
@@ -0,0 +1,336 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief NumKong packed_matrix type for efficient GEMM.
|
|
3
|
+
* @file include/numkong/matrix.hpp
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date March 2026
|
|
6
|
+
*
|
|
7
|
+
* Provides a pre-packed matrix type that wraps `dots_pack` / `dots_packed` for
|
|
8
|
+
* cache-efficient matrix multiplication.
|
|
9
|
+
*
|
|
10
|
+
* @code
|
|
11
|
+
* auto b = nk::tensor<nk::f32_t>::try_zeros({256, 512});
|
|
12
|
+
* auto packed = nk::packed_matrix<nk::f32_t>::try_pack(b.view());
|
|
13
|
+
* // multiply many times with different A matrices
|
|
14
|
+
* @endcode
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
#ifndef NK_MATRIX_HPP
|
|
18
|
+
#define NK_MATRIX_HPP
|
|
19
|
+
|
|
20
|
+
#include <cstddef>     // std::size_t
#include <cstring>     // std::memcpy
#include <memory>      // std::allocator_traits
#include <type_traits> // std::is_same_v
#include <utility>     // std::exchange

#include "numkong/dots.h"
#include "numkong/maxsim.h"
#include "numkong/tensor.hpp"
|
|
26
|
+
|
|
27
|
+
namespace ashvardanian::numkong {
|
|
28
|
+
|
|
29
|
+
#pragma region - Packing Utilities
|
|
30
|
+
|
|
31
|
+
/**
|
|
32
|
+
* @brief Estimates the memory requirements for packed B matrix.
|
|
33
|
+
* @param[in] row_count Number of rows in B (n)
|
|
34
|
+
* @param[in] depth Number of dimensions per row (k)
|
|
35
|
+
* @return Size in bytes for row-major B data plus stride metadata
|
|
36
|
+
*
|
|
37
|
+
* @tparam in_type_ Input element type
|
|
38
|
+
* @tparam allow_simd_ Enable SIMD kernel dispatch when `prefer_simd_k`
|
|
39
|
+
*/
|
|
40
|
+
template <numeric_dtype in_type_, allow_simd_t allow_simd_ = prefer_simd_k>
|
|
41
|
+
NK_PUBLIC size_t dots_packed_size(size_t row_count, size_t depth) {
|
|
42
|
+
constexpr bool simd = allow_simd_ == prefer_simd_k;
|
|
43
|
+
|
|
44
|
+
if constexpr (std::is_same_v<in_type_, f64_t> && simd) return nk_dots_packed_size_f64(row_count, depth);
|
|
45
|
+
else if constexpr (std::is_same_v<in_type_, f32_t> && simd) return nk_dots_packed_size_f32(row_count, depth);
|
|
46
|
+
else if constexpr (std::is_same_v<in_type_, f16_t> && simd) return nk_dots_packed_size_f16(row_count, depth);
|
|
47
|
+
else if constexpr (std::is_same_v<in_type_, bf16_t> && simd) return nk_dots_packed_size_bf16(row_count, depth);
|
|
48
|
+
else if constexpr (std::is_same_v<in_type_, i8_t> && simd) return nk_dots_packed_size_i8(row_count, depth);
|
|
49
|
+
else if constexpr (std::is_same_v<in_type_, u8_t> && simd) return nk_dots_packed_size_u8(row_count, depth);
|
|
50
|
+
else if constexpr (std::is_same_v<in_type_, e4m3_t> && simd) return nk_dots_packed_size_e4m3(row_count, depth);
|
|
51
|
+
else if constexpr (std::is_same_v<in_type_, e5m2_t> && simd) return nk_dots_packed_size_e5m2(row_count, depth);
|
|
52
|
+
else if constexpr (std::is_same_v<in_type_, e2m3_t> && simd) return nk_dots_packed_size_e2m3(row_count, depth);
|
|
53
|
+
else if constexpr (std::is_same_v<in_type_, e3m2_t> && simd) return nk_dots_packed_size_e3m2(row_count, depth);
|
|
54
|
+
else if constexpr (std::is_same_v<in_type_, u4x2_t> && simd) return nk_dots_packed_size_u4(row_count, depth);
|
|
55
|
+
else if constexpr (std::is_same_v<in_type_, i4x2_t> && simd) return nk_dots_packed_size_i4(row_count, depth);
|
|
56
|
+
else {
|
|
57
|
+
// We need enough space for the pointer to the original B matrix and its stride
|
|
58
|
+
return sizeof(void *) + sizeof(size_t);
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
/**
|
|
63
|
+
* @brief Packs matrix B into row-major form for efficient dots_packed access.
|
|
64
|
+
* @param[in] b Input matrix B in row-major form [row_count x depth]
|
|
65
|
+
* @param[in] row_count Number of rows in B (n)
|
|
66
|
+
* @param[in] depth Number of dimensions per row (k)
|
|
67
|
+
* @param[in] b_stride_in_bytes Stride between rows of B in bytes
|
|
68
|
+
* @param[out] b_packed Output buffer for packed row-major B with metadata
|
|
69
|
+
*
|
|
70
|
+
* @tparam in_type_ Input element type
|
|
71
|
+
* @tparam allow_simd_ Enable SIMD kernel dispatch when `prefer_simd_k`
|
|
72
|
+
*/
|
|
73
|
+
template <numeric_dtype in_type_, allow_simd_t allow_simd_ = prefer_simd_k>
|
|
74
|
+
NK_PUBLIC void dots_pack(in_type_ const *b, size_t row_count, size_t depth, size_t b_stride_in_bytes, void *b_packed) {
|
|
75
|
+
using raw_t = typename in_type_::raw_t;
|
|
76
|
+
constexpr bool simd = allow_simd_ == prefer_simd_k;
|
|
77
|
+
|
|
78
|
+
if constexpr (std::is_same_v<in_type_, f64_t> && simd)
|
|
79
|
+
nk_dots_pack_f64(reinterpret_cast<raw_t const *>(b), row_count, depth, b_stride_in_bytes, b_packed);
|
|
80
|
+
else if constexpr (std::is_same_v<in_type_, f32_t> && simd)
|
|
81
|
+
nk_dots_pack_f32(reinterpret_cast<raw_t const *>(b), row_count, depth, b_stride_in_bytes, b_packed);
|
|
82
|
+
else if constexpr (std::is_same_v<in_type_, f16_t> && simd)
|
|
83
|
+
nk_dots_pack_f16(reinterpret_cast<raw_t const *>(b), row_count, depth, b_stride_in_bytes, b_packed);
|
|
84
|
+
else if constexpr (std::is_same_v<in_type_, bf16_t> && simd)
|
|
85
|
+
nk_dots_pack_bf16(reinterpret_cast<raw_t const *>(b), row_count, depth, b_stride_in_bytes, b_packed);
|
|
86
|
+
else if constexpr (std::is_same_v<in_type_, i8_t> && simd)
|
|
87
|
+
nk_dots_pack_i8(reinterpret_cast<raw_t const *>(b), row_count, depth, b_stride_in_bytes, b_packed);
|
|
88
|
+
else if constexpr (std::is_same_v<in_type_, u8_t> && simd)
|
|
89
|
+
nk_dots_pack_u8(reinterpret_cast<raw_t const *>(b), row_count, depth, b_stride_in_bytes, b_packed);
|
|
90
|
+
else if constexpr (std::is_same_v<in_type_, e4m3_t> && simd)
|
|
91
|
+
nk_dots_pack_e4m3(reinterpret_cast<raw_t const *>(b), row_count, depth, b_stride_in_bytes, b_packed);
|
|
92
|
+
else if constexpr (std::is_same_v<in_type_, e5m2_t> && simd)
|
|
93
|
+
nk_dots_pack_e5m2(reinterpret_cast<raw_t const *>(b), row_count, depth, b_stride_in_bytes, b_packed);
|
|
94
|
+
else if constexpr (std::is_same_v<in_type_, e2m3_t> && simd)
|
|
95
|
+
nk_dots_pack_e2m3(reinterpret_cast<raw_t const *>(b), row_count, depth, b_stride_in_bytes, b_packed);
|
|
96
|
+
else if constexpr (std::is_same_v<in_type_, e3m2_t> && simd)
|
|
97
|
+
nk_dots_pack_e3m2(reinterpret_cast<raw_t const *>(b), row_count, depth, b_stride_in_bytes, b_packed);
|
|
98
|
+
else if constexpr (std::is_same_v<in_type_, u4x2_t> && simd)
|
|
99
|
+
nk_dots_pack_u4(reinterpret_cast<raw_t const *>(b), row_count, depth, b_stride_in_bytes, b_packed);
|
|
100
|
+
else if constexpr (std::is_same_v<in_type_, i4x2_t> && simd)
|
|
101
|
+
nk_dots_pack_i4(reinterpret_cast<raw_t const *>(b), row_count, depth, b_stride_in_bytes, b_packed);
|
|
102
|
+
else {
|
|
103
|
+
// Persist the pointer to the original B matrix and its stride
|
|
104
|
+
char *b_packed_bytes = reinterpret_cast<char *>(b_packed);
|
|
105
|
+
std::memcpy(b_packed_bytes, &b, sizeof(void *));
|
|
106
|
+
std::memcpy(b_packed_bytes + sizeof(void *), &b_stride_in_bytes, sizeof(size_t));
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
/**
|
|
111
|
+
* @brief Estimates the memory requirements for a maxsim packed vector set.
|
|
112
|
+
* @param[in] vector_count Number of vectors to pack.
|
|
113
|
+
* @param[in] depth Number of dimensions per vector.
|
|
114
|
+
* @return Size in bytes for the packed buffer.
|
|
115
|
+
*
|
|
116
|
+
* @tparam in_type_ Input element type (bf16_t, f32_t, f16_t).
|
|
117
|
+
* @tparam allow_simd_ Enable SIMD kernel dispatch when `prefer_simd_k`.
|
|
118
|
+
*/
|
|
119
|
+
template <numeric_dtype in_type_, allow_simd_t allow_simd_ = prefer_simd_k>
|
|
120
|
+
NK_PUBLIC std::size_t maxsim_packed_size(std::size_t vector_count, std::size_t depth) {
|
|
121
|
+
constexpr bool simd = allow_simd_ == prefer_simd_k;
|
|
122
|
+
|
|
123
|
+
if constexpr (std::is_same_v<in_type_, bf16_t> && simd) return nk_maxsim_packed_size_bf16(vector_count, depth);
|
|
124
|
+
else if constexpr (std::is_same_v<in_type_, f32_t> && simd) return nk_maxsim_packed_size_f32(vector_count, depth);
|
|
125
|
+
else if constexpr (std::is_same_v<in_type_, f16_t> && simd) return nk_maxsim_packed_size_f16(vector_count, depth);
|
|
126
|
+
else return sizeof(void *) + sizeof(std::size_t);
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
/**
|
|
130
|
+
* @brief Packs vectors into a backend-specific layout for maxsim computation.
|
|
131
|
+
* @param[in] vectors Input vectors in row-major order.
|
|
132
|
+
* @param[in] vector_count Number of vectors.
|
|
133
|
+
* @param[in] depth Number of dimensions per vector.
|
|
134
|
+
* @param[in] stride Row stride in bytes for the input vectors.
|
|
135
|
+
* @param[out] packed Output packed buffer from maxsim_packed_size.
|
|
136
|
+
*
|
|
137
|
+
* @tparam in_type_ Input element type (bf16_t, f32_t, f16_t).
|
|
138
|
+
* @tparam allow_simd_ Enable SIMD kernel dispatch when `prefer_simd_k`.
|
|
139
|
+
*/
|
|
140
|
+
template <numeric_dtype in_type_, allow_simd_t allow_simd_ = prefer_simd_k>
|
|
141
|
+
NK_PUBLIC void maxsim_pack(typename in_type_::raw_t const *vectors, std::size_t vector_count, std::size_t depth,
|
|
142
|
+
std::size_t stride, void *packed) {
|
|
143
|
+
constexpr bool simd = allow_simd_ == prefer_simd_k;
|
|
144
|
+
|
|
145
|
+
if constexpr (std::is_same_v<in_type_, bf16_t> && simd)
|
|
146
|
+
nk_maxsim_pack_bf16(vectors, vector_count, depth, stride, packed);
|
|
147
|
+
else if constexpr (std::is_same_v<in_type_, f32_t> && simd)
|
|
148
|
+
nk_maxsim_pack_f32(vectors, vector_count, depth, stride, packed);
|
|
149
|
+
else if constexpr (std::is_same_v<in_type_, f16_t> && simd)
|
|
150
|
+
nk_maxsim_pack_f16(vectors, vector_count, depth, stride, packed);
|
|
151
|
+
else {
|
|
152
|
+
char *packed_bytes = reinterpret_cast<char *>(packed);
|
|
153
|
+
std::memcpy(packed_bytes, &vectors, sizeof(void *));
|
|
154
|
+
std::memcpy(packed_bytes + sizeof(void *), &stride, sizeof(std::size_t));
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
#pragma endregion - Packing Utilities
|
|
159
|
+
|
|
160
|
+
#pragma region - Packed Containers
|
|
161
|
+
|
|
162
|
+
/**
|
|
163
|
+
* @brief Owning, move-only, pre-packed matrix for efficient GEMM.
|
|
164
|
+
* @tparam value_type_ Element type (e.g., f32_t, bf16_t).
|
|
165
|
+
* @tparam allocator_type_ Allocator for the packed buffer (default: aligned_allocator<char>).
|
|
166
|
+
*
|
|
167
|
+
* Wraps `dots_pack` to pre-arrange a matrix B into a cache-friendly layout.
|
|
168
|
+
* Use `try_pack()` to create from a matrix_view, then pass to `dots_packed()` for computation.
|
|
169
|
+
*/
|
|
170
|
+
template <numeric_dtype value_type_, typename allocator_type_ = aligned_allocator<char>>
|
|
171
|
+
struct packed_matrix {
|
|
172
|
+
using value_type = value_type_;
|
|
173
|
+
using result_type = typename value_type_::dot_result_t;
|
|
174
|
+
using allocator_type = allocator_type_;
|
|
175
|
+
using alloc_traits = std::allocator_traits<allocator_type_>;
|
|
176
|
+
using size_type = std::size_t;
|
|
177
|
+
|
|
178
|
+
private:
|
|
179
|
+
char *data_ = nullptr;
|
|
180
|
+
size_type size_bytes_ = 0;
|
|
181
|
+
size_type rows_ = 0; // n (number of rows in B)
|
|
182
|
+
size_type depth_ = 0; // k (number of columns in B)
|
|
183
|
+
[[no_unique_address]] allocator_type_ alloc_;
|
|
184
|
+
|
|
185
|
+
public:
|
|
186
|
+
packed_matrix() noexcept = default;
|
|
187
|
+
|
|
188
|
+
explicit packed_matrix(allocator_type_ const &alloc) noexcept : alloc_(alloc) {}
|
|
189
|
+
|
|
190
|
+
~packed_matrix() noexcept {
|
|
191
|
+
if (data_) alloc_traits::deallocate(alloc_, data_, size_bytes_);
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
packed_matrix(packed_matrix &&other) noexcept
|
|
195
|
+
: data_(std::exchange(other.data_, nullptr)), size_bytes_(std::exchange(other.size_bytes_, 0)),
|
|
196
|
+
rows_(std::exchange(other.rows_, 0)), depth_(std::exchange(other.depth_, 0)),
|
|
197
|
+
alloc_(std::move(other.alloc_)) {}
|
|
198
|
+
|
|
199
|
+
packed_matrix &operator=(packed_matrix &&other) noexcept {
|
|
200
|
+
if (this != &other) {
|
|
201
|
+
if (data_) alloc_traits::deallocate(alloc_, data_, size_bytes_);
|
|
202
|
+
if constexpr (alloc_traits::propagate_on_container_move_assignment::value) alloc_ = std::move(other.alloc_);
|
|
203
|
+
data_ = std::exchange(other.data_, nullptr);
|
|
204
|
+
size_bytes_ = std::exchange(other.size_bytes_, 0);
|
|
205
|
+
rows_ = std::exchange(other.rows_, 0);
|
|
206
|
+
depth_ = std::exchange(other.depth_, 0);
|
|
207
|
+
}
|
|
208
|
+
return *this;
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
packed_matrix(packed_matrix const &) = delete;
|
|
212
|
+
packed_matrix &operator=(packed_matrix const &) = delete;
|
|
213
|
+
|
|
214
|
+
/**
|
|
215
|
+
* @brief Pack a 2D matrix_view into cache-efficient layout.
|
|
216
|
+
* @param b 2D matrix view. Uses extents[0] as rows, extents[1] as depth.
|
|
217
|
+
* @param alloc Allocator instance.
|
|
218
|
+
* @return Non-empty packed_matrix on success, empty on failure.
|
|
219
|
+
*/
|
|
220
|
+
[[nodiscard]] static packed_matrix try_pack(matrix_view<value_type_> b, allocator_type_ alloc = {}) noexcept {
|
|
221
|
+
packed_matrix pm(alloc);
|
|
222
|
+
if (b.rank() < 2) return pm;
|
|
223
|
+
|
|
224
|
+
pm.rows_ = b.extent(0);
|
|
225
|
+
pm.depth_ = b.extent(1);
|
|
226
|
+
pm.size_bytes_ = dots_packed_size<value_type_>(pm.rows_, pm.depth_);
|
|
227
|
+
if (pm.size_bytes_ == 0) return pm;
|
|
228
|
+
|
|
229
|
+
pm.data_ = alloc_traits::allocate(pm.alloc_, pm.size_bytes_);
|
|
230
|
+
if (!pm.data_) {
|
|
231
|
+
pm.size_bytes_ = 0;
|
|
232
|
+
return pm;
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
dots_pack<value_type_>(b.data(), pm.rows_, pm.depth_, static_cast<size_type>(b.stride_bytes(0)), pm.data_);
|
|
236
|
+
return pm;
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
/** @brief Number of rows in the packed matrix (n). */
|
|
240
|
+
constexpr size_type rows() const noexcept { return rows_; }
|
|
241
|
+
|
|
242
|
+
/** @brief Number of columns / depth (k). */
|
|
243
|
+
constexpr size_type depth() const noexcept { return depth_; }
|
|
244
|
+
|
|
245
|
+
/** @brief Size of the packed buffer in bytes. */
|
|
246
|
+
constexpr size_type size_bytes() const noexcept { return size_bytes_; }
|
|
247
|
+
|
|
248
|
+
/** @brief True if no matrix is packed. */
|
|
249
|
+
constexpr bool empty() const noexcept { return data_ == nullptr; }
|
|
250
|
+
|
|
251
|
+
/** @brief Raw pointer to the packed data. */
|
|
252
|
+
constexpr void const *data() const noexcept { return data_; }
|
|
253
|
+
};
|
|
254
|
+
|
|
255
|
+
/**
|
|
256
|
+
* @brief Pre-packed vector set for MaxSim (ColBERT late-interaction).
|
|
257
|
+
*
|
|
258
|
+
* MaxSim computes Σᵢ minⱼ angular(qᵢ, dⱼ) using quantized i8 screening
|
|
259
|
+
* followed by full-precision refinement. Both queries and documents must
|
|
260
|
+
* be independently packed before calling `maxsim()`.
|
|
261
|
+
*
|
|
262
|
+
* Supported types: bf16_t, f32_t, f16_t.
|
|
263
|
+
*/
|
|
264
|
+
template <numeric_dtype value_type_, typename allocator_type_ = aligned_allocator<char>>
|
|
265
|
+
class packed_maxsim {
|
|
266
|
+
using alloc_traits = std::allocator_traits<allocator_type_>;
|
|
267
|
+
|
|
268
|
+
char *data_ = nullptr;
|
|
269
|
+
std::size_t size_bytes_ = 0;
|
|
270
|
+
std::size_t vector_count_ = 0;
|
|
271
|
+
std::size_t depth_ = 0;
|
|
272
|
+
[[no_unique_address]] allocator_type_ alloc_;
|
|
273
|
+
|
|
274
|
+
public:
|
|
275
|
+
packed_maxsim() noexcept = default;
|
|
276
|
+
explicit packed_maxsim(allocator_type_ const &alloc) noexcept : alloc_(alloc) {}
|
|
277
|
+
|
|
278
|
+
~packed_maxsim() noexcept {
|
|
279
|
+
if (data_) alloc_traits::deallocate(alloc_, data_, size_bytes_);
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
packed_maxsim(packed_maxsim &&o) noexcept
|
|
283
|
+
: data_(std::exchange(o.data_, nullptr)), size_bytes_(std::exchange(o.size_bytes_, 0)),
|
|
284
|
+
vector_count_(std::exchange(o.vector_count_, 0)), depth_(std::exchange(o.depth_, 0)),
|
|
285
|
+
alloc_(std::move(o.alloc_)) {}
|
|
286
|
+
|
|
287
|
+
packed_maxsim &operator=(packed_maxsim &&o) noexcept {
|
|
288
|
+
if (this != &o) {
|
|
289
|
+
if (data_) alloc_traits::deallocate(alloc_, data_, size_bytes_);
|
|
290
|
+
if constexpr (alloc_traits::propagate_on_container_move_assignment::value) alloc_ = std::move(o.alloc_);
|
|
291
|
+
data_ = std::exchange(o.data_, nullptr);
|
|
292
|
+
size_bytes_ = std::exchange(o.size_bytes_, 0);
|
|
293
|
+
vector_count_ = std::exchange(o.vector_count_, 0);
|
|
294
|
+
depth_ = std::exchange(o.depth_, 0);
|
|
295
|
+
}
|
|
296
|
+
return *this;
|
|
297
|
+
}
|
|
298
|
+
|
|
299
|
+
packed_maxsim(packed_maxsim const &) = delete;
|
|
300
|
+
packed_maxsim &operator=(packed_maxsim const &) = delete;
|
|
301
|
+
|
|
302
|
+
/** @brief Pack a 2D matrix of vectors. Returns empty on failure. */
|
|
303
|
+
[[nodiscard]] static packed_maxsim try_pack(matrix_view<value_type_> vectors, allocator_type_ alloc = {}) noexcept {
|
|
304
|
+
packed_maxsim pm(alloc);
|
|
305
|
+
if (vectors.rank() < 2) return pm;
|
|
306
|
+
|
|
307
|
+
pm.vector_count_ = vectors.extent(0);
|
|
308
|
+
pm.depth_ = vectors.extent(1);
|
|
309
|
+
pm.size_bytes_ = maxsim_packed_size<value_type_>(pm.vector_count_, pm.depth_);
|
|
310
|
+
if (pm.size_bytes_ == 0) return pm;
|
|
311
|
+
|
|
312
|
+
pm.data_ = alloc_traits::allocate(pm.alloc_, pm.size_bytes_);
|
|
313
|
+
if (!pm.data_) {
|
|
314
|
+
pm.size_bytes_ = 0;
|
|
315
|
+
return pm;
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
maxsim_pack<value_type_>(reinterpret_cast<typename value_type_::raw_t const *>(vectors.data()),
|
|
319
|
+
pm.vector_count_, pm.depth_, static_cast<std::size_t>(vectors.stride_bytes(0)),
|
|
320
|
+
pm.data_);
|
|
321
|
+
return pm;
|
|
322
|
+
}
|
|
323
|
+
|
|
324
|
+
std::size_t vector_count() const noexcept { return vector_count_; } // Number of packed vectors (rows).
std::size_t rows() const noexcept { return vector_count_; }         // Alias of `vector_count()`.
std::size_t depth() const noexcept { return depth_; }               // Dimensions per vector.
bool empty() const noexcept { return data_ == nullptr; }            // True when nothing was packed or packing failed.
void const *data() const noexcept { return data_; }                 // Read-only pointer to the packed buffer.
std::size_t size_bytes() const noexcept { return size_bytes_; }     // Packed buffer size in bytes.
|
|
330
|
+
};
|
|
331
|
+
|
|
332
|
+
#pragma endregion - Packed Containers
|
|
333
|
+
|
|
334
|
+
} // namespace ashvardanian::numkong
|
|
335
|
+
|
|
336
|
+
#endif // NK_MATRIX_HPP
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
# MaxSim Late-Interaction Scoring in NumKong
|
|
2
|
+
|
|
3
|
+
NumKong implements ColBERT-style late-interaction scoring: the MaxSim score sums, over each query token, the minimum angular distance to any document token. A two-stage coarse-to-fine strategy uses i8-quantized screening to find the best document per query, then full-precision refinement computes the final angular distance.
|
|
4
|
+
|
|
5
|
+
MaxSim score:
|
|
6
|
+
|
|
7
|
+
```math
|
|
8
|
+
\text{MaxSim}(Q, D) = \sum_{i=0}^{m-1} \min_{j=0}^{n-1} \text{angular}(q_i, d_j)
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Coarse screening finds the best document via i8 dot products as a proxy for argmin angular:
|
|
12
|
+
|
|
13
|
+
```math
|
|
14
|
+
j^* = \arg\max_j \text{dot}_{\text{i8}}(q_i, d_j)
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
Full-precision refinement:
|
|
18
|
+
|
|
19
|
+
```math
|
|
20
|
+
\text{angular}(q_i, d_{j^*}) = 1 - \frac{\text{dot}(q_i, d_{j^*})}{\|q_i\| \cdot \|d_{j^*}\|}
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
Reformulating as Python pseudocode:
|
|
24
|
+
|
|
25
|
+
```python
|
|
26
|
+
import numpy as np
|
|
27
|
+
|
|
28
|
+
def maxsim(queries: np.ndarray, documents: np.ndarray) -> float:
|
|
29
|
+
score = 0.0
|
|
30
|
+
for q in queries:
|
|
31
|
+
dots = documents @ q
|
|
32
|
+
best = np.argmax(dots)
|
|
33
|
+
d = documents[best]
|
|
34
|
+
angular = 1 - np.dot(q, d) / (np.linalg.norm(q) * np.linalg.norm(d))
|
|
35
|
+
score += angular
|
|
36
|
+
return score
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Input & Output Types
|
|
40
|
+
|
|
41
|
+
| Input Type | Output Type | Description |
|
|
42
|
+
| ---------- | ----------- | ---------------------------------- |
|
|
43
|
+
| `bf16` | `f32` | 16-bit brain float, widened output |
|
|
44
|
+
| `f32` | `f32` | 32-bit IEEE 754 single precision |
|
|
45
|
+
| `f16` | `f32` | 16-bit IEEE 754 half precision |
|
|
46
|
+
|
|
47
|
+
## Optimizations
|
|
48
|
+
|
|
49
|
+
### Dual Pre-Packing Advantage
|
|
50
|
+
|
|
51
|
+
`nk_maxsim_packed_bf16_sme`, `nk_maxsim_packed_f32_sme` benefit from having _both_ query and document matrices pre-packed into identical contiguous formats, unlike the `nk_dots_packed_*` family where only B is pre-packed and A is accessed with arbitrary stride.
|
|
52
|
+
In the dots GEMM, one ZA tile must be reserved for A-side staging (loading unpacked A rows into the tile array), leaving 3 ZA tiles for accumulation.
|
|
53
|
+
With both sides pre-packed, all 4 ZA tiles (ZA0–ZA3) serve as accumulators — a +33% increase in MOPA throughput.
|
|
54
|
+
No output matrix materialization: dots_packed writes a full M×N f32 result matrix, while maxsim reduces each query row to a single argmax index in-flight, eliminating the M×N memory round-trip.
|
|
55
|
+
Benchmark data (Apple M4, SVL=512):
|
|
56
|
+
|
|
57
|
+
| Dimensions | dots_packed GEMM | maxsim fused | GEMM speedup | End-to-end |
|
|
58
|
+
| -------------------- | ---------------: | -----------: | -----------: | ---------: |
|
|
59
|
+
| 32×128×128 (ColBERT) | 840 GFLOPS | 1516 GFLOPS | 1.81× | 5.10× |
|
|
60
|
+
| 32×256×128 | 1037 GFLOPS | 1591 GFLOPS | 1.53× | 5.17× |
|
|
61
|
+
| 64×512×128 | 1016 GFLOPS | 1651 GFLOPS | 1.62× | 5.42× |
|
|
62
|
+
| 32×128×256 | 859 GFLOPS | 1725 GFLOPS | 2.01× | 4.06× |
|
|
63
|
+
| 32×1024×768 (BERT) | 1124 GFLOPS | 1932 GFLOPS | 1.72× | 2.61× |
|
|
64
|
+
|
|
65
|
+
End-to-end speedup (2.6–5.4×) exceeds GEMM-only speedup (1.5–2×) because maxsim eliminates output materialization and fuses argmax+angular refinement into the tile extraction loop.
|
|
66
|
+
|
|
67
|
+
### Two-Stage Coarse-to-Fine Scoring
|
|
68
|
+
|
|
69
|
+
All backends use i8-quantized coarse screening at O(m·n·k) with 1 byte/element instead of 2–4, followed by full-precision refinement at O(m·k) for only the winning pairs.
|
|
70
|
+
Break-even at ~4 documents per query — beyond that, coarse screening dominates and the i8 bandwidth advantage compounds.
|
|
71
|
+
|
|
72
|
+
### ISA-Specific Quantization Ranges
|
|
73
|
+
|
|
74
|
+
Haswell uses [-79, 79] — `VPMADDUBSW` produces i16 intermediates, so each pairwise sum must avoid saturation (after the 0x80 shift, 2 × 207 × 79 = 32,706 < 32,767).
|
|
75
|
+
Alder Lake and Ice Lake use [-127, 127] — `VPDPBUSD` accumulates directly to i32, no i16 bottleneck.
|
|
76
|
+
WASM v128relaxed uses [-63, 63] — `i32x4_relaxed_dot_i8x16_i7x16_add` requires 7-bit operands.
|
|
77
|
+
Serial uses [-127, 127].
|
|
78
|
+
|
|
79
|
+
### XOR-0x80 Bias Correction
|
|
80
|
+
|
|
81
|
+
`nk_maxsim_packed_bf16_haswell`, `nk_maxsim_packed_f32_alder`, `nk_maxsim_packed_f32_icelake` work around the unsigned×signed operand requirement of `VPMADDUBSW` and `VPDPBUSD`.
|
|
82
|
+
Both query and document are signed after quantization, so queries are XOR'd with `0x80` to shift to unsigned range.
|
|
83
|
+
Post-multiply correction subtracts $128 \cdot \text{sum\_i8}(d_j)$ per document, where sums are precomputed in packed metadata.
|
|
84
|
+
|
|
85
|
+
### Vertical Column Extraction on SME
|
|
86
|
+
|
|
87
|
+
`nk_maxsim_packed_bf16_sme`, `nk_maxsim_packed_f32_sme` accumulate Q×D dot products into ZA tiles (4 tiles ZA0–ZA3, each SVL×SVL).
|
|
88
|
+
The argmax operation needs to find the best document for each query.
|
|
89
|
+
The naive approach reads rows horizontally (`svread_hor_za32`) and reduces each row with `svmaxv` — but `svmaxv` is a horizontal reduction costing ~8 cycles on typical SVE implementations.
|
|
90
|
+
Vertical column extraction flips the access pattern: `svread_ver_za32_f32_m` reads one _column_ of ZA, returning one dot-product score per query for a single document.
|
|
91
|
+
Element-wise `svcmpgt_f32` + `svsel_f32` (~1 cycle each) update the running maximum across all queries simultaneously.
|
|
92
|
+
For 32 queries × 256 documents: horizontal approach = 32 × 256 × `svmaxv` = 8,192 horizontal reductions; vertical approach = 256 column reads × 1 element-wise `svmax` = 256 vertical reads + 256 comparisons (~270 cycles vs ~2,048 cycles for the argmax phase alone).
|
|
93
|
+
The argmax index is tracked in-flight using `svsel` to conditionally update an index vector alongside the maximum values — no separate argmax pass needed.
|
|
94
|
+
After finding the best document index per query, full-precision angular refinement uses the originals stored in the packed buffer's third region.
|
|
95
|
+
|
|
96
|
+
### Three-Region Packed Buffer
|
|
97
|
+
|
|
98
|
+
All backends use a three-region packed buffer layout: [Header 64B] [i8 vectors, 64B-aligned] [metadata, 64B-aligned] [originals, 64B-aligned].
|
|
99
|
+
Per-vector metadata (12 bytes) stores quantization scale, i8 sum (for bias correction), and inverse norm (for angular finalization).
|
|
100
|
+
The originals region stores full-precision vectors for refinement via existing `nk_dot_*` primitives.
|
|
101
|
+
|
|
102
|
+
## Performance
|
|
103
|
+
|
|
104
|
+
The following performance tables are produced by manually re-running the internal `nk_test` and `nk_bench` tools, measuring both accuracy and throughput at different input shapes.
|
|
105
|
+
The input size is controlled by `NK_MATRIX_HEIGHT`, `NK_MATRIX_WIDTH`, and `NK_MATRIX_DEPTH` environment variables, all set to the same value for late-interaction scoring over square matrices.
|
|
106
|
+
Columns show throughput for 256³, 1024³, and 4096³ configurations.
|
|
107
|
+
The throughput is measured in GSO/s as Giga Scalar Operations per Second, with $\text{ops} = 2 \cdot M \cdot N \cdot K$ complexity for scoring $M$ query tokens against $N$ document tokens of dimension $K$.
|
|
108
|
+
Accuracy is reported as mean ULP (units in last place) unless noted otherwise — the average number of representable floating-point values between the result and the exact answer.
|
|
109
|
+
Each kernel runs for at least 20 seconds per configuration.
|
|
110
|
+
Benchmark threads are pinned to specific cores; on machines with heterogeneous core types (e.g., Apple P/E cores), only the fastest cores are used.
|
|
111
|
+
Workloads that significantly degrade CPU frequencies (Intel AMX, Apple SME) run in separate passes to avoid affecting throughput measurements of other kernels.
|
|
112
|
+
|
|
113
|
+
### Intel Sapphire Rapids
|
|
114
|
+
|
|
115
|
+
#### Native
|
|
116
|
+
|
|
117
|
+
| Kernel | 256³ | 1024³ | 4096³ |
|
|
118
|
+
| :---------------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
119
|
+
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
120
|
+
| `nk_maxsim_packed_f32_serial` | 15.7 gso/s, 48.9K ulp | 15.2 gso/s, 48.9K ulp | 16.3 gso/s, 48.9K ulp |
|
|
121
|
+
| `nk_maxsim_packed_f32_haswell` | 77.2 gso/s, 49.3K ulp | 70.7 gso/s, 49.3K ulp | 74.5 gso/s, 49.3K ulp |
|
|
122
|
+
| `nk_maxsim_packed_f32_alder` | 99.7 gso/s, 48.9K ulp | 97.7 gso/s, 48.9K ulp | 94.5 gso/s, 48.9K ulp |
|
|
123
|
+
| `nk_maxsim_packed_f32_icelake` | 131 gso/s, 48.9K ulp | 124 gso/s, 48.9K ulp | 136 gso/s, 48.9K ulp |
|
|
124
|
+
| `nk_maxsim_packed_f32_sapphireamx` | 273 gso/s, 48.9K ulp | 293 gso/s, 48.9K ulp | 285 gso/s, 48.9K ulp |
|
|
125
|
+
| __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
126
|
+
| `nk_maxsim_packed_bf16_serial` | 15.9 gso/s, 49.0K ulp | 17.0 gso/s, 49.0K ulp | 15.3 gso/s, 49.0K ulp |
|
|
127
|
+
| `nk_maxsim_packed_bf16_haswell` | 79.2 gso/s, 49.3K ulp | 85.0 gso/s, 49.3K ulp | 81.0 gso/s, 49.3K ulp |
|
|
128
|
+
| `nk_maxsim_packed_bf16_alder` | 114 gso/s, 49.0K ulp | 110 gso/s, 49.0K ulp | 115 gso/s, 49.0K ulp |
|
|
129
|
+
| `nk_maxsim_packed_bf16_genoa` | 163 gso/s, 49.0K ulp | 165 gso/s, 49.0K ulp | 174 gso/s, 49.0K ulp |
|
|
130
|
+
| `nk_maxsim_packed_bf16_sapphireamx` | 418 gso/s, 994 ulp | 418 gso/s, 994 ulp | 445 gso/s, 994 ulp |
|
|
131
|
+
| __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
132
|
+
| `nk_maxsim_packed_f16_serial` | 15.5 gso/s, 49.4K ulp | 15.6 gso/s, 49.4K ulp | 16.9 gso/s, 49.4K ulp |
|
|
133
|
+
| `nk_maxsim_packed_f16_haswell` | 79.1 gso/s, 49.8K ulp | 78.1 gso/s, 49.8K ulp | 79.1 gso/s, 49.8K ulp |
|
|
134
|
+
| `nk_maxsim_packed_f16_alder` | 113 gso/s, 49.4K ulp | 112 gso/s, 49.4K ulp | 107 gso/s, 49.4K ulp |
|
|
135
|
+
| `nk_maxsim_packed_f16_icelake` | 154 gso/s, 49.4K ulp | 164 gso/s, 49.4K ulp | 163 gso/s, 49.4K ulp |
|
|
136
|
+
| `nk_maxsim_packed_f16_sapphireamx` | 339 gso/s, 49.5K ulp | 395 gso/s, 49.5K ulp | 381 gso/s, 49.5K ulp |
|
|
137
|
+
|
|
138
|
+
#### WASM
|
|
139
|
+
|
|
140
|
+
Measured with Wasmtime v42 (Cranelift backend).
|
|
141
|
+
|
|
142
|
+
| Kernel | 256³ | 1024³ | 4096³ |
|
|
143
|
+
| :---------------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
144
|
+
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
145
|
+
| `nk_maxsim_packed_f32_serial` | ? gso/s, 46.8K ulp | ? gso/s, 46.8K ulp | ? gso/s, 46.8K ulp |
|
|
146
|
+
| `nk_maxsim_packed_f32_v128relaxed` | ? gso/s, 1.58M ulp | ? gso/s, 1.58M ulp | ? gso/s, 1.58M ulp |
|
|
147
|
+
| __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
148
|
+
| `nk_maxsim_packed_bf16_serial` | ? gso/s, 47.0K ulp | ? gso/s, 47.0K ulp | ? gso/s, 47.0K ulp |
|
|
149
|
+
| `nk_maxsim_packed_bf16_v128relaxed` | ? gso/s, 1.58M ulp | ? gso/s, 1.58M ulp | ? gso/s, 1.58M ulp |
|
|
150
|
+
| __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
151
|
+
| `nk_maxsim_packed_f16_serial` | ? gso/s, 46.4K ulp | ? gso/s, 46.4K ulp | ? gso/s, 46.4K ulp |
|
|
152
|
+
| `nk_maxsim_packed_f16_v128relaxed` | ? gso/s, 1.58M ulp | ? gso/s, 1.58M ulp | ? gso/s, 1.58M ulp |
|
|
153
|
+
|
|
154
|
+
### Apple M4
|
|
155
|
+
|
|
156
|
+
#### Native
|
|
157
|
+
|
|
158
|
+
| Kernel | 256³ | 1024³ | 4096³ |
|
|
159
|
+
| :------------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
160
|
+
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
161
|
+
| `nk_maxsim_packed_f32_serial` | 124 gso/s, 166K ulp | 136 gso/s, 104K ulp | 130 gso/s, 55.1K ulp |
|
|
162
|
+
| `nk_maxsim_packed_f32_neonsdot` | 170 gso/s, 167K ulp | 240 gso/s, 104K ulp | 167 gso/s, 55.1K ulp |
|
|
163
|
+
| `nk_maxsim_packed_f32_sme` | 291 gso/s, 200K ulp | 1,800 gso/s, 64.6K ulp | ? gso/s, ? ulp |
|
|
164
|
+
| __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
165
|
+
| `nk_maxsim_packed_bf16_serial` | 135 gso/s, 167K ulp | 139 gso/s, 105K ulp | 132 gso/s, 54.8K ulp |
|
|
166
|
+
| `nk_maxsim_packed_bf16_neonsdot` | 192 gso/s, 167K ulp | 257 gso/s, 105K ulp | 161 gso/s, 54.8K ulp |
|
|
167
|
+
| `nk_maxsim_packed_bf16_sme` | 580 gso/s, 16.1K ulp | 1,620 gso/s, 735 ulp | ? gso/s, ? ulp |
|
|
168
|
+
| __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
169
|
+
| `nk_maxsim_packed_f16_serial` | 136 gso/s, 169K ulp | 140 gso/s, 104K ulp | 134 gso/s, 55.1K ulp |
|
|
170
|
+
| `nk_maxsim_packed_f16_neonsdot` | 193 gso/s, 166K ulp | 255 gso/s, 104K ulp | 172 gso/s, 55.1K ulp |
|
|
171
|
+
| `nk_maxsim_packed_f16_sme` | 573 gso/s, 16.0K ulp | 1,620 gso/s, 725 ulp | ? gso/s, ? ulp |
|
|
172
|
+
|
|
173
|
+
#### WASM
|
|
174
|
+
|
|
175
|
+
Measured with Wasmtime v42 (Cranelift backend).
|
|
176
|
+
|
|
177
|
+
| Kernel | 256³ | 1024³ | 4096³ |
|
|
178
|
+
| :---------------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
179
|
+
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
180
|
+
| `nk_maxsim_packed_f32_serial` | 9.22 gso/s, 46.8K ulp | 10.1 gso/s, 46.8K ulp | 10.5 gso/s, 46.8K ulp |
|
|
181
|
+
| `nk_maxsim_packed_f32_v128relaxed` | 28.9 gso/s, 46.0K ulp | 31.2 gso/s, 46.0K ulp | 32.0 gso/s, 46.0K ulp |
|
|
182
|
+
| __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
183
|
+
| `nk_maxsim_packed_bf16_serial` | 8.95 gso/s, 49.2K ulp | 10.1 gso/s, 49.2K ulp | 10.0 gso/s, 49.2K ulp |
|
|
184
|
+
| `nk_maxsim_packed_bf16_v128relaxed` | 29.6 gso/s, 49.4K ulp | 31.9 gso/s, 49.4K ulp | 31.6 gso/s, 49.4K ulp |
|
|
185
|
+
| __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
186
|
+
| `nk_maxsim_packed_f16_serial` | 9.21 gso/s, 49.5K ulp | 10.3 gso/s, 49.5K ulp | 10.6 gso/s, 49.5K ulp |
|
|
187
|
+
| `nk_maxsim_packed_f16_v128relaxed` | 27.2 gso/s, 49.3K ulp | 33.7 gso/s, 49.3K ulp | 31.5 gso/s, 49.3K ulp |
|