npm - numkong - Versions diffs - 7.0.0 - Mend

numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (294) hide show

package/LICENSE +201 -0
package/README.md +495 -0
package/binding.gyp +540 -0
package/c/dispatch.h +512 -0
package/c/dispatch_bf16.c +389 -0
package/c/dispatch_bf16c.c +52 -0
package/c/dispatch_e2m3.c +263 -0
package/c/dispatch_e3m2.c +243 -0
package/c/dispatch_e4m3.c +276 -0
package/c/dispatch_e5m2.c +272 -0
package/c/dispatch_f16.c +376 -0
package/c/dispatch_f16c.c +58 -0
package/c/dispatch_f32.c +378 -0
package/c/dispatch_f32c.c +99 -0
package/c/dispatch_f64.c +296 -0
package/c/dispatch_f64c.c +98 -0
package/c/dispatch_i16.c +96 -0
package/c/dispatch_i32.c +89 -0
package/c/dispatch_i4.c +150 -0
package/c/dispatch_i64.c +86 -0
package/c/dispatch_i8.c +289 -0
package/c/dispatch_other.c +330 -0
package/c/dispatch_u1.c +148 -0
package/c/dispatch_u16.c +124 -0
package/c/dispatch_u32.c +118 -0
package/c/dispatch_u4.c +150 -0
package/c/dispatch_u64.c +102 -0
package/c/dispatch_u8.c +303 -0
package/c/numkong.c +950 -0
package/include/README.md +573 -0
package/include/module.modulemap +129 -0
package/include/numkong/attention/sapphireamx.h +1361 -0
package/include/numkong/attention/sme.h +2066 -0
package/include/numkong/attention.h +49 -0
package/include/numkong/capabilities.h +748 -0
package/include/numkong/cast/README.md +262 -0
package/include/numkong/cast/haswell.h +975 -0
package/include/numkong/cast/icelake.h +470 -0
package/include/numkong/cast/neon.h +1192 -0
package/include/numkong/cast/rvv.h +1021 -0
package/include/numkong/cast/sapphire.h +262 -0
package/include/numkong/cast/serial.h +2262 -0
package/include/numkong/cast/skylake.h +856 -0
package/include/numkong/cast/v128relaxed.h +180 -0
package/include/numkong/cast.h +230 -0
package/include/numkong/curved/README.md +223 -0
package/include/numkong/curved/genoa.h +182 -0
package/include/numkong/curved/haswell.h +276 -0
package/include/numkong/curved/neon.h +205 -0
package/include/numkong/curved/neonbfdot.h +212 -0
package/include/numkong/curved/neonhalf.h +212 -0
package/include/numkong/curved/rvv.h +305 -0
package/include/numkong/curved/serial.h +207 -0
package/include/numkong/curved/skylake.h +457 -0
package/include/numkong/curved/smef64.h +506 -0
package/include/numkong/curved.h +517 -0
package/include/numkong/curved.hpp +144 -0
package/include/numkong/dot/README.md +425 -0
package/include/numkong/dot/alder.h +563 -0
package/include/numkong/dot/genoa.h +315 -0
package/include/numkong/dot/haswell.h +1688 -0
package/include/numkong/dot/icelake.h +883 -0
package/include/numkong/dot/neon.h +818 -0
package/include/numkong/dot/neonbfdot.h +244 -0
package/include/numkong/dot/neonfhm.h +360 -0
package/include/numkong/dot/neonhalf.h +198 -0
package/include/numkong/dot/neonsdot.h +508 -0
package/include/numkong/dot/rvv.h +714 -0
package/include/numkong/dot/rvvbb.h +72 -0
package/include/numkong/dot/rvvbf16.h +123 -0
package/include/numkong/dot/rvvhalf.h +129 -0
package/include/numkong/dot/sapphire.h +141 -0
package/include/numkong/dot/serial.h +838 -0
package/include/numkong/dot/sierra.h +405 -0
package/include/numkong/dot/skylake.h +1084 -0
package/include/numkong/dot/sve.h +379 -0
package/include/numkong/dot/svebfdot.h +74 -0
package/include/numkong/dot/svehalf.h +123 -0
package/include/numkong/dot/v128relaxed.h +1258 -0
package/include/numkong/dot.h +1070 -0
package/include/numkong/dot.hpp +94 -0
package/include/numkong/dots/README.md +496 -0
package/include/numkong/dots/alder.h +114 -0
package/include/numkong/dots/genoa.h +94 -0
package/include/numkong/dots/haswell.h +295 -0
package/include/numkong/dots/icelake.h +171 -0
package/include/numkong/dots/neon.h +120 -0
package/include/numkong/dots/neonbfdot.h +58 -0
package/include/numkong/dots/neonfhm.h +94 -0
package/include/numkong/dots/neonhalf.h +57 -0
package/include/numkong/dots/neonsdot.h +108 -0
package/include/numkong/dots/rvv.h +2486 -0
package/include/numkong/dots/sapphireamx.h +3973 -0
package/include/numkong/dots/serial.h +2844 -0
package/include/numkong/dots/sierra.h +97 -0
package/include/numkong/dots/skylake.h +196 -0
package/include/numkong/dots/sme.h +5372 -0
package/include/numkong/dots/smebi32.h +461 -0
package/include/numkong/dots/smef64.h +1318 -0
package/include/numkong/dots/smehalf.h +47 -0
package/include/numkong/dots/v128relaxed.h +294 -0
package/include/numkong/dots.h +2804 -0
package/include/numkong/dots.hpp +639 -0
package/include/numkong/each/README.md +469 -0
package/include/numkong/each/haswell.h +1658 -0
package/include/numkong/each/icelake.h +272 -0
package/include/numkong/each/neon.h +1104 -0
package/include/numkong/each/neonbfdot.h +212 -0
package/include/numkong/each/neonhalf.h +410 -0
package/include/numkong/each/rvv.h +1121 -0
package/include/numkong/each/sapphire.h +477 -0
package/include/numkong/each/serial.h +260 -0
package/include/numkong/each/skylake.h +1562 -0
package/include/numkong/each.h +2146 -0
package/include/numkong/each.hpp +434 -0
package/include/numkong/geospatial/README.md +147 -0
package/include/numkong/geospatial/haswell.h +593 -0
package/include/numkong/geospatial/neon.h +571 -0
package/include/numkong/geospatial/rvv.h +701 -0
package/include/numkong/geospatial/serial.h +309 -0
package/include/numkong/geospatial/skylake.h +577 -0
package/include/numkong/geospatial/v128relaxed.h +613 -0
package/include/numkong/geospatial.h +453 -0
package/include/numkong/geospatial.hpp +235 -0
package/include/numkong/matrix.hpp +336 -0
package/include/numkong/maxsim/README.md +187 -0
package/include/numkong/maxsim/alder.h +511 -0
package/include/numkong/maxsim/genoa.h +115 -0
package/include/numkong/maxsim/haswell.h +553 -0
package/include/numkong/maxsim/icelake.h +480 -0
package/include/numkong/maxsim/neonsdot.h +394 -0
package/include/numkong/maxsim/sapphireamx.h +877 -0
package/include/numkong/maxsim/serial.h +490 -0
package/include/numkong/maxsim/sme.h +929 -0
package/include/numkong/maxsim/v128relaxed.h +280 -0
package/include/numkong/maxsim.h +571 -0
package/include/numkong/maxsim.hpp +133 -0
package/include/numkong/mesh/README.md +227 -0
package/include/numkong/mesh/haswell.h +2235 -0
package/include/numkong/mesh/neon.h +1329 -0
package/include/numkong/mesh/neonbfdot.h +842 -0
package/include/numkong/mesh/neonhalf.h +616 -0
package/include/numkong/mesh/rvv.h +916 -0
package/include/numkong/mesh/serial.h +742 -0
package/include/numkong/mesh/skylake.h +1135 -0
package/include/numkong/mesh/v128relaxed.h +1052 -0
package/include/numkong/mesh.h +652 -0
package/include/numkong/mesh.hpp +762 -0
package/include/numkong/numkong.h +78 -0
package/include/numkong/numkong.hpp +57 -0
package/include/numkong/probability/README.md +173 -0
package/include/numkong/probability/haswell.h +267 -0
package/include/numkong/probability/neon.h +225 -0
package/include/numkong/probability/rvv.h +409 -0
package/include/numkong/probability/serial.h +169 -0
package/include/numkong/probability/skylake.h +324 -0
package/include/numkong/probability.h +383 -0
package/include/numkong/probability.hpp +120 -0
package/include/numkong/random.h +50 -0
package/include/numkong/random.hpp +285 -0
package/include/numkong/reduce/README.md +547 -0
package/include/numkong/reduce/alder.h +632 -0
package/include/numkong/reduce/genoa.h +201 -0
package/include/numkong/reduce/haswell.h +3783 -0
package/include/numkong/reduce/icelake.h +549 -0
package/include/numkong/reduce/neon.h +3841 -0
package/include/numkong/reduce/neonbfdot.h +353 -0
package/include/numkong/reduce/neonfhm.h +665 -0
package/include/numkong/reduce/neonhalf.h +157 -0
package/include/numkong/reduce/neonsdot.h +357 -0
package/include/numkong/reduce/rvv.h +3407 -0
package/include/numkong/reduce/serial.h +757 -0
package/include/numkong/reduce/sierra.h +338 -0
package/include/numkong/reduce/skylake.h +3792 -0
package/include/numkong/reduce/v128relaxed.h +2302 -0
package/include/numkong/reduce.h +1597 -0
package/include/numkong/reduce.hpp +633 -0
package/include/numkong/scalar/README.md +89 -0
package/include/numkong/scalar/haswell.h +113 -0
package/include/numkong/scalar/neon.h +122 -0
package/include/numkong/scalar/neonhalf.h +70 -0
package/include/numkong/scalar/rvv.h +211 -0
package/include/numkong/scalar/sapphire.h +63 -0
package/include/numkong/scalar/serial.h +332 -0
package/include/numkong/scalar/v128relaxed.h +56 -0
package/include/numkong/scalar.h +683 -0
package/include/numkong/set/README.md +179 -0
package/include/numkong/set/haswell.h +334 -0
package/include/numkong/set/icelake.h +485 -0
package/include/numkong/set/neon.h +364 -0
package/include/numkong/set/rvv.h +226 -0
package/include/numkong/set/rvvbb.h +117 -0
package/include/numkong/set/serial.h +174 -0
package/include/numkong/set/sve.h +185 -0
package/include/numkong/set/v128relaxed.h +240 -0
package/include/numkong/set.h +457 -0
package/include/numkong/set.hpp +114 -0
package/include/numkong/sets/README.md +149 -0
package/include/numkong/sets/haswell.h +63 -0
package/include/numkong/sets/icelake.h +66 -0
package/include/numkong/sets/neon.h +61 -0
package/include/numkong/sets/serial.h +43 -0
package/include/numkong/sets/smebi32.h +1099 -0
package/include/numkong/sets/v128relaxed.h +58 -0
package/include/numkong/sets.h +339 -0
package/include/numkong/sparse/README.md +156 -0
package/include/numkong/sparse/icelake.h +463 -0
package/include/numkong/sparse/neon.h +288 -0
package/include/numkong/sparse/serial.h +117 -0
package/include/numkong/sparse/sve2.h +507 -0
package/include/numkong/sparse/turin.h +322 -0
package/include/numkong/sparse.h +363 -0
package/include/numkong/sparse.hpp +113 -0
package/include/numkong/spatial/README.md +435 -0
package/include/numkong/spatial/alder.h +607 -0
package/include/numkong/spatial/genoa.h +290 -0
package/include/numkong/spatial/haswell.h +960 -0
package/include/numkong/spatial/icelake.h +586 -0
package/include/numkong/spatial/neon.h +773 -0
package/include/numkong/spatial/neonbfdot.h +165 -0
package/include/numkong/spatial/neonhalf.h +118 -0
package/include/numkong/spatial/neonsdot.h +261 -0
package/include/numkong/spatial/rvv.h +984 -0
package/include/numkong/spatial/rvvbf16.h +123 -0
package/include/numkong/spatial/rvvhalf.h +117 -0
package/include/numkong/spatial/sapphire.h +343 -0
package/include/numkong/spatial/serial.h +346 -0
package/include/numkong/spatial/sierra.h +323 -0
package/include/numkong/spatial/skylake.h +606 -0
package/include/numkong/spatial/sve.h +224 -0
package/include/numkong/spatial/svebfdot.h +122 -0
package/include/numkong/spatial/svehalf.h +109 -0
package/include/numkong/spatial/v128relaxed.h +717 -0
package/include/numkong/spatial.h +1425 -0
package/include/numkong/spatial.hpp +183 -0
package/include/numkong/spatials/README.md +580 -0
package/include/numkong/spatials/alder.h +94 -0
package/include/numkong/spatials/genoa.h +94 -0
package/include/numkong/spatials/haswell.h +219 -0
package/include/numkong/spatials/icelake.h +113 -0
package/include/numkong/spatials/neon.h +109 -0
package/include/numkong/spatials/neonbfdot.h +60 -0
package/include/numkong/spatials/neonfhm.h +92 -0
package/include/numkong/spatials/neonhalf.h +58 -0
package/include/numkong/spatials/neonsdot.h +109 -0
package/include/numkong/spatials/rvv.h +1960 -0
package/include/numkong/spatials/sapphireamx.h +1149 -0
package/include/numkong/spatials/serial.h +226 -0
package/include/numkong/spatials/sierra.h +96 -0
package/include/numkong/spatials/skylake.h +184 -0
package/include/numkong/spatials/sme.h +1901 -0
package/include/numkong/spatials/smef64.h +465 -0
package/include/numkong/spatials/v128relaxed.h +240 -0
package/include/numkong/spatials.h +3021 -0
package/include/numkong/spatials.hpp +508 -0
package/include/numkong/tensor.hpp +1592 -0
package/include/numkong/trigonometry/README.md +184 -0
package/include/numkong/trigonometry/haswell.h +652 -0
package/include/numkong/trigonometry/neon.h +639 -0
package/include/numkong/trigonometry/rvv.h +699 -0
package/include/numkong/trigonometry/serial.h +703 -0
package/include/numkong/trigonometry/skylake.h +721 -0
package/include/numkong/trigonometry/v128relaxed.h +666 -0
package/include/numkong/trigonometry.h +467 -0
package/include/numkong/trigonometry.hpp +166 -0
package/include/numkong/types.h +1384 -0
package/include/numkong/types.hpp +5603 -0
package/include/numkong/vector.hpp +698 -0
package/javascript/README.md +246 -0
package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
package/javascript/dist/cjs/numkong-wasm.js +617 -0
package/javascript/dist/cjs/numkong.d.ts +343 -0
package/javascript/dist/cjs/numkong.js +523 -0
package/javascript/dist/cjs/package.json +3 -0
package/javascript/dist/cjs/types.d.ts +284 -0
package/javascript/dist/cjs/types.js +653 -0
package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
package/javascript/dist/esm/numkong-wasm.js +595 -0
package/javascript/dist/esm/numkong.d.ts +343 -0
package/javascript/dist/esm/numkong.js +452 -0
package/javascript/dist/esm/package.json +3 -0
package/javascript/dist/esm/types.d.ts +284 -0
package/javascript/dist/esm/types.js +630 -0
package/javascript/dist-package-cjs.json +3 -0
package/javascript/dist-package-esm.json +3 -0
package/javascript/node-gyp-build.d.ts +1 -0
package/javascript/numkong-wasm.ts +756 -0
package/javascript/numkong.c +689 -0
package/javascript/numkong.ts +575 -0
package/javascript/tsconfig-base.json +39 -0
package/javascript/tsconfig-cjs.json +8 -0
package/javascript/tsconfig-esm.json +8 -0
package/javascript/types.ts +674 -0
package/package.json +87 -0

package/include/numkong/cast/v128relaxed.h ADDED Viewed

@@ -0,0 +1,180 @@
+/**
+ *  @brief SIMD-accelerated Type Conversions for WASM.
+ *  @file include/numkong/cast/v128relaxed.h
+ */
+#ifndef NK_CAST_V128RELAXED_H
+#define NK_CAST_V128RELAXED_H
+#if NK_TARGET_V128RELAXED
+#include "numkong/types.h"
+#include "numkong/cast/serial.h"
+#if defined(__cplusplus)
+extern "C" {
+#endif
+#if defined(__clang__)
+#pragma clang attribute push(__attribute__((target("relaxed-simd"))), apply_to = function)
+#endif
+/** @brief Fills `dst->v128` with the 16 bytes at `src` using a single `wasm_v128_load`. */
+NK_INTERNAL void nk_load_b128_v128relaxed_(void const *src, nk_b128_vec_t *dst) {
+    v128_t const loaded_v128 = wasm_v128_load(src);
+    dst->v128 = loaded_v128;
+}
+/** @brief Fills both 128-bit halves of `dst` from the 32 bytes at `src`, one `wasm_v128_load` per half. */
+NK_INTERNAL void nk_load_b256_v128relaxed_(void const *src, nk_b256_vec_t *dst) {
+    char const *src_bytes = (char const *)src;
+    // Lower half first, then the half 16 bytes further on
+    for (int half = 0; half != 2; ++half) dst->v128s[half] = wasm_v128_load(src_bytes + 16 * half);
+}
+/** @brief Writes the 16 bytes of `src->v128` to `dst` using a single `wasm_v128_store`. */
+NK_INTERNAL void nk_store_b128_v128relaxed_(nk_b128_vec_t const *src, void *dst) {
+    v128_t const stored_v128 = src->v128;
+    wasm_v128_store(dst, stored_v128);
+}
+/** @brief Writes both 128-bit halves of `src` to the 32 bytes at `dst`, one `wasm_v128_store` per half. */
+NK_INTERNAL void nk_store_b256_v128relaxed_(nk_b256_vec_t const *src, void *dst) {
+    char *dst_bytes = (char *)dst;
+    // Lower half first, then the half 16 bytes further on
+    for (int half = 0; half != 2; ++half) wasm_v128_store(dst_bytes + 16 * half, src->v128s[half]);
+}
+/**
+ *  @brief BF16→F32 for four lanes: a BF16 value is the top half of an F32, so each 16-bit
+ *  input is zero-extended to 32 bits and moved up by 16 bits.
+ */
+NK_INTERNAL nk_b128_vec_t nk_bf16x4_to_f32x4_v128relaxed_(nk_b64_vec_t bf16_vec) {
+    nk_b128_vec_t widened;
+    v128_t packed_u16x8 = wasm_v128_load64_zero(&bf16_vec.u64); // four inputs in the low 64 bits, rest zero
+    widened.v128 = wasm_i32x4_shl(wasm_u32x4_extend_low_u16x8(packed_u16x8), 16);
+    return widened;
+}
+/**
+ *  @brief F16→F32 for four lanes.  Each half is split into sign/exponent/mantissa, the exponent is
+ *  rebiased (F16 bias=15, F32 bias=127, delta=112) and the mantissa is widened from 10 to 23 bits.
+ *  Lanes with exponent 0 or 31 are patched afterwards; when no such lane exists the patch-up is
+ *  skipped entirely, including the expensive f32x4.convert_u32x4 used to normalize denormals.
+ */
+NK_INTERNAL nk_b128_vec_t nk_f16x4_to_f32x4_v128relaxed_(nk_b64_vec_t f16_vec) {
+    nk_b128_vec_t result;
+    // Widen the four packed halves so that each occupies its own 32-bit lane
+    v128_t half_bits_u32x4 = wasm_u32x4_extend_low_u16x8(wasm_v128_load64_zero(&f16_vec.u64));
+    // Field extraction: exponent is bits 14-10, mantissa is bits 9-0, sign is bit 15 (moved to F32 bit 31)
+    v128_t exponent_u32x4 = wasm_v128_and(wasm_u32x4_shr(half_bits_u32x4, 10), wasm_i32x4_splat(0x1F));
+    v128_t mantissa_u32x4 = wasm_v128_and(half_bits_u32x4, wasm_i32x4_splat(0x03FF));
+    v128_t sign_bit31_u32x4 = wasm_i32x4_shl(wasm_v128_and(half_bits_u32x4, wasm_i32x4_splat(0x8000)), 16);
+    // Candidate bits for normal lanes: rebiased exponent at bit 23, mantissa shifted up by 13
+    v128_t mantissa_bit13_u32x4 = wasm_i32x4_shl(mantissa_u32x4, 13);
+    v128_t exponent_bit23_u32x4 = wasm_i32x4_shl(wasm_i32x4_add(exponent_u32x4, wasm_i32x4_splat(112)), 23);
+    result.v128 = wasm_v128_or(sign_bit31_u32x4, wasm_v128_or(exponent_bit23_u32x4, mantissa_bit13_u32x4));
+    // Only lanes whose exponent is 0 (zero/denormal) or 31 (inf/NaN) need further work
+    v128_t exponent_is_zero_mask = wasm_i32x4_eq(exponent_u32x4, wasm_i32x4_splat(0));
+    v128_t exponent_is_max_mask = wasm_i32x4_eq(exponent_u32x4, wasm_i32x4_splat(31));
+    if (wasm_v128_any_true(wasm_v128_or(exponent_is_zero_mask, exponent_is_max_mask))) {
+        v128_t mantissa_is_zero_mask = wasm_i32x4_eq(mantissa_u32x4, wasm_i32x4_splat(0));
+        v128_t signed_zero_mask = wasm_v128_and(exponent_is_zero_mask, mantissa_is_zero_mask);  // exp=0, mant=0
+        v128_t denormal_mask = wasm_v128_andnot(exponent_is_zero_mask, mantissa_is_zero_mask);  // exp=0, mant!=0
+        // Denormals: mantissa as a float times 2^-24, so the FPU does the normalization
+        // instead of a manual CLZ+shift loop.  f32x4.convert_u32x4 legalizes to a
+        // multi-instruction sequence on x86 (no native u32→f32 until AVX-512).
+        v128_t denormal_abs_f32x4 = wasm_f32x4_mul(wasm_f32x4_convert_u32x4(mantissa_u32x4),
+                                                   wasm_f32x4_splat(0x1p-24f));
+        v128_t denormal_bits_u32x4 = wasm_v128_or(denormal_abs_f32x4, sign_bit31_u32x4);
+        // Inf/NaN: all-ones F32 exponent, mantissa payload carried over
+        v128_t inf_nan_bits_u32x4 = wasm_v128_or(
+            sign_bit31_u32x4, wasm_v128_or(wasm_i32x4_splat(0x7F800000), mantissa_bit13_u32x4));
+        // Patch lanes via relaxed_laneselect (1 instruction: vblendvps on x86, vs 3 for and/andn/or);
+        // a signed zero is just the sign bit on its own
+        result.v128 = wasm_i32x4_relaxed_laneselect(sign_bit31_u32x4, result.v128, signed_zero_mask);
+        result.v128 = wasm_i32x4_relaxed_laneselect(denormal_bits_u32x4, result.v128, denormal_mask);
+        result.v128 = wasm_i32x4_relaxed_laneselect(inf_nan_bits_u32x4, result.v128, exponent_is_max_mask);
+    }
+    return result;
+}
+/**
+ *  @brief E4M3→F32 for four lanes: 4-bit exponent (bias=7→127, delta=120), 3-bit mantissa (shift by 20).
+ *  Subnormal via FPU: mant * (1/512) = mant * 2^-9.  NaN only at exp=15,mant=7.
+ *
+ *  @param[in] e4m3_vec Four packed E4M3 bytes, read through its `u32` member.
+ *  @return Four 32-bit lanes with the resulting F32 bit patterns, in the `v128` member.
+ *
+ *  When no lane has exp=0 and no lane is NaN, the normal-path bits are returned right away.
+ *  The f32x4.convert_u32x4 + multiply needed for subnormals is written after that early exit,
+ *  as in `nk_f16x4_to_f32x4_v128relaxed_`, so the fast path does not depend on the optimizer
+ *  sinking it.
+ */
+NK_INTERNAL nk_b128_vec_t nk_e4m3x4_to_f32x4_v128relaxed_(nk_b32_vec_t e4m3_vec) {
+    v128_t e4m3_u32x4 = wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(wasm_v128_load32_zero(&e4m3_vec.u32)));
+    v128_t exp_u32x4 = wasm_v128_and(wasm_u32x4_shr(e4m3_u32x4, 3), wasm_i32x4_splat(0x0F)); // bits 6-3
+    v128_t mant_u32x4 = wasm_v128_and(e4m3_u32x4, wasm_i32x4_splat(0x07));                   // bits 2-0
+    v128_t sign_u32x4 = wasm_i32x4_shl(wasm_u32x4_shr(e4m3_u32x4, 7), 31);                   // bit 7 to F32 bit 31
+    // Normal path: rebias exponent, widen mantissa
+    v128_t f32_exp_u32x4 = wasm_i32x4_shl(wasm_i32x4_add(exp_u32x4, wasm_i32x4_splat(120)), 23);
+    v128_t f32_mant_u32x4 = wasm_i32x4_shl(mant_u32x4, 20);
+    v128_t normal_bits_u32x4 = wasm_v128_or(sign_u32x4, wasm_v128_or(f32_exp_u32x4, f32_mant_u32x4));
+    // Early exit: skip zero/subnormal/NaN handling when every lane is normal
+    v128_t exp_zero_mask = wasm_i32x4_eq(exp_u32x4, wasm_i32x4_splat(0));
+    v128_t is_nan_mask = wasm_v128_and(wasm_i32x4_eq(exp_u32x4, wasm_i32x4_splat(15)),
+                                       wasm_i32x4_eq(mant_u32x4, wasm_i32x4_splat(7)));
+    v128_t exceptional_mask = wasm_v128_or(exp_zero_mask, is_nan_mask);
+    if (!wasm_v128_any_true(exceptional_mask)) {
+        nk_b128_vec_t result;
+        result.v128 = normal_bits_u32x4;
+        return result;
+    }
+    // Slow path: exp=0 lanes (zero and subnormal alike) become sign | mant * 2^-9
+    v128_t subnorm_abs_f32x4 = wasm_f32x4_mul(wasm_f32x4_convert_u32x4(mant_u32x4), wasm_f32x4_splat(1.0f / 512.0f));
+    v128_t subnorm_f32x4 = wasm_v128_or(subnorm_abs_f32x4, sign_u32x4);
+    v128_t result_u32x4 = wasm_i32x4_relaxed_laneselect(subnorm_f32x4, normal_bits_u32x4, exp_zero_mask);
+    if (wasm_v128_any_true(is_nan_mask)) {
+        v128_t nan_bits = wasm_v128_or(sign_u32x4, wasm_i32x4_splat(0x7FC00000));
+        result_u32x4 = wasm_i32x4_relaxed_laneselect(nan_bits, result_u32x4, is_nan_mask);
+    }
+    nk_b128_vec_t result;
+    result.v128 = result_u32x4;
+    return result;
+}
+/**
+ *  @brief E5M2→F32 for four lanes: same exponent encoding as F16 (5-bit, bias=15, delta=112),
+ *  2-bit mantissa (shift by 21).
+ *  Subnormal via FPU: mant * (1/65536) = mant * 2^-16.  Inf at exp=31,mant=0; NaN otherwise.
+ *
+ *  @param[in] e5m2_vec Four packed E5M2 bytes, read through its `u32` member.
+ *  @return Four 32-bit lanes with the resulting F32 bit patterns, in the `v128` member.
+ *
+ *  When no lane has exp=0 or exp=31, the normal-path bits are returned right away.
+ *  The f32x4.convert_u32x4 + multiply needed for subnormals is written after that early exit,
+ *  as in `nk_f16x4_to_f32x4_v128relaxed_`, so the fast path does not depend on the optimizer
+ *  sinking it.
+ */
+NK_INTERNAL nk_b128_vec_t nk_e5m2x4_to_f32x4_v128relaxed_(nk_b32_vec_t e5m2_vec) {
+    v128_t e5m2_u32x4 = wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(wasm_v128_load32_zero(&e5m2_vec.u32)));
+    v128_t exp_u32x4 = wasm_v128_and(wasm_u32x4_shr(e5m2_u32x4, 2), wasm_i32x4_splat(0x1F)); // bits 6-2
+    v128_t mant_u32x4 = wasm_v128_and(e5m2_u32x4, wasm_i32x4_splat(0x03));                   // bits 1-0
+    v128_t sign_u32x4 = wasm_i32x4_shl(wasm_u32x4_shr(e5m2_u32x4, 7), 31);                   // bit 7 to F32 bit 31
+    // Normal path: rebias exponent, widen mantissa
+    v128_t f32_exp_u32x4 = wasm_i32x4_shl(wasm_i32x4_add(exp_u32x4, wasm_i32x4_splat(112)), 23);
+    v128_t f32_mant_u32x4 = wasm_i32x4_shl(mant_u32x4, 21);
+    v128_t normal_bits_u32x4 = wasm_v128_or(sign_u32x4, wasm_v128_or(f32_exp_u32x4, f32_mant_u32x4));
+    // Early exit: skip zero/subnormal/inf/NaN handling when every lane is normal
+    v128_t exp_zero_mask = wasm_i32x4_eq(exp_u32x4, wasm_i32x4_splat(0));
+    v128_t exp_max_mask = wasm_i32x4_eq(exp_u32x4, wasm_i32x4_splat(31));
+    v128_t exceptional_mask = wasm_v128_or(exp_zero_mask, exp_max_mask);
+    if (!wasm_v128_any_true(exceptional_mask)) {
+        nk_b128_vec_t result;
+        result.v128 = normal_bits_u32x4;
+        return result;
+    }
+    // Slow path: exp=0 lanes (zero and subnormal alike) become sign | mant * 2^-16
+    v128_t subnorm_abs_f32x4 = wasm_f32x4_mul(wasm_f32x4_convert_u32x4(mant_u32x4), wasm_f32x4_splat(1.0f / 65536.0f));
+    v128_t subnorm_f32x4 = wasm_v128_or(subnorm_abs_f32x4, sign_u32x4);
+    v128_t result_u32x4 = wasm_i32x4_relaxed_laneselect(subnorm_f32x4, normal_bits_u32x4, exp_zero_mask);
+    // exp=31 lanes: infinity when the mantissa is zero, quiet NaN otherwise
+    v128_t mant_zero_mask = wasm_i32x4_eq(mant_u32x4, wasm_i32x4_splat(0));
+    v128_t inf_bits_u32x4 = wasm_v128_or(sign_u32x4, wasm_i32x4_splat(0x7F800000));
+    v128_t nan_bits_u32x4 = wasm_v128_or(sign_u32x4, wasm_i32x4_splat(0x7FC00000));
+    v128_t special_bits_u32x4 = wasm_i32x4_relaxed_laneselect(inf_bits_u32x4, nan_bits_u32x4, mant_zero_mask);
+    result_u32x4 = wasm_i32x4_relaxed_laneselect(special_bits_u32x4, result_u32x4, exp_max_mask);
+    nk_b128_vec_t result;
+    result.v128 = result_u32x4;
+    return result;
+}
+#if defined(__clang__)
+#pragma clang attribute pop
+#endif
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+#endif // NK_TARGET_V128RELAXED
+#endif // NK_CAST_V128RELAXED_H

package/include/numkong/cast.h ADDED Viewed

@@ -0,0 +1,230 @@
+/**
+ *  @brief SIMD-accelerated Type Conversions.
+ *  @file include/numkong/cast.h
+ *  @author Ash Vardanian
+ *  @date January 2, 2026
+ *
+ *  This file focuses on numeric types not uniformly supported across platforms, prioritizing:
+ *
+ *  - `e5m2` & `e4m3` ↔ `f16` & `bf16` - used for low-precision dot-products on modern CPUs,
+ *  - `e5m2` & `e4m3` ↔ `f32` - used for low-precision dot-products on older CPUs,
+ *  - `f16` & `bf16` ↔ `f32` - often used for half-precision dot-products on older CPUs,
+ *
+ *  Unlike most operation classes in NumKong, these are dependent on two input types: "from" & "to".
+ *  It contains scalar helpers named like `nk_f16_to_f32_serial_` as well as buffer-to-buffer
+ *  `memcpy`-like vectorized operations, such as `nk_cast_f16_to_f32` with `nk_cast_f16_to_f32_serial`,
+ *  `nk_cast_f16_to_f32_neon`, `nk_cast_f16_to_f32_skylake`, and other platform-specific variants.
+ *
+ *  It also includes "partial load" and "partial store" type-punned helper functions for handling
+ *  IO between memory and registers, that are extensively reused in reductions, elementwise ops, and
+ *  dot-products.
+ *
+ *  Float-format narrowing uses round-to-nearest, ties-to-even. Float-to-integer narrowing follows
+ *  the same tie rule, saturates infinities, and maps NaNs to zero.
+ *
+ *  Given the overall breadth and sparsity of our type system, it's clear that not all type conversions
+ *  have equivalent relevance. With the 21 numeric types below we'd be looking at 21x21=441 conversions for:
+ *
+ *              e4m3    e5m2    bf16    f16     f32     f64
+ *                              bf16c   f16c    f32c    f64c
+ *              i4      i8              i16     i32     i64
+ *      u1      u4      u8              u16     u32     u64
+ *
+ *  To simplify the design and make it more broadly applicable in AI workloads, we implement a slower
+ *  @b "hub-and-spoke" design, guiding most conversions through an intermediate type, like `f64` or `i64`.
+ *
+ */
+#ifndef NK_CAST_H
+#define NK_CAST_H
+#include "numkong/types.h"
+#if defined(__cplusplus)
+extern "C" {
+#endif
+/**
+ *  @brief Elementwise type-casting for arrays of entries.
+ *
+ *  @param[in] from The immutable input source array containing `n` elements of `from_type` type.
+ *  @param[in] from_type The type of elements in the immutable source array.
+ *  @param[in] n The number of elements in both input and output arrays.
+ *  @param[out] to The mutable output array containing `n` elements of `to_type` type.
+ *  @param[in] to_type The type of elements in the mutable target array.
+ */
+NK_DYNAMIC void nk_cast(void const *from, nk_dtype_t from_type, nk_size_t n, void *to, nk_dtype_t to_type);
+/** @copydoc nk_cast */
+NK_PUBLIC void nk_cast_serial(void const *from, nk_dtype_t from_type, nk_size_t n, void *to, nk_dtype_t to_type);
+/** @brief Scalar conversion from f16 to f32. */
+NK_DYNAMIC void nk_f16_to_f32(nk_f16_t const *src, nk_f32_t *dest);
+/** @brief Scalar conversion from bf16 to f32. */
+NK_DYNAMIC void nk_bf16_to_f32(nk_bf16_t const *src, nk_f32_t *dest);
+/** @brief Scalar conversion from e4m3 to f32. */
+NK_DYNAMIC void nk_e4m3_to_f32(nk_e4m3_t const *src, nk_f32_t *dest);
+/** @brief Scalar conversion from e5m2 to f32. */
+NK_DYNAMIC void nk_e5m2_to_f32(nk_e5m2_t const *src, nk_f32_t *dest);
+/** @brief Scalar conversion from e2m3 to f32. */
+NK_DYNAMIC void nk_e2m3_to_f32(nk_e2m3_t const *src, nk_f32_t *dest);
+/** @brief Scalar conversion from e3m2 to f32. */
+NK_DYNAMIC void nk_e3m2_to_f32(nk_e3m2_t const *src, nk_f32_t *dest);
+/** @brief Scalar conversion from f32 to f16. */
+NK_DYNAMIC void nk_f32_to_f16(nk_f32_t const *src, nk_f16_t *dest);
+/** @brief Scalar conversion from f32 to bf16. */
+NK_DYNAMIC void nk_f32_to_bf16(nk_f32_t const *src, nk_bf16_t *dest);
+/** @brief Scalar conversion from f32 to e4m3. */
+NK_DYNAMIC void nk_f32_to_e4m3(nk_f32_t const *src, nk_e4m3_t *dest);
+/** @brief Scalar conversion from f32 to e5m2. */
+NK_DYNAMIC void nk_f32_to_e5m2(nk_f32_t const *src, nk_e5m2_t *dest);
+/** @brief Scalar conversion from f32 to e2m3. */
+NK_DYNAMIC void nk_f32_to_e2m3(nk_f32_t const *src, nk_e2m3_t *dest);
+/** @brief Scalar conversion from f32 to e3m2. */
+NK_DYNAMIC void nk_f32_to_e3m2(nk_f32_t const *src, nk_e3m2_t *dest);
+/** @copydoc nk_f16_to_f32 */
+NK_PUBLIC void nk_f16_to_f32_serial(nk_f16_t const *src, nk_f32_t *dest);
+/** @copydoc nk_f32_to_f16 */
+NK_PUBLIC void nk_f32_to_f16_serial(nk_f32_t const *src, nk_f16_t *dest);
+/** @copydoc nk_bf16_to_f32 */
+NK_PUBLIC void nk_bf16_to_f32_serial(nk_bf16_t const *src, nk_f32_t *dest);
+/** @copydoc nk_f32_to_bf16 */
+NK_PUBLIC void nk_f32_to_bf16_serial(nk_f32_t const *src, nk_bf16_t *dest);
+/** @copydoc nk_e4m3_to_f32 */
+NK_PUBLIC void nk_e4m3_to_f32_serial(nk_e4m3_t const *src, nk_f32_t *dest);
+/** @copydoc nk_f32_to_e4m3 */
+NK_PUBLIC void nk_f32_to_e4m3_serial(nk_f32_t const *src, nk_e4m3_t *dest);
+/** @copydoc nk_e5m2_to_f32 */
+NK_PUBLIC void nk_e5m2_to_f32_serial(nk_e5m2_t const *src, nk_f32_t *dest);
+/** @copydoc nk_f32_to_e5m2 */
+NK_PUBLIC void nk_f32_to_e5m2_serial(nk_f32_t const *src, nk_e5m2_t *dest);
+/** @copydoc nk_e2m3_to_f32 */
+NK_PUBLIC void nk_e2m3_to_f32_serial(nk_e2m3_t const *src, nk_f32_t *dest);
+/** @copydoc nk_f32_to_e2m3 */
+NK_PUBLIC void nk_f32_to_e2m3_serial(nk_f32_t const *src, nk_e2m3_t *dest);
+/** @copydoc nk_e3m2_to_f32 */
+NK_PUBLIC void nk_e3m2_to_f32_serial(nk_e3m2_t const *src, nk_f32_t *dest);
+/** @copydoc nk_f32_to_e3m2 */
+NK_PUBLIC void nk_f32_to_e3m2_serial(nk_f32_t const *src, nk_e3m2_t *dest);
+#if NK_TARGET_NEON
+/** @copydoc nk_cast */
+NK_PUBLIC void nk_cast_neon(void const *from, nk_dtype_t from_type, nk_size_t n, void *to, nk_dtype_t to_type);
+/** @copydoc nk_f16_to_f32 */
+NK_PUBLIC void nk_f16_to_f32_neon(nk_f16_t const *src, nk_f32_t *dest);
+/** @copydoc nk_f32_to_f16 */
+NK_PUBLIC void nk_f32_to_f16_neon(nk_f32_t const *src, nk_f16_t *dest);
+#endif // NK_TARGET_NEON
+#if NK_TARGET_HASWELL
+/** @copydoc nk_cast */
+NK_PUBLIC void nk_cast_haswell(void const *from, nk_dtype_t from_type, nk_size_t n, void *to, nk_dtype_t to_type);
+/** @copydoc nk_f16_to_f32 */
+NK_PUBLIC void nk_f16_to_f32_haswell(nk_f16_t const *src, nk_f32_t *dest);
+/** @copydoc nk_f32_to_f16 */
+NK_PUBLIC void nk_f32_to_f16_haswell(nk_f32_t const *src, nk_f16_t *dest);
+#endif // NK_TARGET_HASWELL
+#if NK_TARGET_SKYLAKE
+/** @copydoc nk_cast */
+NK_PUBLIC void nk_cast_skylake(void const *from, nk_dtype_t from_type, nk_size_t n, void *to, nk_dtype_t to_type);
+#endif // NK_TARGET_SKYLAKE
+#if NK_TARGET_ICELAKE
+/** @copydoc nk_cast */
+NK_PUBLIC void nk_cast_icelake(void const *from, nk_dtype_t from_type, nk_size_t n, void *to, nk_dtype_t to_type);
+#endif // NK_TARGET_ICELAKE
+#if NK_TARGET_SAPPHIRE
+/** @copydoc nk_cast */
+NK_PUBLIC void nk_cast_sapphire(void const *from, nk_dtype_t from_type, nk_size_t n, void *to, nk_dtype_t to_type);
+/** @copydoc nk_f16_to_f32 */
+NK_PUBLIC void nk_f16_to_f32_sapphire(nk_f16_t const *src, nk_f32_t *dest);
+/** @copydoc nk_f32_to_f16 */
+NK_PUBLIC void nk_f32_to_f16_sapphire(nk_f32_t const *src, nk_f16_t *dest);
+#endif // NK_TARGET_SAPPHIRE
+#if NK_TARGET_RVV
+/** @copydoc nk_cast */
+NK_PUBLIC void nk_cast_rvv(void const *from, nk_dtype_t from_type, nk_size_t n, void *to, nk_dtype_t to_type);
+#endif // NK_TARGET_RVV
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+#include "numkong/cast/serial.h"
+#include "numkong/cast/neon.h"
+#include "numkong/cast/haswell.h"
+#include "numkong/cast/skylake.h"
+#include "numkong/cast/icelake.h"
+#include "numkong/cast/sapphire.h"
+#include "numkong/cast/rvv.h"
+#if defined(__cplusplus)
+extern "C" {
+#endif
+#if !NK_DYNAMIC_DISPATCH
+NK_PUBLIC void nk_cast(void const *from, nk_dtype_t from_type, nk_size_t n, void *to, nk_dtype_t to_type) { // Compile-time dispatch: exactly one call below survives preprocessing
+#if NK_TARGET_SAPPHIRE // Targets are tested in this order; the first enabled one is used
+    nk_cast_sapphire(from, from_type, n, to, to_type);
+#elif NK_TARGET_ICELAKE
+    nk_cast_icelake(from, from_type, n, to, to_type);
+#elif NK_TARGET_SKYLAKE
+    nk_cast_skylake(from, from_type, n, to, to_type);
+#elif NK_TARGET_HASWELL
+    nk_cast_haswell(from, from_type, n, to, to_type);
+#elif NK_TARGET_RVV
+    nk_cast_rvv(from, from_type, n, to, to_type);
+#elif NK_TARGET_NEON
+    nk_cast_neon(from, from_type, n, to, to_type);
+#else // None of the targets above is enabled: use the serial implementation
+    nk_cast_serial(from, from_type, n, to, to_type);
+#endif
+}
+NK_PUBLIC void nk_f16_to_f32(nk_f16_t const *src, nk_f32_t *dest) { // Compile-time dispatch for the scalar f16 to f32 conversion
+#if NK_TARGET_SAPPHIRE // Checked first, then Haswell, then NEON
+    nk_f16_to_f32_sapphire(src, dest);
+#elif NK_TARGET_HASWELL
+    nk_f16_to_f32_haswell(src, dest);
+#elif NK_TARGET_NEON
+    nk_f16_to_f32_neon(src, dest);
+#else // None of the three targets above is enabled: use the serial implementation
+    nk_f16_to_f32_serial(src, dest);
+#endif
+}
+NK_PUBLIC void nk_f32_to_f16(nk_f32_t const *src, nk_f16_t *dest) { // Compile-time dispatch for the scalar f32 to f16 conversion
+#if NK_TARGET_SAPPHIRE // Checked first, then Haswell, then NEON
+    nk_f32_to_f16_sapphire(src, dest);
+#elif NK_TARGET_HASWELL
+    nk_f32_to_f16_haswell(src, dest);
+#elif NK_TARGET_NEON
+    nk_f32_to_f16_neon(src, dest);
+#else // None of the three targets above is enabled: use the serial implementation
+    nk_f32_to_f16_serial(src, dest);
+#endif
+}
+/* The remaining scalar conversions have no target-specific branch in this static dispatch:
+ * each one forwards straight to its serial implementation. */
+NK_PUBLIC void nk_bf16_to_f32(nk_bf16_t const *src, nk_f32_t *dest) {
+    nk_bf16_to_f32_serial(src, dest);
+}
+NK_PUBLIC void nk_f32_to_bf16(nk_f32_t const *src, nk_bf16_t *dest) {
+    nk_f32_to_bf16_serial(src, dest);
+}
+NK_PUBLIC void nk_e4m3_to_f32(nk_e4m3_t const *src, nk_f32_t *dest) {
+    nk_e4m3_to_f32_serial(src, dest);
+}
+NK_PUBLIC void nk_f32_to_e4m3(nk_f32_t const *src, nk_e4m3_t *dest) {
+    nk_f32_to_e4m3_serial(src, dest);
+}
+NK_PUBLIC void nk_e5m2_to_f32(nk_e5m2_t const *src, nk_f32_t *dest) {
+    nk_e5m2_to_f32_serial(src, dest);
+}
+NK_PUBLIC void nk_f32_to_e5m2(nk_f32_t const *src, nk_e5m2_t *dest) {
+    nk_f32_to_e5m2_serial(src, dest);
+}
+NK_PUBLIC void nk_e2m3_to_f32(nk_e2m3_t const *src, nk_f32_t *dest) {
+    nk_e2m3_to_f32_serial(src, dest);
+}
+NK_PUBLIC void nk_f32_to_e2m3(nk_f32_t const *src, nk_e2m3_t *dest) {
+    nk_f32_to_e2m3_serial(src, dest);
+}
+NK_PUBLIC void nk_e3m2_to_f32(nk_e3m2_t const *src, nk_f32_t *dest) {
+    nk_e3m2_to_f32_serial(src, dest);
+}
+NK_PUBLIC void nk_f32_to_e3m2(nk_f32_t const *src, nk_e3m2_t *dest) {
+    nk_f32_to_e3m2_serial(src, dest);
+}
+#endif // !NK_DYNAMIC_DISPATCH
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+#endif // NK_CAST_H

package/include/numkong/curved/README.md ADDED Viewed

@@ -0,0 +1,223 @@
+# Curved Space Distances in NumKong
+NumKong implements distance functions for curved metric spaces: bilinear forms compute $a^T C b$ for an arbitrary metric tensor $C$, while Mahalanobis distance generalizes Euclidean distance to account for correlations between dimensions.
+Complex bilinear forms extend this to Hermitian inner products.
+These operations are central to Gaussian process inference, metric learning, and statistical distance measures.
+The bilinear form for real vectors is:
+```math
+\text{bilinear}(a, b, C) = a^T C b = \sum_{i=0}^{n-1} \sum_{j=0}^{n-1} a_i \cdot c_{ij} \cdot b_j
+```
+The Mahalanobis distance is:
+```math
+\text{mahalanobis}(a, b, C) = \sqrt{(a - b)^T C (a - b)}
+```
+For complex vectors, the bilinear form uses the conjugate transpose:
+```math
+\text{bilinear}(a, b, C) = a^H C b = \sum_{i=0}^{n-1} \sum_{j=0}^{n-1} \bar{a_i} \cdot c_{ij} \cdot b_j
+```
+Reformulating as Python pseudocode:
+```python
+import numpy as np
+def bilinear(a: np.ndarray, b: np.ndarray, C: np.ndarray) -> float:
+    return a @ C @ b
+def mahalanobis(a: np.ndarray, b: np.ndarray, C: np.ndarray) -> float:
+    diff = a - b
+    return np.sqrt(diff @ C @ diff)
+def bilinear_complex(a: np.ndarray, b: np.ndarray, C: np.ndarray) -> complex:
+    return np.conj(a) @ C @ b
+```
+## Input & Output Types
+Real bilinear and Mahalanobis:
+| Input Type | Output Type | Description                                    |
+| ---------- | ----------- | ---------------------------------------------- |
+| `f64`      | `f64`       | 64-bit IEEE 754 double precision               |
+| `f32`      | `f32`       | 32-bit IEEE 754 single precision               |
+| `f16`      | `f32`       | 16-bit IEEE 754 half precision, widened output |
+| `bf16`     | `f32`       | 16-bit brain float, widened output             |
+Complex bilinear:
+| Input Type | Output Type | Description                                |
+| ---------- | ----------- | ------------------------------------------ |
+| `f64c`     | `f64c`      | 64-bit complex pairs                       |
+| `f32c`     | `f32c`      | 32-bit complex pairs                       |
+| `f16c`     | `f32c`      | 16-bit complex pairs, widened output       |
+| `bf16c`    | `f32c`      | 16-bit brain complex pairs, widened output |
+## Optimizations
+### Row-Major Streaming with Nested Dot2
+`nk_bilinear_f64_skylake`, `nk_mahalanobis_f64_skylake` decompose the bilinear form $a^T C b$ as $\sum_i a_i \cdot \text{dot}(C_i, b)$ where $C_i$ is the $i$-th row of the metric tensor.
+Each inner dot product uses Dot2 compensation — TwoProd via FMA captures the rounding error of each $c_{ij} \cdot b_j$ product exactly, and a TwoSum chain propagates it through the accumulator.
+The outer sum over rows uses a second level of compensation, tracking the rounding error of each $a_i \cdot r_i$ accumulation.
+This nested structure gives cache-friendly sequential access to the $n \times n$ matrix $C$: each row of $n$ elements is streamed once and then discarded, so the whole matrix is read exactly once.
+`nk_bilinear_f32_neon`, `nk_bilinear_f32_skylake`, `nk_mahalanobis_f32_neon`, `nk_mahalanobis_f32_skylake` use the same row-major streaming pattern but accumulate in `f64` instead of Dot2, which provides sufficient precision for `f32` inputs.
+### SME Outer-Product Accumulation
+`nk_bilinear_f32_smef64`, `nk_bilinear_f64_smef64`, `nk_bilinear_f32c_smef64`, `nk_bilinear_f64c_smef64`, `nk_mahalanobis_f32_smef64`, `nk_mahalanobis_f64_smef64` use the Scalable Matrix Extension to compute the bilinear form as an outer-product accumulation.
+Each `FMOPA` instruction performs a rank-1 update $a_i \cdot b^T$ into the SME ZA tile array, and the matrix $C$ is streamed row-by-row and multiplied into the accumulator.
+This is fundamentally different from the row-major dot approach — it reformulates $a^T C b$ as a matrix-multiply problem where SME's 2D tile registers can exploit the matrix engine's throughput.
+For dimensions that align to the tile size, this approach achieves near-peak throughput; dimensions that do not align fall back to NEON for cleanup of the residual elements.
+### Complex Bilinear Decomposition
+`nk_bilinear_f32c_neon`, `nk_bilinear_f32c_skylake`, `nk_bilinear_f64c_skylake` compute $a^H C b$ where each element involves 4 real multiplications from the complex product $\bar{a_i} \cdot c_{ij} \cdot b_j$.
+The kernel decomposes this into real and imaginary dot products over rows of $C$: for each row $i$, it computes the real part as $a_{i,re} \cdot \text{dot}(C_i, b)_{re} + a_{i,im} \cdot \text{dot}(C_i, b)_{im}$ and the imaginary part with the conjugation baked in as sign flips.
+This fuses the conjugation of $a$ into the sign of the cross terms rather than explicitly negating the imaginary components, saving one negate operation per element.
+## Performance
+The following performance tables are produced by manually re-running the bundled internal tools `nk_test` and `nk_bench` to measure both accuracy and throughput at different input shapes.
+The input size is controlled by the `NK_CURVED_DIMENSIONS` environment variable.
+The metric tensor $C$ is a square matrix of side $N$, so each bilinear form $a^T C b$ has $O(N^2)$ arithmetic complexity.
+Columns show the matrix size as side length squared: 256², 1024², 4096² (that is, sides of 256, 1024, and 4096).
+The throughput is measured in GSO/s (Giga Scalar Operations per Second), written as `gso/s` in the tables.
+Accuracy is reported as mean ULP (units in last place) averaged over all test pairs — the average number of representable floating-point values between the computed result and the exact answer.
+Each kernel runs for at least 20 seconds per configuration.
+Benchmark threads are pinned to specific cores; on machines with heterogeneous core types (e.g., Apple P/E cores), only the fastest cores are used.
+Workloads that significantly degrade CPU frequencies (Intel AMX, Apple SME) run in separate passes to avoid affecting throughput measurements of other kernels.
+Rows marked `🧩` use external BLAS baselines rather than NumKong kernels.
+### Intel Sapphire Rapids
+#### Native
+| Kernel                        |                     256² |                    1024² |                    4096² |
+| :---------------------------- | -----------------------: | -----------------------: | -----------------------: |
+| __f64c__                      | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `bilinear_f64c_with_blas` 🧩   |               1.25 gso/s |               1.36 gso/s |               1.38 gso/s |
+| `nk_bilinear_f64c_serial`     |    0.0862 gso/s, 0.5 ulp |     0.161 gso/s, 0.2 ulp |     0.171 gso/s, 0.5 ulp |
+| `nk_bilinear_f64c_skylake`    |     0.583 gso/s, 3.5 ulp |     0.718 gso/s, 3.5 ulp |     0.765 gso/s, 3.5 ulp |
+| __f32c__                      | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `bilinear_f32c_with_blas` 🧩   |               2.14 gso/s |               2.61 gso/s |               2.57 gso/s |
+| `nk_bilinear_f32c_serial`     |       0.756 gso/s, 0 ulp |        1.37 gso/s, 0 ulp |        1.37 gso/s, 0 ulp |
+| `nk_bilinear_f32c_skylake`    |        1.72 gso/s, 0 ulp |        1.75 gso/s, 0 ulp |        1.46 gso/s, 0 ulp |
+| __bf16c__                     | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_bf16c_serial`    |       0.154 gso/s, 5 ulp |     0.158 gso/s, 5.8 ulp |       0.155 gso/s, 5 ulp |
+| `nk_bilinear_bf16c_genoa`     |        2.81 gso/s, 5 ulp |        4.57 gso/s, 5 ulp |        4.47 gso/s, 5 ulp |
+| __f16c__                      | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_f16c_serial`     |     0.585 gso/s, 7.2 ulp |     0.592 gso/s, 7.2 ulp |     0.600 gso/s, 7.2 ulp |
+| __f64__                       | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `bilinear_f64_with_blas` 🧩    |               2.84 gso/s |               3.23 gso/s |               3.14 gso/s |
+| `nk_bilinear_f64_serial`      |     0.291 gso/s, 0.7 ulp |     0.565 gso/s, 0.4 ulp |     0.577 gso/s, 0.7 ulp |
+| `nk_mahalanobis_f64_serial`   |       0.267 gso/s, 0 ulp |       0.537 gso/s, 0 ulp |       0.539 gso/s, 0 ulp |
+| `nk_bilinear_f64_skylake`     |      1.79 gso/s, 1.6 ulp |      1.71 gso/s, 1.3 ulp |        1.59 gso/s, 1 ulp |
+| `nk_mahalanobis_f64_skylake`  |        1.77 gso/s, 0 ulp |        1.82 gso/s, 0 ulp |      2.12 gso/s, 0.2 ulp |
+| __f32__                       | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `bilinear_f32_with_blas` 🧩    |               4.09 gso/s |               5.61 gso/s |               6.59 gso/s |
+| `nk_bilinear_f32_serial`      |        1.19 gso/s, 0 ulp |        2.71 gso/s, 0 ulp |        2.68 gso/s, 0 ulp |
+| `nk_mahalanobis_f32_serial`   |        2.36 gso/s, 0 ulp |        2.53 gso/s, 0 ulp |        2.40 gso/s, 0 ulp |
+| `nk_bilinear_f32_haswell`     |        3.45 gso/s, 0 ulp |        3.66 gso/s, 0 ulp |        3.24 gso/s, 0 ulp |
+| `nk_mahalanobis_f32_haswell`  |        3.37 gso/s, 0 ulp |        3.28 gso/s, 0 ulp |        3.30 gso/s, 0 ulp |
+| `nk_bilinear_f32_skylake`     |        3.68 gso/s, 0 ulp |        3.08 gso/s, 0 ulp |        2.71 gso/s, 0 ulp |
+| `nk_mahalanobis_f32_skylake`  |        3.45 gso/s, 0 ulp |        2.94 gso/s, 0 ulp |        3.32 gso/s, 0 ulp |
+| __bf16__                      | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_bf16_serial`     |      0.321 gso/s, 16 ulp |      0.331 gso/s, 13 ulp |      0.314 gso/s, 12 ulp |
+| `nk_mahalanobis_bf16_serial`  |     0.216 gso/s, 2.2 ulp |     0.215 gso/s, 2.1 ulp |     0.211 gso/s, 2.3 ulp |
+| `nk_bilinear_bf16_haswell`    |       6.75 gso/s, 11 ulp |       7.04 gso/s, 13 ulp |       6.80 gso/s, 13 ulp |
+| `nk_mahalanobis_bf16_haswell` |        5.93 gso/s, 1 ulp |        5.77 gso/s, 1 ulp |        5.86 gso/s, 1 ulp |
+| `nk_bilinear_bf16_genoa`      |       6.22 gso/s, 18 ulp |       10.9 gso/s, 18 ulp |       10.3 gso/s, 18 ulp |
+| `nk_mahalanobis_bf16_genoa`   |    7.04 gso/s, 8.55K ulp |    8.76 gso/s, 8.41K ulp |    8.57 gso/s, 8.41K ulp |
+| __f16__                       | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_f16_serial`      |      0.654 gso/s, 23 ulp |      0.652 gso/s, 23 ulp |      0.657 gso/s, 23 ulp |
+| `nk_mahalanobis_f16_serial`   |     0.510 gso/s, 2.7 ulp |     0.520 gso/s, 3.2 ulp |     0.500 gso/s, 2.7 ulp |
+| `nk_bilinear_f16_haswell`     |       7.36 gso/s, 37 ulp |       7.30 gso/s, 37 ulp |       7.29 gso/s, 37 ulp |
+| `nk_mahalanobis_f16_haswell`  |        6.75 gso/s, 1 ulp |        6.24 gso/s, 1 ulp |        6.83 gso/s, 1 ulp |
+#### WASM
+Measured with Wasmtime v42 (Cranelift backend).
+| Kernel                     |                     256² |                    1024² |                    4096² |
+| :------------------------- | -----------------------: | -----------------------: | -----------------------: |
+| __f64c__                   | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_f64c_serial`  |      0.21 gso/s, 1.2 ulp |      0.21 gso/s, 1.2 ulp |      0.21 gso/s, 1.2 ulp |
+| __f32c__                   | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_f32c_serial`  |        1.10 gso/s, 0 ulp |        1.07 gso/s, 0 ulp |        1.10 gso/s, 0 ulp |
+| __bf16c__                  | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_bf16c_serial` |      1.26 gso/s, 9.8 ulp |      1.31 gso/s, 9.8 ulp |      1.27 gso/s, 9.5 ulp |
+| __f16c__                   | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_f16c_serial`  |       0.40 gso/s, 39 ulp |       0.38 gso/s, 39 ulp |       0.40 gso/s, 39 ulp |
+| __f64__                    | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_f64_serial`   |      0.49 gso/s, 0.6 ulp |      0.49 gso/s, 0.6 ulp |      0.48 gso/s, 0.6 ulp |
+| __f32__                    | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_f32_serial`   |        2.54 gso/s, 0 ulp |        2.62 gso/s, 0 ulp |        2.53 gso/s, 0 ulp |
+| __bf16__                   | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_bf16_serial`  |       2.91 gso/s, 27 ulp |       2.90 gso/s, 22 ulp |       2.98 gso/s, 22 ulp |
+| __f16__                    | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_f16_serial`   |       0.76 gso/s, 74 ulp |       0.76 gso/s, 74 ulp |       0.78 gso/s, 74 ulp |
+### Apple M4
+#### Native
+| Kernel                          |                     256² |                    1024² |                    4096² |
+| :------------------------------ | -----------------------: | -----------------------: | -----------------------: |
+| __f64c__                        | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_f64c_serial`       |     0.368 gso/s, 2.2 ulp |     0.371 gso/s, 2.2 ulp |     0.367 gso/s, 2.2 ulp |
+| __f32c__                        | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_f32c_serial`       |        2.33 gso/s, 0 ulp |        2.27 gso/s, 0 ulp |        2.28 gso/s, 0 ulp |
+| `nk_bilinear_f32c_neon`         |        2.11 gso/s, 0 ulp |        1.89 gso/s, 0 ulp |        1.85 gso/s, 0 ulp |
+| __bf16c__                       | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_bf16c_serial`      |     2.83 gso/s, 33.0 ulp |     2.54 gso/s, 34.5 ulp |     2.49 gso/s, 34.5 ulp |
+| `nk_bilinear_bf16c_neonbfdot`   |     5.05 gso/s, 17.0 ulp |     4.20 gso/s, 17.0 ulp |     4.04 gso/s, 17.0 ulp |
+| __f16c__                        | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_f16c_serial`       |     2.81 gso/s, 51.8 ulp |     2.54 gso/s, 51.8 ulp |     2.48 gso/s, 51.8 ulp |
+| `nk_bilinear_f16c_neonhalf`     |     5.00 gso/s, 17.3 ulp |     4.16 gso/s, 17.3 ulp |     4.00 gso/s, 16.4 ulp |
+| __f64__                         | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_f64_serial`        |     0.717 gso/s, 0.4 ulp |     0.711 gso/s, 0.4 ulp |     0.721 gso/s, 0.4 ulp |
+| `nk_mahalanobis_f64_serial`     |     0.664 gso/s, 0.5 ulp |     0.667 gso/s, 0.5 ulp |     0.672 gso/s, 0.5 ulp |
+| __f32__                         | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_f32_serial`        |        3.92 gso/s, 0 ulp |        3.05 gso/s, 0 ulp |        2.87 gso/s, 0 ulp |
+| `nk_mahalanobis_f32_serial`     |        3.42 gso/s, 0 ulp |        2.88 gso/s, 0 ulp |        2.74 gso/s, 0 ulp |
+| `nk_bilinear_f32_neon`          |        4.90 gso/s, 0 ulp |        3.82 gso/s, 0 ulp |        3.49 gso/s, 0 ulp |
+| `nk_mahalanobis_f32_neon`       |        4.68 gso/s, 0 ulp |        3.71 gso/s, 0 ulp |        3.48 gso/s, 0 ulp |
+| __bf16__                        | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_bf16_serial`       |     4.17 gso/s, 20.7 ulp |     3.19 gso/s, 21.2 ulp |     2.94 gso/s, 20.7 ulp |
+| `nk_mahalanobis_bf16_serial`    |      3.86 gso/s, 2.1 ulp |      2.98 gso/s, 2.2 ulp |      2.79 gso/s, 2.1 ulp |
+| `nk_bilinear_bf16_neonbfdot`    |     28.0 gso/s, 28.0 ulp |     23.5 gso/s, 41.2 ulp |     20.4 gso/s, 41.1 ulp |
+| `nk_mahalanobis_bf16_neonbfdot` |      9.14 gso/s, 2.2 ulp |      7.93 gso/s, 2.2 ulp |      7.43 gso/s, 2.2 ulp |
+| __f16__                         | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_f16_serial`        |           ? gso/s, ? ulp |           ? gso/s, ? ulp |           ? gso/s, ? ulp |
+| `nk_mahalanobis_f16_serial`     |           ? gso/s, ? ulp |           ? gso/s, ? ulp |           ? gso/s, ? ulp |
+| `nk_bilinear_f16_neonhalf`      |           ? gso/s, ? ulp |           ? gso/s, ? ulp |           ? gso/s, ? ulp |
+| `nk_mahalanobis_f16_neonhalf`   |           ? gso/s, ? ulp |           ? gso/s, ? ulp |           ? gso/s, ? ulp |
+#### WASM
+Measured with Wasmtime v42 (Cranelift backend).
+| Kernel                     |                     256² |                    1024² |                    4096² |
+| :------------------------- | -----------------------: | -----------------------: | -----------------------: |
+| __f64c__                   | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_f64c_serial`  |           ? gso/s, ? ulp |           ? gso/s, ? ulp |           ? gso/s, ? ulp |
+| __f32c__                   | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_f32c_serial`  |           ? gso/s, ? ulp |           ? gso/s, ? ulp |           ? gso/s, ? ulp |
+| __bf16c__                  | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_bf16c_serial` |           ? gso/s, ? ulp |           ? gso/s, ? ulp |           ? gso/s, ? ulp |
+| __f16c__                   | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_f16c_serial`  |           ? gso/s, ? ulp |           ? gso/s, ? ulp |           ? gso/s, ? ulp |
+| __f64__                    | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_f64_serial`   |           ? gso/s, ? ulp |           ? gso/s, ? ulp |           ? gso/s, ? ulp |
+| __f32__                    | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_f32_serial`   |           ? gso/s, ? ulp |           ? gso/s, ? ulp |           ? gso/s, ? ulp |
+| __bf16__                   | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_bf16_serial`  |           ? gso/s, ? ulp |           ? gso/s, ? ulp |           ? gso/s, ? ulp |
+| __f16__                    | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
+| `nk_bilinear_f16_serial`   |           ? gso/s, ? ulp |           ? gso/s, ? ulp |           ? gso/s, ? ulp |