numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315) hide show
  1. package/README.md +197 -124
  2. package/binding.gyp +34 -484
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
package/README.md CHANGED
@@ -1,20 +1,19 @@
1
1
  # NumKong: Mixed Precision for All
2
2
 
3
- NumKong (previously SimSIMD) delivers mixed-precision numerics that are often faster _and_ more accurate than standard BLAS libraries in a 5 MB binary, across C, C++, Rust, Python, Go, JavaScript, and Swift.
4
- Over 1500 hand-tuned SIMD kernels for x86, Arm, RISC-V, and WASM power [Unum](https://www.unum.cloud/)'s open-source [USearch](https://github.com/unum-cloud/usearch) search engine and the DBMS & AI products built on it.
3
+ Portable mixed-precision math, linear-algebra, & retrieval library with 2'000+ SIMD kernels for x86, Arm, RISC-V, LoongArch, Power, & WebAssembly, leveraging rare algebraic transforms with both 1D & 2D registers like AMX & SME, covering 15+ numeric types from 4-bit integers & 6-bit floats to 128-bit complex numbers, validated against 118-bit extended-precision baselines with saturation, casting, & rounding edge-case coverage, in a 5-100x smaller binary than other BLAS-like alternatives, co-designed with Tensor abstractions in C++, Python, Rust, JavaScript, GoLang, & Swift.
5
4
 
6
5
  ![NumKong banner](https://github.com/ashvardanian/ashvardanian/blob/master/repositories/NumKong-v7.png?raw=true)
7
6
 
8
- ## Latency, Throughput, & Numerical Stability Together in a Tiny Package
7
+ ## Latency, Throughput, & Numerical Stability
9
8
 
10
9
  Most libraries return dot products in the __same type as the input__ — Float16 × Float16 → Float16, Int8 × Int8 → Int8.
11
- That's a recipe for silent data corruption: a 2048-dimensional `i8` dot product can reach ±10 million, but `i8` maxes out at 127.
12
- NumKong promotes to wider accumulators — Float16 → Float32, BFloat16 → Float32, Int8 → Int32, Float32 → Float64 — so results never overflow, and it's still faster.
10
+ This leads to quiet overflow: a 2048-dimensional `i8` dot product can reach ±10 million, but `i8` maxes out at 127.
11
+ NumKong promotes to wider accumulators — Float16 → Float32, BFloat16 → Float32, Int8 → Int32, Float32 → Float64 — so results stay in range.
13
12
 
14
- > Single 2048-d dot product on Intel Sapphire Rapids (Xeon 8468), single-threaded, CPU-only packages.
13
+ > Single 2048-d dot product on Intel [Sapphire Rapids](https://en.wikipedia.org/wiki/Sapphire_Rapids), single-threaded.
15
14
  > Each cell shows __gso/s, mean relative error__ vs a higher-precision reference.
16
15
  > gso/s = Giga Scalar Operations per Second — a more suitable name than GFLOP/s when counting both integer and floating-point work.
17
- > Median of 5 runs × 500 K calls each. NumPy 2.4, PyTorch 2.10, JAX 0.9.
16
+ > NumPy 2.4, PyTorch 2.10, JAX 0.9.
18
17
 
19
18
  | Input | NumPy + OpenBLAS | PyTorch + MKL | JAX | NumKong |
20
19
  | :----- | ----------------------: | ----------------------: | ----------------------: | --------------------: |
@@ -27,12 +26,12 @@ NumKong promotes to wider accumulators — Float16 → Float32, BFloat16 → Flo
27
26
  | `i8` | 1.1 gso/s, __overflow__ | 0.5 gso/s, __overflow__ | 0.5 gso/s, __overflow__ | 14.8 gso/s, 0% err |
28
27
 
29
28
  A fair objection: PyTorch and JAX are designed for throughput, not single-call latency.
30
- They lower execution graphs through XLA or vendored BLAS libraries like Intel MKL and Nvidia cuBLAS.
29
+ They lower execution graphs through [XLA](https://openxla.org/) or vendored BLAS libraries like [Intel MKL](https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl.html) and Nvidia [cuBLAS](https://developer.nvidia.com/cublas).
31
30
  So here's the same comparison on a throughput-oriented workload — matrix multiplication:
32
31
 
33
- > Matrix multiplication (2048 × 2048) × (2048 × 2048), single-threaded, same machine.
34
- > JAX/XLA numbers divided by 16 cores (XLA ignores thread restrictions).
35
- > NumKong uses `dots_packed` (pre-packed GEMM). Same format: __gso/s, mean relative error__.
32
+ > Matrix multiplication (2048 × 2048) × (2048 × 2048) on Intel Sapphire Rapids, single-threaded.
33
+ > gso/s = Giga Scalar Operations per Second, same format.
34
+ > NumPy 2.4, PyTorch 2.10, JAX 0.9, same versions.
36
35
 
37
36
  | Input | NumPy + OpenBLAS | PyTorch + MKL | JAX | NumKong |
38
37
  | :----- | ----------------------: | -----------------------: | -----------------------: | -------------------: |
@@ -44,73 +43,87 @@ So here's the same comparison on a throughput-oriented workload — matrix multi
44
43
  | `e5m2` | — | 0.4 gso/s, 4.6% err | ~26.4 gso/s, 4.6% err | 398 gso/s, 0% err |
45
44
  | `i8` | 0.4 gso/s, __overflow__ | 50.0 gso/s, __overflow__ | ~0.0 gso/s, __overflow__ | 1279 gso/s, 0% err |
46
45
 
47
- For `f64`, NumKong's compensated "Dot2" summation is __10–50× more accurate__ than naive Float64 accumulation, depending on vector length.
48
- For `f32`, widening to Float64 gives __5–10× lower error__.
49
- For smaller types and especially integers, the gap is even more dramatic.
50
- And all of that fits into one of the smallest binaries in the industry:
51
-
52
- | Package | Size | Parallelism & Memory | Available For |
53
- | :--------------------- | -----: | :-------------------------------------------------- | :---------------------------------- |
54
- | PyTorch + MKL + oneDNN | 705 MB | Vector & Tile SIMD, OpenMP Threads, Internal Allocs | Python, C++, Java |
55
- | JAX + jaxlib | 357 MB | Vector SIMD, XLA Threads, Internal Allocs | Python |
56
- | NumPy + OpenBLAS | 30 MB | Vector SIMD, Built-in Threads, Internal Allocs | Python |
57
- | mathjs | 9 MB | No SIMD, No Threads, Countless Allocs | JS |
58
- | NumKong | 5 MB | Vector & Tile SIMD, Your Threads, Your Allocs | C, C++, Rust, Python, Go, JS, Swift |
59
-
60
- But kernels and precision are only part of the story — the larger investment is test coverage: every kernel is validated against 118-bit extended-precision baselines with per-type ULP budgets across log-normal, uniform, and Cauchy input distributions, enforcing triangle inequality, Cauchy-Schwarz bounds, NaN propagation, overflow detection, and probability-simplex constraints for every ISA variant in the table above, cross-validated against OpenBLAS, Intel MKL, and Apple Accelerate to catch regressions that no single reference can.
46
+ For `f64`, compensated "Dot2" summation reduces error by 10–50× compared to naive Float64 accumulation, depending on vector length.
47
+ For `f32`, widening to Float64 gives 5–10× lower error.
48
+ The library ships as a relatively small binary:
49
+
50
+ | Package | Size | Parallelism & Memory | Available For |
51
+ | :--------------- | -----: | :------------------------------------------------ | :---------------- |
52
+ | PyTorch + MKL | 705 MB | Vector & Tile SIMD, OpenMP Threads, Hidden Allocs | Python, C++, Java |
53
+ | JAX + jaxlib | 357 MB | Vector SIMD, XLA Threads, Hidden Allocs | Python |
54
+ | NumPy + OpenBLAS | 30 MB | Vector SIMD, Built-in Threads, Hidden Allocs | Python |
55
+ | mathjs | 9 MB | No SIMD, No Threads, Many Allocs | JS |
56
+ | NumKong | 5 MB | Vector & Tile SIMD, Your Threads, Your Allocs | 7 languages |
57
+
58
+ Every kernel is validated against 118-bit extended-precision baselines with per-type ULP budgets across log-normal, uniform, and Cauchy input distributions.
59
+ Tests check triangle inequality, Cauchy-Schwarz bounds, NaN propagation, overflow detection, and probability-simplex constraints for each ISA variant.
60
+ Results are cross-validated against OpenBLAS, Intel MKL, and Apple Accelerate.
61
61
  A broader throughput comparison is maintained in [NumWars](https://github.com/ashvardanian/NumWars).
62
62
 
63
63
  ## Quick Start
64
64
 
65
- | Language | Install | Compatible with | Guide |
66
- | :--------- | :------------------------------- | :------------------------------- | :------------------------------------------- |
67
- | C / C++ | CMake, headers, or prebuilt | Linux, macOS, Windows, Android | [include/README.md](include/README.md) |
68
- | Python | `pip install` | Linux, macOS, Windows | [python/README.md](python/README.md) |
69
- | Rust | `cargo add` | Linux, macOS, Windows | [rust/README.md](rust/README.md) |
70
- | JavaScript | `npm install` or `import` remote | Node.js, Bun, Deno & any browser | [javascript/README.md](javascript/README.md) |
71
- | Swift | Swift Package Manager | macOS, iOS, tvOS, watchOS | [swift/README.md](swift/README.md) |
72
- | Go | `go get` | Linux, macOS, Windows via cGo | [golang/README.md](golang/README.md) |
65
+ | Language | Install | Compatible with | Guide |
66
+ | :------- | :------------------------- | :----------------------------- | :------------------------------------------- |
67
+ | C / C++ | CMake, headers, & prebuilt | Linux, macOS, Windows, Android | [include/README.md](include/README.md) |
68
+ | Python | `pip install` | Linux, macOS, Windows | [python/README.md](python/README.md) |
69
+ | Rust | `cargo add` | Linux, macOS, Windows | [rust/README.md](rust/README.md) |
70
+ | JS | `npm install` & `import` | Node.js, Bun, Deno & browsers | [javascript/README.md](javascript/README.md) |
71
+ | Swift | Swift Package Manager | macOS, iOS, tvOS, watchOS | [swift/README.md](swift/README.md) |
72
+ | Go | `go get` | Linux, macOS, Windows via cGo | [golang/README.md](golang/README.md) |
73
73
 
74
74
  ## What's Inside
75
75
 
76
- NumKong spans 16 numeric types — from exotic GPU-only 6-bit floats to 64-bit complex numbers — across dozens of operations and 30+ SIMD backends, with hardware-aware defaults: Arm prioritizes `f16`, x86 prioritizes `bf16`.
77
-
78
- <div align="center">
79
- <pre><code>
80
- ┌──────────────────────────────┬────────────────┬───────────────────────────┬────────────┐
81
- │ Operations │ Datatypes │ Backends │ Ecosystems
82
- ├──────────────────────────────┼────────────────┼───────────────────────────┼────────────┤
83
- Vector-Vector │ <a href="#numeric-types">Bits &amp; Ints</a> │ <a href="#compile-time-and-run-time-dispatch">x86</a> │ Core
84
- <a href="include/README.md#dot-products">dot</a> · <a href="include/README.md#dense-distances">angular</a> · <a href="include/README.md#dense-distances">euclidean</a> │ u1 · u4 · u8 │ Haswell · Alder Lake │ <a href="include/README.md#the-c-abi">C 99</a>
85
- hamming · kld · jsd · … │ i4 · i8 │ Sierra Forest · Skylake │ │
86
- │ │ │ Ice Lake · Genoa · Turin │ Primary │
87
- <a href="include/README.md#packed-matrix-kernels-for-gemm-like-workloads">Matrix-Matrix</a> │ <a href="#mini-floats-e4m3-e5m2-e3m2--e2m3">Mini-floats</a> │ Sapphire Rapids · │ <a href="include/README.md#the-c-layer">C++ 23</a> │
88
- <a href="include/README.md#packed-matrix-kernels-for-gemm-like-workloads">dots_packed</a> · <a href="include/README.md#symmetric-kernels-for-syrk-like-workloads">dots_symmetric</a> │ e2m3 · e3m2 │ Granite Rapids │ <a href="python/README.md">Python 3</a> │
89
- <a href="include/README.md#packed-matrix-kernels-for-gemm-like-workloads">euclideans_packed</a> · │ e4m3 · e5m2 │ │ <a href="rust/README.md">Rust</a>
90
- │ │ │ <a href="#compile-time-and-run-time-dispatch">Arm</a> │ │
91
- Quadratic │ <a href="#float16--bfloat16-half-precision">Half &amp; Classic</a> │ NEON · NEONHalf · NEONFhm │ Additional │
92
- <a href="include/README.md#curved-metrics">bilinear</a> · mahalanobis │ f16 · bf16 │ NEONBFDot · NEONSDot │ <a href="swift/README.md">Swift</a> · <a href="javascript/README.md">JS</a> │
93
- │ │ f32 · f64 │ SVE · SVEHalf · SVEBfDot │ <a href="golang/README.md">Go</a>
94
- <a href="include/README.md#geospatial-metrics">Geospatial</a> &amp; <a href="include/README.md#geometric-mesh-alignment">Geometric</a> │ │ SVESDot · SVE2 │ │
95
- haversine · vincenty │ <a href="#complex-types">Complex</a> │ SME · SMEF64 · SMEBI32 │ <a href="CONTRIBUTING.md">Tools</a>
96
- rmsd · kabsch · umeyama · … │ f16c · bf16c │ │ <a href="test/README.md">Tests</a> │
97
- │ │ f32c · f64c │ <a href="#compile-time-and-run-time-dispatch">RISC-V</a> │ <a href="bench/README.md">Benchmarks</a> │
98
- Bespoke │ │ RVV · RVVHalf │ <a href="https://github.com/ashvardanian/NumWars">NumWars</a> │
99
- <a href="include/numkong/each/README.md">fma</a> · blend · <a href="include/numkong/trigonometry/README.md">sin</a> · <a href="include/numkong/cast/README.md">cast</a> │ │ RVVBf16 · RVVBB │ │
100
- <a href="include/numkong/reduce/README.md">reduce_moments</a> · <a href="include/numkong/sparse/README.md">sparse_dot</a> │ │ │ │
101
- │ <a href="include/README.md#maxsim-and-late-interaction">maxsim</a> · intersect · … │ │ <a href="CONTRIBUTING.md#cross-compilation">WASM</a> │ │
102
- │ │ │ V128Relaxed │ │
103
- └──────────────────────────────┴────────────────┴───────────────────────────┴────────────┘
104
- </code></pre>
105
- </div>
106
-
107
- Not every combination is implemented — only the ones that unlock interesting new opportunities.
108
- The `icelake` level doesn't get a `dot_bf16` variant, for example, and falls through to `dot_bf16_skylake`.
109
- Every operation has a `serial` fallback, but even types no CPU supports today get optimized via lookup tables and bit-twiddling hacks rather than scalar loops.
76
+ NumKong covers 17 numeric types — from 6-bit floats to 64-bit complex numbers — across dozens of operations and 30+ SIMD backends, with hardware-aware defaults: Arm prioritizes `f16`, x86 prioritizes `bf16`.
77
+
78
+ ### Language Bindings
79
+
80
+ | Operation | [C and C++][c] | [Python][py] | [Rust][rs] | [JavaScript][js] | [Swift][swift] | [GoLang][go] |
81
+ | :-------------------------- | :------------: | :----------: | :--------: | :--------------: | :------------: | :----------: |
82
+ | __Vector Ops__ | | | | | | |
83
+ | [Dot] Product | ● | ● | ● | ● | ● | ● |
84
+ | [Spatial] Metric | ● | | ● | ● | ● | ● |
85
+ | [Set] Similarity | ● | ● | ● | ● | ● | ● |
86
+ | [Geo]spatial | ● | ● | ● | · | ● | ● |
87
+ | [Mesh] Alignment | ● | ● | ● | · | · | · |
88
+ | [Sparse] Products | ● | ● | ● | · | · | · |
89
+ | [Probability] Divergences | ● | ● | ● | ● | · | ● |
90
+ | [Curved] Spaces | ● | ● | ● | · | · | · |
91
+ | __Many-to-Many Vector Ops__ | | | | | | |
92
+ | "[Dots]" Products | ● | ● | | ● | ● | ● |
93
+ | "[Spatials]" Metrics | ● | ● | ● | ● | ● | ● |
94
+ | "[Sets]" Similarities | ● | ● | ● | · | ● | ● |
95
+ | [MaxSim] Scoring | ● | ● | ● | · | ● | ● |
96
+ | __Scalar Ops__ | | | | | | |
97
+ | [Cast] | ● | ● | ● | ● | · | · |
98
+ | [Reduce] | ● | ● | ● | · | · | · |
99
+ | [Each] | ● | ● | ● | · | · | · |
100
+ | [Trigonometry] | ● | ● | ● | · | · | · |
101
+
102
+ [Dot]: include/numkong/dot/README.md
103
+ [Dots]: include/numkong/dots/README.md
104
+ [Spatial]: include/numkong/spatial/README.md
105
+ [Spatials]: include/numkong/spatials/README.md
106
+ [Set]: include/numkong/set/README.md
107
+ [Sets]: include/numkong/sets/README.md
108
+ [Cast]: include/numkong/cast/README.md
109
+ [Reduce]: include/numkong/reduce/README.md
110
+ [Trigonometry]: include/numkong/trigonometry/README.md
111
+ [MaxSim]: include/numkong/maxsim/README.md
112
+ [Mesh]: include/numkong/mesh/README.md
113
+ [Each]: include/numkong/each/README.md
114
+ [Sparse]: include/numkong/sparse/README.md
115
+ [Probability]: include/numkong/probability/README.md
116
+ [Curved]: include/numkong/curved/README.md
117
+ [Geo]: include/numkong/geospatial/README.md
118
+ [c]: include/README.md
119
+ [py]: python/README.md
120
+ [js]: javascript/README.md
121
+ [rs]: rust/README.md
122
+ [swift]: swift/README.md
123
+ [go]: golang/README.md
110
124
 
111
- ## Design Decisions
112
125
 
113
- In general there are a few principles that NumKong follows:
126
+ ## Design Decisions
114
127
 
115
128
  - Avoid loop unrolling and scalar tails.
116
129
  - Don't manage threads and be compatible with any parallelism models.
@@ -140,17 +153,17 @@ float boring_dot_product_f32(float const *a, float const *b, size_t n) {
140
153
  }
141
154
  ```
142
155
 
143
- This kind of unrolling has been historically the most commonly requested optimization for NumKong, and it's intentionally avoided.
156
+ This kind of unrolling has been a common request for NumKong, but the library avoids it by design.
144
157
 
145
158
  __Modern CPUs already "unroll" in hardware.__
146
159
  Out-of-order engines with reorder buffers of 320–630 entries (Zen 4: 320, Golden Cove: 512, Apple Firestorm: ~630) can keep a dozen loop iterations in-flight simultaneously.
147
160
  The physical register file is much larger than the ISA-visible architectural registers — Skylake has ~180 physical integer registers behind 16 architectural GPRs, and ~168 physical vector registers behind 32 architectural ZMMs.
148
161
  The register renaming unit maps the same `zmm0` in iteration N and iteration N+1 to different physical registers, extracting cross-iteration parallelism automatically — exactly the benefit that source-level unrolling was historically supposed to provide.
149
162
 
150
- __Unrolling actively hurts at NumKong's scale.__
163
+ __Unrolling works against NumKong's goals.__
151
164
  Every unrolled copy is a distinct instruction in the binary.
152
165
  With 1,500+ kernel endpoints across 30+ backends, even 2x unrolling would inflate the `.text` section by megabytes — directly impacting install size for Python wheels, NPM packages, and Rust crates.
153
- Larger loop bodies also increase instruction-cache and micro-op-cache pressure; Agner Fog also recommends:
166
+ Larger loop bodies also increase instruction-cache and micro-op-cache pressure; [Agner Fog](https://www.agner.org/optimize/) recommends:
154
167
 
155
168
  > _"avoid loop unrolling where possible in order to economize the use of the micro-op cache"_.
156
169
 
@@ -162,9 +175,9 @@ The leftover elements after the last full SIMD chunk run through a scalar loop t
162
175
  NumKong often uses masked loads instead (`_mm512_maskz_loadu_ps` on AVX-512, predicated `svld1_f32` on SVE), processing every element through the same arithmetic path regardless of alignment.
163
176
  It's not strictly orthogonal to loop unrolling, but it enables a kernel layout where every element follows the same code path.
164
177
 
165
- __The real performance gap is elsewhere.__
178
+ __The gains come from elsewhere.__
166
179
  On Intel Sapphire Rapids, NumKong was benchmarked against auto-vectorized code compiled with GCC 12.
167
- GCC handles single-precision `float` competently, but struggles with `_Float16` and other mixed-precision paths:
180
+ GCC handles single-precision `float` well, but struggles with `_Float16` and other mixed-precision paths:
168
181
 
169
182
  | Kind | GCC 12 `f32` | GCC 12 `f16` | NumKong `f16` | `f16` improvement |
170
183
  | :------------------------ | -----------: | -----------: | ------------: | ----------------: |
@@ -173,20 +186,20 @@ GCC handles single-precision `float` competently, but struggles with `_Float16`
173
186
  | Euclidean Distance ² | 4,620 K/s | 147 K/s | 5,320 K/s | __36 x__ |
174
187
  | Jensen-Shannon Divergence | 1,180 K/s | 18 K/s | 2,140 K/s | __118 x__ |
175
188
 
176
- NumKong's `f16` kernels are faster than GCC's `f32` output — not because of unrolling, but because they use F16C conversion instructions, widening FMA pipelines, and compensated accumulation that no compiler will synthesize from a plain `for` loop.
177
- The same story repeats for `bf16`, `e4m3`, `i8`, and `i4`: these types require algorithmic transformations — lookup tables, algebraic domain shifts, asymmetric VNNI tricks — that live beyond the reach of auto-vectorization.
189
+ NumKong's `f16` kernels are faster than GCC's `f32` output — not because of unrolling, but because they use [F16C](https://en.wikipedia.org/wiki/F16C) conversion instructions, widening FMA pipelines, and compensated accumulation that compilers do not synthesize from a plain `for` loop.
190
+ The same story repeats for `bf16`, `e4m3`, `i8`, and `i4`: these types require algorithmic transformations — lookup tables, algebraic domain shifts, asymmetric [VNNI](https://en.wikipedia.org/wiki/AVX-512#VNNI) tricks — that live beyond the reach of auto-vectorization.
178
191
 
179
192
  ### Parallelism & Multi-Threading
180
193
 
181
194
  BLAS libraries traditionally manage their own thread pools.
182
- [OpenBLAS](https://github.com/OpenMathLib/OpenBLAS/blob/develop/USAGE.md) spawns threads controlled by `OPENBLAS_NUM_THREADS`, [Intel MKL](https://www.intel.com/content/www/us/en/docs/onemkl/developer-guide-linux/2025-1/techniques-to-set-the-number-of-threads.html) forks its own OpenMP runtime via `MKL_NUM_THREADS`, and [Apple Accelerate](https://developer.apple.com/documentation/accelerate/blas) delegates to GCD.
195
+ [OpenBLAS](https://github.com/OpenMathLib/OpenBLAS/blob/develop/USAGE.md) spawns threads controlled by `OPENBLAS_NUM_THREADS`, [Intel MKL](https://www.intel.com/content/www/us/en/docs/onemkl/developer-guide-linux/2025-1/techniques-to-set-the-number-of-threads.html) forks its own OpenMP runtime via `MKL_NUM_THREADS`, and [Apple Accelerate](https://developer.apple.com/documentation/accelerate/blas) delegates to [GCD](https://developer.apple.com/documentation/dispatch) (Grand Central Dispatch).
183
196
  This works in isolation — but the moment your application adds its own parallelism (joblib, std::thread, Tokio, GCD, OpenMP), you get __thread oversubscription__: MKL spawns 8 threads inside each of your 8 joblib workers, producing 64 threads on 8 cores, thrashing caches and stalling on context switches.
184
197
  The Python ecosystem has built [entire libraries](https://github.com/joblib/threadpoolctl) just to work around this problem, and [scikit-learn's documentation](https://scikit-learn.org/stable/computing/parallelism.html) devotes a full page to managing the interaction between joblib parallelism and BLAS thread pools.
185
198
 
186
199
  NumKong takes a different position: __the numerics layer should not own threads__.
187
200
  Modern hardware makes the "spawn N threads and split evenly" model increasingly untenable:
188
201
 
189
- - __Server-grade CPUs__ have hundreds of cores split across sockets, chiplets, and tiles, resulting in dozens of physical NUMA domains with vastly different memory access latencies.
202
+ - __Server-grade CPUs__ have hundreds of cores split across sockets, chiplets, and tiles, resulting in dozens of physical [NUMA](https://en.wikipedia.org/wiki/Non-uniform_memory_access) domains with vastly different memory access latencies.
190
203
  A thread pool that ignores NUMA topology will spend more time on remote memory stalls than on arithmetic.
191
204
  - __Consumer-grade CPUs__ pack heterogeneous Quality-of-Service core types on the same die — Intel P-cores and E-cores run at different frequencies and sometimes support different ISA extensions.
192
205
  A naive work-split gives equal chunks to fast and slow cores, and the whole task stalls waiting for the slowest partition.
@@ -196,7 +209,7 @@ Modern hardware makes the "spawn N threads and split evenly" model increasingly
196
209
  Instead, NumKong exposes __row-range parameters__ that let the caller partition work across any threading model.
197
210
  For GEMM-shaped `dots_packed`, this is straightforward — pass a slice of A's rows and the full packed B to compute the corresponding slice of C.
198
211
  For SYRK-shaped `dots_symmetric`, explicit `start_row` / `end_row` parameters control which rows of the symmetric output matrix a given thread computes.
199
- The GIL is released around every kernel call, making NumKong compatible with `concurrent.futures`, `multiprocessing`, or any other parallelism model:
212
+ The [GIL](https://docs.python.org/3/glossary.html#term-global-interpreter-lock) (Global Interpreter Lock) is released around every kernel call, making NumKong compatible with `concurrent.futures`, `multiprocessing`, or any other parallelism model:
200
213
 
201
214
  ```python
202
215
  import concurrent.futures, numkong as nk, numpy as np
@@ -213,12 +226,12 @@ with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as pool:
213
226
  list(pool.map(compute_slice, range(num_threads)))
214
227
  ```
215
228
 
216
- For users who want a ready-made low-latency thread pool without the oversubscription baggage of OpenMP, we built [Fork Union](https://github.com/ashvardanian/ForkUnion) — a minimalist fork-join library for C, C++, and Rust that avoids mutexes, CAS atomics, and dynamic allocations on the critical path, with optional NUMA pinning on Linux.
229
+ For users who want a ready-made low-latency thread pool without the oversubscription baggage of OpenMP, we built [ForkUnion](https://github.com/ashvardanian/ForkUnion) — a minimalist fork-join library for C, C++, and Rust that avoids mutexes, CAS atomics, and dynamic allocations on the critical path, with optional NUMA pinning on Linux.
217
230
 
218
231
  ### Memory Allocation & Management
219
232
 
220
233
  BLAS libraries typically allocate internal buffers during GEMM — [OpenBLAS](https://github.com/OpenMathLib/OpenBLAS) packs matrices into L2/L3-sized panels via per-thread buffer pools backed by `mmap` or `shmget`.
221
- This hidden allocation has caused real problems: [14 lock/unlock pairs per small GEMM call](https://github.com/xianyi/OpenBLAS/issues/478) throttling 12-thread scaling to 2x, [silently incorrect results](https://github.com/xianyi/OpenBLAS/issues/1844) from thread-unsafe allocation in `np.dot`, and [deadlocks after `fork()`](https://github.com/numpy/numpy/issues/30092) due to mutex state not being reset in child processes.
234
+ This hidden allocation has caused real problems: [14 lock/unlock pairs per small GEMM call](https://github.com/OpenMathLib/OpenBLAS/issues/478) throttling 12-thread scaling to 2x, [silently incorrect results](https://github.com/OpenMathLib/OpenBLAS/issues/1844) from thread-unsafe allocation in `np.dot`, and [deadlocks after `fork()`](https://github.com/numpy/numpy/issues/30092) due to mutex state not being reset in child processes.
222
235
  The [BLASFEO](https://github.com/giaf/blasfeo) library was created specifically for embedded model-predictive control where `malloc` during computation is unacceptable.
223
236
 
224
237
  NumKong __never allocates memory__.
@@ -241,14 +254,14 @@ NumKong's `nk_dots_pack_*` family performs five transformations beyond simple re
241
254
  import numkong as nk, numpy as np
242
255
 
243
256
  right_matrix = np.random.randn(1000, 768).astype(np.float16)
244
- right_packed = nk.dots_pack(right_matrix, dtype="float16") # pack once
257
+ right_packed = nk.dots_pack(right_matrix, dtype=nk.float16) # pack once
245
258
  for query_batch in stream: results = nk.dots_packed(query_batch, right_packed) # reuse many times
246
259
  ```
247
260
 
248
261
  ### Why Not Just GEMM? The Evolution of Matrix Multiplication APIs
249
262
 
250
263
  The classic BLAS GEMM computes $C = \alpha A B + \beta C$ for Float32/Float64 matrices.
251
- It's a powerful primitive, but the workloads that dominate modern compute — LLM inference, vector search, quantum simulation expose three ways in which the traditional GEMM interface falls short.
264
+ It covers many use cases, but LLM inference, vector search, and quantum simulation expose three ways in which the traditional interface falls short.
252
265
 
253
266
  __Frozen weights justify separating packing from computation.__
254
267
  During LLM inference, a very large share of GEMM calls use a static weight matrix — weights don't change after loading.
@@ -272,18 +285,18 @@ The standard BLAS interface was never designed for sub-byte types either — [no
272
285
  __Some operations need more than GEMM + postprocessing.__
273
286
  NumKong implements several GEMM-shaped operations where the "epilogue" is too complex for a simple addition:
274
287
 
275
- - __Bilinear forms__ ($a^T C b$) in quantum computing compute a [scalar expectation value](https://phys.libretexts.org/Bookshelves/Quantum_Mechanics/Advanced_Quantum_Mechanics_(Kok)/10:_Pauli_Spin_Matrices/10.2:_Expectation_Values) — the naive approach materializes an $N$-dimensional intermediate vector $Cb$, but NumKong's typed `nk_bilinear_*` kernels stream through rows of $C$ with nested compensated dot products, never allocating beyond registers.
288
+ - __Bilinear forms__ ($a^T C b$) in quantum computing compute a [scalar expectation value](<https://phys.libretexts.org/Bookshelves/Quantum_Mechanics/Advanced_Quantum_Mechanics_(Kok)/10:_Pauli_Spin_Matrices/10.2:_Expectation_Values>) — the naive approach materializes an $N$-dimensional intermediate vector $Cb$, but NumKong's typed `nk_bilinear_*` kernels stream through rows of $C$ with nested compensated dot products, never allocating beyond registers.
276
289
  For complex-valued quantum states, where the intermediate would be a 2N-element complex vector, the savings double.
277
290
  - __MaxSim scoring__ for [ColBERT-style late-interaction retrieval](https://github.com/stanford-futuredata/ColBERT) computes $\sum_i \min_j \text{angular}(q_i, d_j)$ — a sum-of-min-distances across token pairs.
278
- A GEMM would produce the full $M \times N$ similarity matrix, but NumKong's typed `nk_maxsim_packed_*` kernels fuse a coarse Int8-quantized screening with full-precision angular refinement on winning pairs only, __packing both query and document matrices__ to enable all 4 SME tiles as accumulators (+33% throughput vs `dots_packed`).
279
- [PLAID](https://ar5iv.labs.arxiv.org/html/2205.09707) and [maxsim-cpu](https://www.mixedbread.com/blog/maxsim-cpu) have independently shown that dedicated MaxSim kernels outperform the GEMM decomposition by 5–10x.
291
+ A GEMM would produce the full $M \times N$ similarity matrix, but NumKong's typed `nk_maxsim_packed_*` kernels fuse a coarse Int8-quantized screening with full-precision angular refinement on winning pairs only, packing both query and document matrices to use all 4 SME tiles as accumulators.
292
+ [PLAID](https://ar5iv.labs.arxiv.org/html/2205.09707) and [maxsim-cpu](https://www.mixedbread.com/blog/maxsim-cpu) have independently shown that dedicated MaxSim kernels can outperform the GEMM decomposition by 5–10x.
280
293
 
281
294
  NumKong treats these as first-class operations — `dots_packed`, `euclideans_packed`, `angulars_packed`, typed `nk_bilinear_*` kernels, and typed `nk_maxsim_packed_*` kernels — rather than decomposing everything into GEMM + postprocessing.
282
295
 
283
296
  ### Precision by Design: Saturation, Rounding, & Float6 Over Float8
284
297
 
285
- Floating-point arithmetic on computers [is not associative](https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html): $(a + b) + c \neq a + (b + c)$ in general, and the standard advice — "upcast to wider types" often isn't enough, and always costs performance.
286
- NumKong makes opinionated, operation-specific decisions about where to spend precision and where to economize, rather than applying one IEEE rule uniformly.
298
+ Floating-point arithmetic on computers [is not associative](https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html): $(a + b) + c \neq a + (b + c)$ in general, and upcasting to wider types is not always sufficient.
299
+ NumKong makes operation-specific decisions about where to spend precision and where to economize, rather than applying one rule uniformly.
287
300
 
288
301
  __Saturation depends on the operation.__
289
302
  A reduction over a 4 GB array of `i8` values contains ~4 billion elements — but [Int32 wrapping overflow](https://cedardb.com/blog/overflow_handling/) occurs after just ~17 million Int8 summands ($127 \times 16.9\text{M} > 2^{31}$).
@@ -294,7 +307,7 @@ x86 provides no saturating 32-bit SIMD add ([only byte/word variants](https://ww
294
307
  __Square roots & special math ops are platform-specific.__
295
308
  Angular distance requires $1/\sqrt{\|a\|^2 \cdot \|b\|^2}$ — but the cost of computing this normalization varies dramatically across hardware.
296
309
  x86 `VSQRTPS` takes [~12 cycles](https://uops.info/html-lat/SKX/VSQRTPS_XMM_XMM-Measurements.html), followed by `VDIVPS` at ~11 cycles — totalling ~23 cycles for a precise `1/sqrt(x)`.
297
- The `VRSQRT14PS` alternative starts with a [14-bit estimate in ~4 cycles](https://www.intel.com/content/www/us/en/developer/articles/code-sample/reference-implementations-for-ia-approximation-instructions-vrcp14-vrsqrt14-vrcp28-vrsqrt28-vexp2.html), then one Newton-Raphson iteration ($y = y \cdot (1.5 - 0.5 x y^2)$, ~4 more cycles) reaches full Float32 precision — a __~3x speedup__.
310
+ The `VRSQRT14PS` alternative starts with a [14-bit estimate in ~4 cycles](https://www.intel.com/content/www/us/en/developer/articles/code-sample/reference-implementations-for-ia-approximation-instructions-vrcp14-vrsqrt14-vrcp28-vrsqrt28-vexp2.html), then one Newton-Raphson iteration ($y = y \cdot (1.5 - 0.5 x y^2)$, ~4 more cycles) reaches full Float32 precision — roughly 3x faster.
298
311
  ARM's `FRSQRTE` provides only [~8 bits](https://github.com/DLTcollab/sse2neon/issues/526), requiring __two__ Newton-Raphson iterations to match.
299
312
  NumKong selects the iteration count per platform so the final ULP bound is consistent across ISAs, rather than exposing different precision to different users.
300
313
 
@@ -347,10 +360,10 @@ The first call to `nk_capabilities()` initializes the dispatch table; all subseq
347
360
 
348
361
  ### Float64 & Float32: IEEE Precision
349
362
 
350
- __Float64__ — NumKong deviates from most BLAS-like libraries by leveraging __compensated summation__ that tracks numerical errors separately.
351
- On serial paths, we use __Neumaier's algorithm__ (1974), an improvement over Kahan-Babuška that correctly handles cases where added terms are larger than the running sum, achieving $O(1)$ error growth instead of $O(n)$.
363
+ __Float64__ — NumKong uses __compensated summation__ that tracks numerical errors separately.
364
+ On serial paths, we use __[Neumaier's algorithm](https://en.wikipedia.org/wiki/Kahan_summation_algorithm#Further_enhancements)__ (1974), an improvement over Kahan-Babuška that correctly handles cases where added terms are larger than the running sum, achieving $O(1)$ error growth instead of $O(n)$.
352
365
  On SIMD paths with FMA support, we implement the __Dot2 algorithm__ (Ogita-Rump-Oishi, 2005), maintaining separate error compensators for both multiplication and accumulation via `TwoProd` and `TwoSum` operations.
353
- The accuracy gains are visible in the [benchmark tables above](#latency-throughput--numerical-stability-together-in-a-tiny-package) — compensated Float64 is ideal for scientific computing where numerical stability matters more than raw speed.
366
+ The accuracy differences are visible in the [benchmark tables above](#latency-throughput--numerical-stability) — compensated Float64 suits scientific computing where numerical stability matters more than raw speed.
354
367
 
355
368
  __Float32__ — SIMD implementations load Float32 values, upcast to Float64 for full-precision multiplication and accumulation, then downcast only during finalization.
356
369
  This avoids catastrophic cancellation at minimal cost since modern CPUs have dedicated Float64 vector units operating at nearly the same throughput as Float32.
@@ -368,7 +381,7 @@ e = (sum - t) + product; // Compensator term
368
381
 
369
382
  ### BFloat16 & Float16: Half Precision
370
383
 
371
- __BFloat16__ — not an IEEE 754 standard type, but the __universal recommendation__ for AI workloads.
384
+ __BFloat16__ — not an IEEE 754 standard type, but widely adopted for AI workloads.
372
385
  BFloat16 shares Float32's 8-bit exponent but truncates the mantissa to 7 bits, prioritizing __dynamic range over precision__ (±3.4×10³⁸ with coarser granularity).
373
386
  On old CPUs, upcasting BFloat16 to Float32 requires just an unpack and left-shift by 16 bits (essentially free); on newer CPUs, both Arm and x86 provide widening mixed-precision dot products via __DPBF16PS__ (AVX-512 on Genoa/Sapphire Rapids) and __BFDOT__ (NEON on ARMv8.6-A Graviton 3+).
374
387
  NumKong's Float8 types (E4M3/E5M2) upcast to BFloat16 before using DPBF16PS, creating a three-tier precision hierarchy: Float8 for storage, BFloat16 for compute, Float32 for accumulation.
@@ -378,36 +391,43 @@ Float16 prioritizes __precision over range__ (10 vs 7 mantissa bits), making it
378
391
  On x86, older CPUs use __F16C extensions__ (Ivy Bridge+) for fast Float16 → Float32 conversion; Sapphire Rapids+ adds native __AVX-512-FP16__ with dedicated Float16 arithmetic.
379
392
  On Arm, ARMv8.4-A adds __FMLAL/FMLAL2__ instructions for fused Float16 → Float32 widening multiply-accumulate, reducing the total latency from 7 cycles to 4 cycles and achieving 20–48% speedup over the separate convert-then-FMA path.
380
393
 
381
- | Platform | BFloat16 Path | Elem/Op | Float16 Path | Elem/Op |
382
- | ---------------------- | ------------------------ | ------: | ---------------------- | ------: |
383
- | __x86__ | | | | |
384
- | Sapphire Rapids (2023) | ↓ Genoa | 32 | Skylake | 16 |
385
- | Genoa (2022) | `VDPBF16PS` widening dot | 32 | ↓ Skylake | 16 |
386
- | Skylake (2015) | `SLLI` + `VFMADD` | 16 | `VCVTPH2PS` + `VFMADD` | 16 |
387
- | Haswell (2013) | `SLLI` + `VFMADD` | 8 | `VCVTPH2PS` + `VFMADD` | 8 |
388
- | __Arm__ | | | | |
389
- | Graviton 3 (2021) | `SVBFDOT` widening dot | 4–32 | `SVCVT` → `SVFMLA` | 4–32 |
390
- | Apple M2+ (2022) | `BFDOT` widening dot | 8 | FP16FML | 8 |
391
- | Apple M1 (2020) | NEON | 8 | `FMLAL` widening FMA | 8 |
392
- | Graviton 2 (2019) | ↓ NEON | 8 | `FCVTL` + `FMLA` | 4 |
393
- | Graviton 1 (2018) | `SHLL` + `FMLA` | 8 | bit-manip `FMLA` | 8 |
394
+ | Platform | BFloat16 Path | Elem/Op | Float16 Path | Elem/Op |
395
+ | ---------------------- | -------------------------- | ------: | ---------------------- | ------: |
396
+ | __x86__ | | | | |
397
+ | Diamond Rapids (2025) | ↓ Genoa | 32 | `VDPPHPS` widening dot | 32 |
398
+ | Sapphire Rapids (2023) | Genoa | 32 | ↓ Skylake | 16 |
399
+ | Genoa (2022) | `VDPBF16PS` widening dot | 32 | Skylake | 16 |
400
+ | Skylake (2015) | `SLLI` + `VFMADD` | 16 | `VCVTPH2PS` + `VFMADD` | 16 |
401
+ | Haswell (2013) | `SLLI` + `VFMADD` | 8 | `VCVTPH2PS` + `VFMADD` | 8 |
402
+ | __Arm__ | | | | |
403
+ | Graviton 3 (2021) | `SVBFDOT` widening dot | 4–32 | `SVCVT` → `SVFMLA` | 4–32 |
404
+ | Apple M2+ (2022) | `BFDOT` widening dot | 8 | FP16FML | 8 |
405
+ | Apple M1 (2020) | ↓ NEON | 8 | `FMLAL` widening FMA | 8 |
406
+ | Graviton 2 (2019) | NEON | 8 | `FCVTL` + `FMLA` | 4 |
407
+ | Graviton 1 (2018) | `SHLL` + `FMLA` | 8 | bit-manip → `FMLA` | 8 |
408
+ | __RISC-V__ | | | | |
409
+ | RVV + Zvfbfwma | `VFWMACCBF16` widening FMA | 4–32 | ↓ RVV | 4–32 |
410
+ | RVV + Zvfh | ↓ RVV | 4–32 | `VFWMACC` widening FMA | 4–32 |
411
+ | RVV | shift + `VFMACC` | 4–32 | convert + `VFMACC` | 4–32 |
394
412
 
395
413
  > BFloat16 shares Float32's 8-bit exponent, so upcasting is a 16-bit left shift (`SLLI` on x86, `SHLL` on Arm) that zero-pads the truncated mantissa — essentially free.
396
414
  > Float16 has a different exponent width (5 vs 8 bits), requiring a dedicated convert: `VCVTPH2PS` (x86 F16C) or `FCVTL` (Arm NEON).
397
415
  > Widening dot products (`VDPBF16PS`, `BFDOT`, `FMLAL`) fuse the conversion and multiply-accumulate into one instruction.
398
416
  > Sapphire Rapids has native `VFMADDPH` for Float16 arithmetic, but NumKong does not use it for general dot products — Float16 accumulation loses precision.
399
417
  > It is only used for mini-float (E2M3/E3M2) paths where periodic flush-to-Float32 windows keep error bounded.
418
+ > The table above covers only vector dot-product paths — GEMMs also leverage Arm SME and Intel AMX instructions.
419
+ > Beyond x86, Arm, and RISC-V, NumKong also ships LoongArch, WebAssembly, and PowerPC backends, which are likewise excluded from the table.
400
420
 
401
421
  ### Mini-Floats: E4M3, E5M2, E3M2, & E2M3
402
422
 
403
- | Format | Bits | Range | NumKong Promotion Strategy | Support in GPUs |
404
- | ------------------------- | ----: | -----: | ------------------------------------------- | ------------------------- |
405
- | E5M2FN | 8 | ±57344 | BFloat16 → Float32 | H100, B200, MI300, MI325 |
406
- | E4M3FN | 8 | ±448 | BFloat16 → Float32 | H100, B200, MI300, MI325 |
407
- | E3M2FN | 6 → 8 | ±28 | BFloat16 & Float16 → Float32, Int16 → Int32 | only block-scaled support |
408
- | E2M3FN | 6 → 8 | ±7.5 | BFloat16 & Float16 → Float32, Int8 → Int32 | only block-scaled support |
409
- | Block-scaled NVFP4 | 4 | ±6 | — | B200 |
410
- | Block-scaled MXFP4 / E2M1 | 4 | ±6 | — | B200, MI325 |
423
+ | Format | Bits | Range | NumKong Promotion Rules | Support in GPUs |
424
+ | ------------------------- | ----: | -----: | ----------------------------------------------- | ----------------- |
425
+ | E5M2FN | 8 | ±57344 | BFloat16 → Float32 | H100+, MI300+ |
426
+ | E4M3FN | 8 | ±448 | BFloat16 → Float32 | H100+, MI300+ |
427
+ | E3M2FN | 6 → 8 | ±28 | BFloat16 & Float16 → Float32,<br/>Int16 → Int32 | only block-scaled |
428
+ | E2M3FN | 6 → 8 | ±7.5 | BFloat16 & Float16 → Float32,<br/>Int8 → Int32 | only block-scaled |
429
+ | Block-scaled NVFP4 | 4 | ±6 | — | B200+ |
430
+ | Block-scaled MXFP4 / E2M1 | 4 | ±6 | — | B200+, MI325+ |
411
431
 
412
432
  > __Block scaling.__
413
433
  > NumKong does not implement block-scaled variants (MXFP4, NVFP4, or block-scaled E3M2/E2M3).
@@ -424,33 +444,72 @@ E4M3FN (no infinities, NaN only) is preferred for __training__ where precision n
424
444
  On x86 Genoa/Sapphire Rapids, E4M3/E5M2 values upcast to BFloat16 via lookup tables, then use native __DPBF16PS__ for 2-per-lane dot products accumulating to Float32.
425
445
  On Arm Graviton 3+, the same BFloat16 upcast happens via NEON table lookups, then __BFDOT__ instructions complete the computation.
426
446
 
447
+ | Platform | E5M2 Path | Elem/Op | E4M3 Path | Elem/Op |
448
+ | -------------------------- | ------------------------------ | ------: | ------------------------------ | ------: |
449
+ | __x86__ | | | | |
450
+ | Diamond Rapids (2025) | `VCVTBF82PH` → F16 + `VDPPHPS` | 32 | `VCVTHF82PH` → F16 + `VDPPHPS` | 32 |
451
+ | Genoa (2022) | → BF16 + `VDPBF16PS` | 32 | ↓ Ice Lake | 64 |
452
+ | Ice Lake (2019) | ↓ Skylake | 16 | octave LUT + `VPDPBUSD` | 64 |
453
+ | Skylake (2015) | rebias → F32 FMA | 16 | rebias → F32 FMA | 16 |
454
+ | Haswell (2013) | rebias → F32 FMA | 8 | rebias → F32 FMA | 8 |
455
+ | __Arm__ | | | | |
456
+ | NEON + FP8DOT (Olympus) | native `FDOT` | 16 | native `FDOT` | 16 |
457
+ | NEON + FP16FML (Apple M1+) | SHL → F16 + `FMLAL` | 16 | LUT → F16 + `FMLAL` | 16 |
458
+ | NEON (Graviton 1+) | SHL + `FCVTL` + FMA | 8 | → F16 + `FCVTL` + FMA | 8 |
459
+ | __RISC-V__ | | | | |
460
+ | RVV + Zvfbfwma | rebias → BF16 + `VFWMACCBF16` | 4–32 | LUT → BF16 + `VFWMACCBF16` | 4–32 |
461
+ | RVV + Zvfh | SHL → F16 + `VFWMACC` | 4–32 | LUT → F16 + `VFWMACC` | 4–32 |
462
+ | RVV | rebias → F32 + `VFMACC` | 4–32 | LUT → F32 + `VFMACC` | 4–32 |
463
+
464
+ > E5M2 shares Float16's exponent bias (15), so E5M2 → Float16 conversion is a single left-shift by 8 bits (`SHL 8`).
465
+ > E4M3 on Ice Lake uses "octave decomposition": the 4-bit exponent splits into 2 octave + 2 remainder bits, yielding 7 integer accumulators post-scaled by powers of 2.
466
+
427
467
  __6-bit floats (E3M2 & E2M3)__ follow the [OCP MX v1.0 standard](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf).
428
468
  Their smaller range allows scaling to exact integers that fit in `i8`/`i16`, enabling integer `VPDPBUSD`/`SDOT` accumulation instead of the floating-point pipeline.
429
469
  Float16 can also serve as an accumulator, accurately representing ~50 products of E3M2FN pairs or ~20 products of E2M3FN pairs before overflow.
430
470
  On Arm, NEON FHM extensions bring widening `FMLAL` dot-products for Float16 — both faster and more widely available than `BFDOT` for BFloat16.
431
471
 
472
+ | Platform | E3M2 Path | Elem/Op | E2M3 Path | Elem/Op |
473
+ | ---------------------------- | -------------------------- | ------: | ---------------------------- | ------: |
474
+ | __x86__ | | | | |
475
+ | Ice Lake (2019) | `VPERMW` LUT + `VPMADDWD` | 32 | `VPERMB` LUT + `VPDPBUSD` | 64 |
476
+ | Sierra Forest (2024) | ↓ Haswell | 32 | `VPSHUFB` LUT + `VPDPBSSD` | 32 |
477
+ | Alder Lake (2021) | ↓ Haswell | 32 | `VPSHUFB` LUT + `VPDPBUSD` | 32 |
478
+ | Skylake (2015) | `VPSHUFB` LUT + `VPMADDWD` | 64 | `VPSHUFB` LUT + `VPMADDUBSW` | 64 |
479
+ | Haswell (2013) | `VPSHUFB` LUT + `VPMADDWD` | 32 | `VPSHUFB` LUT + `VPMADDUBSW` | 32 |
480
+ | __Arm__ | | | | |
481
+ | NEON + FP8DOT (Olympus) | → E5M2 + `FDOT` | 16 | → E4M3 + `FDOT` | 16 |
482
+ | NEON + DotProd (Graviton 2+) | `VQTBL2` LUT + `SMLAL` | 16 | `VQTBL2` LUT + `SDOT` | 16 |
483
+ | NEON (Graviton 1+) | → F16 + `FCVTL` + FMA | 16 | → F16 + `FCVTL` + FMA | 16 |
484
+ | __RISC-V__ | | | | |
485
+ | RVV | I16 gather LUT + `VWMACC` | 4–32 | U8 gather LUT + `VWMACC` | 4–32 |
486
+
487
+ > E3M2/E2M3 values map to exact integers via 32-entry LUTs (magnitudes up to 448 for E3M2, 120 for E2M3), enabling integer accumulation with no rounding error.
488
+ > On NEON + FP8DOT, E3M2 is first promoted to E5M2 and E2M3 to E4M3 before the hardware `FDOT` instruction.
489
+ > Sierra Forest and Alder Lake use native `VPDPBSSD` (signed×signed) and `VPDPBUSD` (unsigned×signed) respectively for E2M3.
490
+
432
491
  E4M3 and E5M2 cannot use the integer path.
433
492
  E4M3 scaled by 16 reaches 7,680 — too large for Int8, barely fitting Int16 with a 128-entry table.
434
493
  E5M2's range (±57,344) makes the scaled product exceed Int32 entirely.
435
494
  Without the integer path, E5M2 falls back to Float32 accumulation — where its [2-bit mantissa (only 4 values per binade)](https://developer.nvidia.com/blog/floating-point-8-an-introduction-to-efficient-lower-precision-ai-training/) creates a [catastrophic cancellation risk](https://www.ac.uma.es/arith2024/papers/Fused%20FP8%204-Way%20Dot%20Product%20with%20Scaling%20and%20FP32%20Accumulation.pdf) that E2M3's integer path avoids completely:
436
495
 
437
- | | _i_ = 0 | _i_ = 1 | _i_ = 2 | _i_ = 3 | _i_ = 4 | _i_ = 5 | _i_ = 6 | _i_ = 7 |
438
- | ------- | -------: | ------: | -------: | --------: | -------: | -------: | -------: | -------: |
439
- | _aᵢ_ | 0.00122 | 20480 | −0.00122 | 1.5 | −0.00586 | −3072 | −640 | 0.00146 |
440
- | _bᵢ_ | −40 | 320 | −1280 | −7.63e⁻⁵ | 0 | 0.000427 | 10240 | −4.58e⁻⁵ |
441
- | _aᵢ·bᵢ_ | −0.04883 | 6553600 | 1.5625 | −0.000114 | 0 | −1.3125 | −6553600 | ≈ 0 |
496
+ | | _i_ = 0 | _i_ = 1 | _i_ = 2 | _i_ = 3 | _i_ = 4 | _i_ = 5 | _i_ = 6 |
497
+ | ------- | -------: | ------: | -------: | --------: | -------: | -------: | -------: |
498
+ | _aᵢ_ | 0.00122 | 20480 | −0.00122 | 1.5 | −3072 | −640 | 0.00146 |
499
+ | _bᵢ_ | −40 | 320 | −1280 | −7.63e⁻⁵ | 0.000427 | 10240 | −4.58e⁻⁵ |
500
+ | _aᵢ·bᵢ_ | −0.04883 | 6553600 | 1.5625 | −0.000114 | −1.3125 | −6553600 | ≈ 0 |
442
501
 
443
502
  > __Why Float32 accumulation fails here.__
444
- > The accurate sum of these 8 products is ≈ 0.201.
445
- > After two `vfmaq_f32` calls, the 4 accumulator lanes hold pairwise products: lanes 1 and 2 carry values around ±6.5 M.
446
- > At that magnitude the Float32 ULP is 0.5 — so the small meaningful terms (−0.049, 1.563, −1.313, −0.0001) are all below one ULP and get absorbed during pairwise reduction.
503
+ > The accurate sum of these 7 products is ≈ 0.201.
504
+ > A `vfmaq_f32` call accumulates 4 lanes at a time; the first batch already carries values around ±6.5 M.
505
+ > At that magnitude the Float32 ULP is 0.5 — so the small meaningful terms (−0.049, 1.563, −1.313, −0.0001) are all below one ULP and get absorbed during lane reduction.
447
506
  > The large terms then cancel exactly to zero, and the information is gone.
448
507
  > Final Float32 result: __0.0__ instead of __0.201__.
449
508
 
450
509
  ### Int8 & Int4: Integer Types
451
510
 
452
511
  Both signed and unsigned 8-bit and 4-bit integers are supported with __Int32 accumulation__ to prevent overflow.
453
- The most sophisticated optimization is the __VNNI algebraic transform__: on Ice Lake+ with AVX-512 VNNI, the native __DPBUSD__ instruction is asymmetric (unsigned × signed → signed), yet NumKong exploits it for both Int8×Int8 and UInt8×UInt8.
512
+ A notable optimization is the __VNNI algebraic transform__: on Ice Lake+ with AVX-512 VNNI, the native __DPBUSD__ instruction is asymmetric (unsigned × signed → signed), but NumKong uses it for both Int8×Int8 and UInt8×UInt8.
454
513
  For __signed Int8×Int8__, we convert the signed operand to unsigned via XOR with `0x80`, compute `DPBUSD(a⊕0x80, b) = (a+128)×b`, then subtract a correction term `128×sum(b)` to recover the true result.
455
514
  For __unsigned UInt8×UInt8__, we XOR the second operand to make it signed, compute `DPBUSD(a, b⊕0x80) = a×(b-128)`, then add correction `128×sum(a)` via the fast SAD instruction.
456
515
 
@@ -480,7 +539,7 @@ Complex types are essential in quantum simulation (state vectors, density matric
480
539
  The `dot` operation computes the unconjugated dot product $\sum a_k b_k$, while `vdot` computes the conjugated inner product $\sum \bar{a}_k b_k$ standard in physics and signal processing.
481
540
 
482
541
  For complex dot products, NumKong defers sign flips until after the accumulation loop: instead of using separate FMA and FMS (fused multiply-subtract) instructions for the real component, we compute $a_r b_r + a_i b_i$ treating all products as positive, then apply a single bitwise XOR with `0x80000000` to flip the sign bits.
483
- This eliminates execution port contention, allowing dual FMA units to run at full capacity.
542
+ This avoids execution port contention between FMA and FMS, letting dual FMA units stay occupied.
484
543
 
485
544
  ```c
486
545
  for (...) { // Complex multiply optimization: XOR sign flip after the loop
@@ -490,6 +549,20 @@ for (...) { // Complex multiply optimization: XOR sign flip after the loop
490
549
  sum_real = xor(sum_real, 0x80000000); // Single XOR after loop
491
550
  ```
492
551
 
552
+ ## Reading Materials
553
+
554
+ Beyond the READMEs in this repository, there are several standalone articles covering different evolution steps and features of this library.
555
+
556
+ - [NumKong: 2'000 Mixed Precision Kernels For All](https://ashvardanian.com/posts/numkong/)
557
+ - [Hiding x86 Port Latency for 330 GB/s/core Reductions](https://ashvardanian.com/posts/cpu-ports/)
558
+ - [Understanding SIMD: Infinite Complexity of Trivial Problems](https://ashvardanian.com/posts/understanding-simd-complexity/)
559
+ - [NumPy vs BLAS: Losing 90% of Throughput](https://ashvardanian.com/posts/numpy-vs-blas-costs/)
560
+ - [5x Faster Set Intersections: SVE2, AVX-512, & NEON](https://ashvardanian.com/posts/simd-set-intersections-sve2-avx512/)
561
+ - [Python, C, Assembly - 2'500x Faster Cosine Similarity](https://ashvardanian.com/posts/python-c-assembly-comparison/)
562
+ - [GCC Compiler vs Human - 119x Faster Assembly](https://ashvardanian.com/posts/gcc-12-vs-avx512fp16/)
563
+ - [Accelerating JavaScript arrays by 10x for Vector Search](https://ashvardanian.com/posts/javascript-ai-vector-search/)
564
+ - [SciPy distances... up to 200x faster with AVX-512 & SVE](https://ashvardanian.com/posts/simsimd-faster-scipy/)
565
+
493
566
  ## License
494
567
 
495
568
  Feel free to use the project under Apache 2.0 or the Three-clause BSD license at your preference.