numkong 7.0.0 → 7.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +239 -122
- package/binding.gyp +25 -491
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
package/README.md
CHANGED
|
@@ -1,20 +1,19 @@
|
|
|
1
1
|
# NumKong: Mixed Precision for All
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
Over 1500 hand-tuned SIMD kernels for x86, Arm, RISC-V, and WASM power [Unum](https://www.unum.cloud/)'s open-source [USearch](https://github.com/unum-cloud/usearch) search engine and the DBMS & AI products built on it.
|
|
3
|
+
Portable mixed-precision math, linear-algebra, & retrieval library with 2'000+ SIMD kernels for x86, Arm, RISC-V, LoongArch, Power, & WebAssembly, leveraging rare algebraic transforms with both 1D & 2D registers like AMX & SME, covering 15+ numeric types from 4-bit integers & 6-bit floats to 128-bit complex numbers, validated against 118-bit extended-precision baselines with saturation, casting, & rounding edge-case coverage, in a 5-100x smaller binary than other BLAS-like alternatives, co-designed with Tensor abstractions in C++, Python, Rust, JavaScript, GoLang, & Swift.
|
|
5
4
|
|
|
6
5
|

|
|
7
6
|
|
|
8
|
-
## Latency, Throughput, & Numerical Stability
|
|
7
|
+
## Latency, Throughput, & Numerical Stability
|
|
9
8
|
|
|
10
9
|
Most libraries return dot products in the __same type as the input__ — Float16 × Float16 → Float16, Int8 × Int8 → Int8.
|
|
11
|
-
|
|
12
|
-
NumKong promotes to wider accumulators — Float16 → Float32, BFloat16 → Float32, Int8 → Int32, Float32 → Float64 — so results
|
|
10
|
+
This leads to quiet overflow: a 2048-dimensional `i8` dot product can reach ±10 million, but `i8` maxes out at 127.
|
|
11
|
+
NumKong promotes to wider accumulators — Float16 → Float32, BFloat16 → Float32, Int8 → Int32, Float32 → Float64 — so results stay in range.
|
|
13
12
|
|
|
14
|
-
> Single 2048-d dot product on Intel Sapphire Rapids
|
|
13
|
+
> Single 2048-d dot product on Intel [Sapphire Rapids](https://en.wikipedia.org/wiki/Sapphire_Rapids), single-threaded.
|
|
15
14
|
> Each cell shows __gso/s, mean relative error__ vs a higher-precision reference.
|
|
16
15
|
> gso/s = Giga Scalar Operations per Second — a more suitable name than GFLOP/s when counting both integer and floating-point work.
|
|
17
|
-
>
|
|
16
|
+
> NumPy 2.4, PyTorch 2.10, JAX 0.9.
|
|
18
17
|
|
|
19
18
|
| Input | NumPy + OpenBLAS | PyTorch + MKL | JAX | NumKong |
|
|
20
19
|
| :----- | ----------------------: | ----------------------: | ----------------------: | --------------------: |
|
|
@@ -27,12 +26,12 @@ NumKong promotes to wider accumulators — Float16 → Float32, BFloat16 → Flo
|
|
|
27
26
|
| `i8` | 1.1 gso/s, __overflow__ | 0.5 gso/s, __overflow__ | 0.5 gso/s, __overflow__ | 14.8 gso/s, 0% err |
|
|
28
27
|
|
|
29
28
|
A fair objection: PyTorch and JAX are designed for throughput, not single-call latency.
|
|
30
|
-
They lower execution graphs through XLA or vendored BLAS libraries like Intel MKL and Nvidia cuBLAS.
|
|
29
|
+
They lower execution graphs through [XLA](https://openxla.org/) or vendored BLAS libraries like [Intel MKL](https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl.html) and Nvidia [cuBLAS](https://developer.nvidia.com/cublas).
|
|
31
30
|
So here's the same comparison on a throughput-oriented workload — matrix multiplication:
|
|
32
31
|
|
|
33
|
-
> Matrix multiplication (2048 × 2048) × (2048 × 2048), single-threaded
|
|
34
|
-
>
|
|
35
|
-
>
|
|
32
|
+
> Matrix multiplication (2048 × 2048) × (2048 × 2048) on Intel Sapphire Rapids, single-threaded.
|
|
33
|
+
> gso/s = Giga Scalar Operations per Second, same format.
|
|
34
|
+
> NumPy 2.4, PyTorch 2.10, JAX 0.9, same versions.
|
|
36
35
|
|
|
37
36
|
| Input | NumPy + OpenBLAS | PyTorch + MKL | JAX | NumKong |
|
|
38
37
|
| :----- | ----------------------: | -----------------------: | -----------------------: | -------------------: |
|
|
@@ -44,74 +43,132 @@ So here's the same comparison on a throughput-oriented workload — matrix multi
|
|
|
44
43
|
| `e5m2` | — | 0.4 gso/s, 4.6% err | ~26.4 gso/s, 4.6% err | 398 gso/s, 0% err |
|
|
45
44
|
| `i8` | 0.4 gso/s, __overflow__ | 50.0 gso/s, __overflow__ | ~0.0 gso/s, __overflow__ | 1279 gso/s, 0% err |
|
|
46
45
|
|
|
47
|
-
For `f64`,
|
|
48
|
-
For `f32`, widening to Float64 gives
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
|
53
|
-
|
|
|
54
|
-
|
|
|
55
|
-
|
|
|
56
|
-
|
|
|
57
|
-
|
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
46
|
+
For `f64`, compensated "Dot2" summation reduces error by 10–50× compared to naive Float64 accumulation, depending on vector length.
|
|
47
|
+
For `f32`, widening to Float64 gives 5–10× lower error.
|
|
48
|
+
The library ships as a relatively small binary:
|
|
49
|
+
|
|
50
|
+
| Package | Size | Parallelism & Memory | Available For |
|
|
51
|
+
| :--------------- | -----: | :------------------------------------------------ | :---------------- |
|
|
52
|
+
| PyTorch + MKL | 705 MB | Vector & Tile SIMD, OpenMP Threads, Hidden Allocs | Python, C++, Java |
|
|
53
|
+
| JAX + jaxlib | 357 MB | Vector SIMD, XLA Threads, Hidden Allocs | Python |
|
|
54
|
+
| NumPy + OpenBLAS | 30 MB | Vector SIMD, Built-in Threads, Hidden Allocs | Python |
|
|
55
|
+
| mathjs | 9 MB | No SIMD, No Threads, Many Allocs | JS |
|
|
56
|
+
| NumKong | 5 MB | Vector & Tile SIMD, Your Threads, Your Allocs | 7 languages |
|
|
57
|
+
|
|
58
|
+
Every kernel is validated against 118-bit extended-precision baselines with per-type ULP budgets across log-normal, uniform, and Cauchy input distributions.
|
|
59
|
+
Tests check triangle inequality, Cauchy-Schwarz bounds, NaN propagation, overflow detection, and probability-simplex constraints for each ISA variant.
|
|
60
|
+
Results are cross-validated against OpenBLAS, Intel MKL, and Apple Accelerate.
|
|
61
61
|
A broader throughput comparison is maintained in [NumWars](https://github.com/ashvardanian/NumWars).
|
|
62
62
|
|
|
63
63
|
## Quick Start
|
|
64
64
|
|
|
65
|
-
| Language
|
|
66
|
-
|
|
|
67
|
-
| C / C++
|
|
68
|
-
| Python
|
|
69
|
-
| Rust
|
|
70
|
-
|
|
|
71
|
-
| Swift
|
|
72
|
-
| Go
|
|
65
|
+
| Language | Install | Compatible with | Guide |
|
|
66
|
+
| :------- | :------------------------- | :----------------------------- | :------------------------------------------- |
|
|
67
|
+
| C / C++ | CMake, headers, & prebuilt | Linux, macOS, Windows, Android | [include/README.md](include/README.md) |
|
|
68
|
+
| Python | `pip install` | Linux, macOS, Windows | [python/README.md](python/README.md) |
|
|
69
|
+
| Rust | `cargo add` | Linux, macOS, Windows | [rust/README.md](rust/README.md) |
|
|
70
|
+
| JS | `npm install` & `import` | Node.js, Bun, Deno & browsers | [javascript/README.md](javascript/README.md) |
|
|
71
|
+
| Swift | Swift Package Manager | macOS, iOS, tvOS, watchOS | [swift/README.md](swift/README.md) |
|
|
72
|
+
| Go | `go get` | Linux, macOS, Windows via cGo | [golang/README.md](golang/README.md) |
|
|
73
73
|
|
|
74
74
|
## What's Inside
|
|
75
75
|
|
|
76
|
-
NumKong
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
76
|
+
NumKong covers 17 numeric types — from 6-bit floats to 64-bit complex numbers — across dozens of operations and 30+ SIMD backends, with hardware-aware defaults: Arm prioritizes `f16`, x86 prioritizes `bf16`.
|
|
77
|
+
|
|
78
|
+
### Language Bindings
|
|
79
|
+
|
|
80
|
+
| Operation | [C/C++][c] | [Python][py] | [Rust][rs] | [JS][js] | [Swift][swift] | [Go][go] |
|
|
81
|
+
| :-------------------------- | :--------: | :----------: | :--------: | :------: | :------------: | :------: |
|
|
82
|
+
| __Vector Ops__ | | | | | | |
|
|
83
|
+
| [Dot] Product | ● | ● | ● | ● | ● | ● |
|
|
84
|
+
| [Spatial] Metric | ● | ● | ● | ● | ● | ● |
|
|
85
|
+
| [Set] Similarity | ● | ● | ● | ● | ● | ● |
|
|
86
|
+
| [Geo]spatial | ● | ● | ● | · | ● | ● |
|
|
87
|
+
| [Mesh] Alignment | ● | ● | ● | · | · | · |
|
|
88
|
+
| [Sparse] Products | ● | ● | ● | · | · | · |
|
|
89
|
+
| [Prob]ability Divergences | ● | ● | ● | ● | · | ● |
|
|
90
|
+
| [Curved] Spaces | ● | ● | ● | · | · | · |
|
|
91
|
+
| __Many-to-Many Vector Ops__ | | | | | | |
|
|
92
|
+
| "[Dots]" Products | ● | ● | ● | ● | ● | ● |
|
|
93
|
+
| "[Spatials]" Metrics | ● | ● | ● | ● | ● | ● |
|
|
94
|
+
| "[Sets]" Similarities | ● | ● | ● | · | ● | ● |
|
|
95
|
+
| [MaxSim] Scoring | ● | ● | ● | · | ● | ● |
|
|
96
|
+
| __Scalar Ops__ | | | | | | |
|
|
97
|
+
| [Cast] | ● | ● | ● | ● | · | · |
|
|
98
|
+
| [Reduce] | ● | ● | ● | · | · | · |
|
|
99
|
+
| [Each] | ● | ● | ● | · | · | · |
|
|
100
|
+
| [Trig]onometry | ● | ● | ● | · | · | · |
|
|
101
|
+
|
|
102
|
+
[Dot]: include/numkong/dot/README.md
|
|
103
|
+
[Dots]: include/numkong/dots/README.md
|
|
104
|
+
[Spatial]: include/numkong/spatial/README.md
|
|
105
|
+
[Spatials]: include/numkong/spatials/README.md
|
|
106
|
+
[Set]: include/numkong/set/README.md
|
|
107
|
+
[Sets]: include/numkong/sets/README.md
|
|
108
|
+
[Cast]: include/numkong/cast/README.md
|
|
109
|
+
[Reduce]: include/numkong/reduce/README.md
|
|
110
|
+
[Trig]: include/numkong/trigonometry/README.md
|
|
111
|
+
[MaxSim]: include/numkong/maxsim/README.md
|
|
112
|
+
[Mesh]: include/numkong/mesh/README.md
|
|
113
|
+
[Each]: include/numkong/each/README.md
|
|
114
|
+
[Sparse]: include/numkong/sparse/README.md
|
|
115
|
+
[Prob]: include/numkong/probability/README.md
|
|
116
|
+
[Curved]: include/numkong/curved/README.md
|
|
117
|
+
[Geo]: include/numkong/geospatial/README.md
|
|
118
|
+
[c]: include/README.md
|
|
119
|
+
[py]: python/README.md
|
|
120
|
+
[js]: javascript/README.md
|
|
121
|
+
[rs]: rust/README.md
|
|
122
|
+
[swift]: swift/README.md
|
|
123
|
+
[go]: golang/README.md
|
|
124
|
+
|
|
125
|
+
### Numeric Types × Backend
|
|
126
|
+
|
|
127
|
+
| Backend | f64 | f32 | bf16 | f16 | e5m2 | e4m3 | e3m2 | e2m3 | i8 | u8 | i4 | u4 | u1 | f64c | f32c | bf16c | f16c |
|
|
128
|
+
| :------------- | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
|
|
129
|
+
| __x86__ | | | | | | | | | | | | | | | | | |
|
|
130
|
+
| Haswell | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● |
|
|
131
|
+
| Skylake | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | · | · |
|
|
132
|
+
| Ice Lake | · | · | · | ● | · | · | ● | ● | ● | ● | ● | ● | ● | · | · | · | · |
|
|
133
|
+
| Genoa | · | · | ● | · | ● | ● | · | · | · | · | · | · | · | · | · | ● | · |
|
|
134
|
+
| Sapphire | · | · | · | ● | · | ● | · | · | ● | ● | · | · | · | · | · | · | · |
|
|
135
|
+
| Sapphire AMX | · | · | ● | · | ● | ● | ● | ● | ● | ● | · | · | · | · | · | · | · |
|
|
136
|
+
| Alder Lake | · | · | · | · | · | · | ● | ● | ● | ● | · | · | · | · | · | · | · |
|
|
137
|
+
| Sierra Forest | · | · | · | · | · | · | ● | ● | ● | ● | · | · | · | · | · | · | · |
|
|
138
|
+
| Turin | · | ● | ● | · | · | · | · | · | · | · | · | · | · | · | · | · | · |
|
|
139
|
+
| Diamond | · | · | · | ● | ● | ● | · | · | · | · | · | · | · | · | · | · | · |
|
|
140
|
+
| __Arm__ | | | | | | | | | | | | | | | | | |
|
|
141
|
+
| NEON | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | · | · | ● | ● | ● | · | ● |
|
|
142
|
+
| NEON Half | · | · | · | ● | · | · | · | · | ● | ● | · | · | · | · | · | · | · |
|
|
143
|
+
| NEON FHM | · | · | · | ● | ● | ● | · | · | · | · | · | · | · | · | · | · | ● |
|
|
144
|
+
| NEON BF16 | · | · | ● | · | ● | ● | · | · | · | · | · | · | · | · | · | ● | · |
|
|
145
|
+
| NEON SDot | · | · | · | · | · | · | ● | ● | ● | ● | ● | ● | · | · | · | · | · |
|
|
146
|
+
| NEON FP8 | · | · | · | · | ● | ● | ● | ● | · | · | · | · | · | · | · | · | · |
|
|
147
|
+
| SVE | ● | ● | · | · | · | · | · | · | · | ● | · | · | ● | ● | ● | · | · |
|
|
148
|
+
| SVE Half | · | · | · | ● | · | · | · | · | · | · | · | · | · | · | · | · | ● |
|
|
149
|
+
| SVE BF16 | · | · | ● | · | · | · | · | · | · | · | · | · | · | · | · | · | · |
|
|
150
|
+
| SVE SDot | · | · | · | · | · | · | · | · | ● | ● | · | · | · | · | · | · | · |
|
|
151
|
+
| SVE2 | · | ● | ● | · | · | · | · | · | · | · | · | · | · | · | · | · | · |
|
|
152
|
+
| SME | · | · | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | · | · | · | · | · |
|
|
153
|
+
| SME F64 | ● | ● | · | · | · | · | · | · | · | · | · | · | · | ● | ● | · | · |
|
|
154
|
+
| SME BI32 | · | · | · | · | · | · | · | · | · | · | · | · | ● | · | · | · | · |
|
|
155
|
+
| __RISC-V__ | | | | | | | | | | | | | | | | | |
|
|
156
|
+
| RVV | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | · | · |
|
|
157
|
+
| RVV Half | · | · | · | ● | ● | ● | · | · | · | · | · | · | · | · | · | · | · |
|
|
158
|
+
| RVV BF16 | · | · | ● | · | ● | ● | · | · | · | · | · | · | · | · | · | · | · |
|
|
159
|
+
| RVV BB | · | · | · | · | · | · | · | · | · | · | · | · | ● | · | · | · | · |
|
|
160
|
+
| __Other__ | | | | | | | | | | | | | | | | | |
|
|
161
|
+
| Power VSX | ● | ● | ● | ● | · | · | · | · | ● | ● | · | · | ● | · | · | · | · |
|
|
162
|
+
| LoongArch LASX | ● | ● | ● | ● | · | · | · | · | ● | ● | · | · | ● | · | · | · | · |
|
|
163
|
+
| WASM V128 | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | ● | · | · |
|
|
164
|
+
|
|
165
|
+
Not every combination is implemented — only the ones that unlock real performance gains.
|
|
108
166
|
The `icelake` level doesn't get a `dot_bf16` variant, for example, and falls through to `dot_bf16_skylake`.
|
|
109
167
|
Every operation has a `serial` fallback, but even types no CPU supports today get optimized via lookup tables and bit-twiddling hacks rather than scalar loops.
|
|
168
|
+
For details on compile-time and run-time [dispatch](#compile-time-and-run-time-dispatch), see the contributor guide.
|
|
110
169
|
|
|
111
170
|
## Design Decisions
|
|
112
171
|
|
|
113
|
-
In general there are a few principles that NumKong follows:
|
|
114
|
-
|
|
115
172
|
- Avoid loop unrolling and scalar tails.
|
|
116
173
|
- Don't manage threads and be compatible with any parallelism models.
|
|
117
174
|
- Don't manage memory and be compatible with arbitrary allocators & alignment.
|
|
@@ -140,17 +197,17 @@ float boring_dot_product_f32(float const *a, float const *b, size_t n) {
|
|
|
140
197
|
}
|
|
141
198
|
```
|
|
142
199
|
|
|
143
|
-
This kind of unrolling has been
|
|
200
|
+
This kind of unrolling has been a common request for NumKong, but the library avoids it by design.
|
|
144
201
|
|
|
145
202
|
__Modern CPUs already "unroll" in hardware.__
|
|
146
203
|
Out-of-order engines with reorder buffers of 320–630 entries (Zen 4: 320, Golden Cove: 512, Apple Firestorm: ~630) can keep a dozen loop iterations in-flight simultaneously.
|
|
147
204
|
The physical register file is much larger than the ISA-visible architectural registers — Skylake has ~180 physical integer registers behind 16 architectural GPRs, and ~168 physical vector registers behind 32 architectural ZMMs.
|
|
148
205
|
The register renaming unit maps the same `zmm0` in iteration N and iteration N+1 to different physical registers, extracting cross-iteration parallelism automatically — exactly the benefit that source-level unrolling was historically supposed to provide.
|
|
149
206
|
|
|
150
|
-
__Unrolling
|
|
207
|
+
__Unrolling works against NumKong's goals.__
|
|
151
208
|
Every unrolled copy is a distinct instruction in the binary.
|
|
152
209
|
With 1,500+ kernel endpoints across 30+ backends, even 2x unrolling would inflate the `.text` section by megabytes — directly impacting install size for Python wheels, NPM packages, and Rust crates.
|
|
153
|
-
Larger loop bodies also increase instruction-cache and micro-op-cache pressure; Agner Fog also recommends:
|
|
210
|
+
Larger loop bodies also increase instruction-cache and micro-op-cache pressure; [Agner Fog](https://www.agner.org/optimize/) also recommends:
|
|
154
211
|
|
|
155
212
|
> _"avoid loop unrolling where possible in order to economize the use of the micro-op cache"_.
|
|
156
213
|
|
|
@@ -162,9 +219,9 @@ The leftover elements after the last full SIMD chunk run through a scalar loop t
|
|
|
162
219
|
NumKong often uses masked loads instead (`_mm512_maskz_loadu_ps` on AVX-512, predicated `svld1_f32` on SVE), processing every element through the same arithmetic path regardless of alignment.
|
|
163
220
|
It's not exactly orthogonal to loop-unrolling, but makes a different kernel layout more compatible.
|
|
164
221
|
|
|
165
|
-
__The
|
|
222
|
+
__The gains come from elsewhere.__
|
|
166
223
|
On Intel Sapphire Rapids, NumKong was benchmarked against auto-vectorized code compiled with GCC 12.
|
|
167
|
-
GCC handles single-precision `float`
|
|
224
|
+
GCC handles single-precision `float` well, but struggles with `_Float16` and other mixed-precision paths:
|
|
168
225
|
|
|
169
226
|
| Kind | GCC 12 `f32` | GCC 12 `f16` | NumKong `f16` | `f16` improvement |
|
|
170
227
|
| :------------------------ | -----------: | -----------: | ------------: | ----------------: |
|
|
@@ -173,20 +230,20 @@ GCC handles single-precision `float` competently, but struggles with `_Float16`
|
|
|
173
230
|
| Euclidean Distance ² | 4,620 K/s | 147 K/s | 5,320 K/s | __36 x__ |
|
|
174
231
|
| Jensen-Shannon Divergence | 1,180 K/s | 18 K/s | 2,140 K/s | __118 x__ |
|
|
175
232
|
|
|
176
|
-
NumKong's `f16` kernels are faster than GCC's `f32` output — not because of unrolling, but because they use F16C conversion instructions, widening FMA pipelines, and compensated accumulation that
|
|
177
|
-
The same story repeats for `bf16`, `e4m3`, `i8`, and `i4`: these types require algorithmic transformations — lookup tables, algebraic domain shifts, asymmetric VNNI tricks — that live beyond the reach of auto-vectorization.
|
|
233
|
+
NumKong's `f16` kernels are faster than GCC's `f32` output — not because of unrolling, but because they use [F16C](https://en.wikipedia.org/wiki/F16C) conversion instructions, widening FMA pipelines, and compensated accumulation that compilers do not synthesize from a plain `for` loop.
|
|
234
|
+
The same story repeats for `bf16`, `e4m3`, `i8`, and `i4`: these types require algorithmic transformations — lookup tables, algebraic domain shifts, asymmetric [VNNI](https://en.wikipedia.org/wiki/AVX-512#VNNI) tricks — that live beyond the reach of auto-vectorization.
|
|
178
235
|
|
|
179
236
|
### Parallelism & Multi-Threading
|
|
180
237
|
|
|
181
238
|
BLAS libraries traditionally manage their own thread pools.
|
|
182
|
-
[OpenBLAS](https://github.com/OpenMathLib/OpenBLAS/blob/develop/USAGE.md) spawns threads controlled by `OPENBLAS_NUM_THREADS`, [Intel MKL](https://www.intel.com/content/www/us/en/docs/onemkl/developer-guide-linux/2025-1/techniques-to-set-the-number-of-threads.html) forks its own OpenMP runtime via `MKL_NUM_THREADS`, and [Apple Accelerate](https://developer.apple.com/documentation/accelerate/blas) delegates to GCD.
|
|
239
|
+
[OpenBLAS](https://github.com/OpenMathLib/OpenBLAS/blob/develop/USAGE.md) spawns threads controlled by `OPENBLAS_NUM_THREADS`, [Intel MKL](https://www.intel.com/content/www/us/en/docs/onemkl/developer-guide-linux/2025-1/techniques-to-set-the-number-of-threads.html) forks its own OpenMP runtime via `MKL_NUM_THREADS`, and [Apple Accelerate](https://developer.apple.com/documentation/accelerate/blas) delegates to [GCD](https://developer.apple.com/documentation/dispatch) (Grand Central Dispatch).
|
|
183
240
|
This works in isolation — but the moment your application adds its own parallelism (joblib, std::thread, Tokio, GCD, OpenMP), you get __thread oversubscription__: MKL spawns 8 threads inside each of your 8 joblib workers, producing 64 threads on 8 cores, thrashing caches and stalling on context switches.
|
|
184
241
|
The Python ecosystem has built [entire libraries](https://github.com/joblib/threadpoolctl) just to work around this problem, and [scikit-learn's documentation](https://scikit-learn.org/stable/computing/parallelism.html) devotes a full page to managing the interaction between joblib parallelism and BLAS thread pools.
|
|
185
242
|
|
|
186
243
|
NumKong takes a different position: __the numerics layer should not own threads__.
|
|
187
244
|
Modern hardware makes the "spawn N threads and split evenly" model increasingly untenable:
|
|
188
245
|
|
|
189
|
-
- __Server-grade CPUs__ have hundreds of cores split across sockets, chiplets, and tiles, resulting in dozens of physical NUMA domains with vastly different memory access latencies.
|
|
246
|
+
- __Server-grade CPUs__ have hundreds of cores split across sockets, chiplets, and tiles, resulting in dozens of physical [NUMA](https://en.wikipedia.org/wiki/Non-uniform_memory_access) domains with vastly different memory access latencies.
|
|
190
247
|
A thread pool that ignores NUMA topology will spend more time on remote memory stalls than on arithmetic.
|
|
191
248
|
- __Consumer-grade CPUs__ pack heterogeneous Quality-of-Service core types on the same die — Intel P-cores and E-cores run at different frequencies and sometimes support different ISA extensions.
|
|
192
249
|
A naive work-split gives equal chunks to fast and slow cores, and the whole task stalls waiting for the slowest partition.
|
|
@@ -196,7 +253,7 @@ Modern hardware makes the "spawn N threads and split evenly" model increasingly
|
|
|
196
253
|
Instead, NumKong exposes __row-range parameters__ that let the caller partition work across any threading model.
|
|
197
254
|
For GEMM-shaped `dots_packed`, this is straightforward — pass a slice of A's rows and the full packed B to compute the corresponding slice of C.
|
|
198
255
|
For SYRK-shaped `dots_symmetric`, explicit `start_row` / `end_row` parameters control which rows of the symmetric output matrix a given thread computes.
|
|
199
|
-
The GIL is released around every kernel call, making NumKong compatible with `concurrent.futures`, `multiprocessing`, or any other parallelism model:
|
|
256
|
+
The [GIL](https://docs.python.org/3/glossary.html#term-global-interpreter-lock) (Global Interpreter Lock) is released around every kernel call, making NumKong compatible with `concurrent.futures`, `multiprocessing`, or any other parallelism model:
|
|
200
257
|
|
|
201
258
|
```python
|
|
202
259
|
import concurrent.futures, numkong as nk, numpy as np
|
|
@@ -213,12 +270,12 @@ with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as pool:
|
|
|
213
270
|
list(pool.map(compute_slice, range(num_threads)))
|
|
214
271
|
```
|
|
215
272
|
|
|
216
|
-
For users who want a ready-made low-latency thread pool without the oversubscription baggage of OpenMP, we built [
|
|
273
|
+
For users who want a ready-made low-latency thread pool without the oversubscription baggage of OpenMP, we built [ForkUnion](https://github.com/ashvardanian/ForkUnion) — a minimalist fork-join library for C, C++, and Rust that avoids mutexes, CAS atomics, and dynamic allocations on the critical path, with optional NUMA pinning on Linux.
|
|
217
274
|
|
|
218
275
|
### Memory Allocation & Management
|
|
219
276
|
|
|
220
277
|
BLAS libraries typically allocate internal buffers during GEMM — [OpenBLAS](https://github.com/OpenMathLib/OpenBLAS) packs matrices into L2/L3-sized panels via per-thread buffer pools backed by `mmap` or `shmget`.
|
|
221
|
-
This hidden allocation has caused real problems: [14 lock/unlock pairs per small GEMM call](https://github.com/
|
|
278
|
+
This hidden allocation has caused real problems: [14 lock/unlock pairs per small GEMM call](https://github.com/OpenMathLib/OpenBLAS/issues/478) throttling 12-thread scaling to 2x, [silently incorrect results](https://github.com/OpenMathLib/OpenBLAS/issues/1844) from thread-unsafe allocation in `np.dot`, and [deadlocks after `fork()`](https://github.com/numpy/numpy/issues/30092) due to mutex state not being reset in child processes.
|
|
222
279
|
The [BLASFEO](https://github.com/giaf/blasfeo) library was created specifically for embedded model-predictive control where `malloc` during computation is unacceptable.
|
|
223
280
|
|
|
224
281
|
NumKong __never allocates memory__.
|
|
@@ -241,14 +298,14 @@ NumKong's `nk_dots_pack_*` family performs five transformations beyond simple re
|
|
|
241
298
|
import numkong as nk, numpy as np
|
|
242
299
|
|
|
243
300
|
right_matrix = np.random.randn(1000, 768).astype(np.float16)
|
|
244
|
-
right_packed = nk.dots_pack(right_matrix, dtype=
|
|
301
|
+
right_packed = nk.dots_pack(right_matrix, dtype=nk.float16) # pack once
|
|
245
302
|
for query_batch in stream: results = nk.dots_packed(query_batch, right_packed) # reuse many times
|
|
246
303
|
```
|
|
247
304
|
|
|
248
305
|
### Why Not Just GEMM? The Evolution of Matrix Multiplication APIs
|
|
249
306
|
|
|
250
307
|
The classic BLAS GEMM computes $C = \alpha A B + \beta C$ for Float32/Float64 matrices.
|
|
251
|
-
It
|
|
308
|
+
It covers many use cases, but LLM inference, vector search, and quantum simulation expose three ways in which the traditional interface falls short.
|
|
252
309
|
|
|
253
310
|
__Frozen weights justify separating packing from computation.__
|
|
254
311
|
During LLM inference, a very large share of GEMM calls use a static weight matrix — weights don't change after loading.
|
|
@@ -272,18 +329,18 @@ The standard BLAS interface was never designed for sub-byte types either — [no
|
|
|
272
329
|
__Some operations need more than GEMM + postprocessing.__
|
|
273
330
|
NumKong implements several GEMM-shaped operations where the "epilogue" is too complex for a simple addition:
|
|
274
331
|
|
|
275
|
-
- __Bilinear forms__ ($a^T C b$) in quantum computing compute a [scalar expectation value](https://phys.libretexts.org/Bookshelves/Quantum_Mechanics/Advanced_Quantum_Mechanics_(Kok)/10:_Pauli_Spin_Matrices/10.2:_Expectation_Values) — the naive approach materializes an $N$-dimensional intermediate vector $Cb$, but NumKong's typed `nk_bilinear_*` kernels stream through rows of $C$ with nested compensated dot products, never allocating beyond registers.
|
|
332
|
+
- __Bilinear forms__ ($a^T C b$) in quantum computing compute a [scalar expectation value](<https://phys.libretexts.org/Bookshelves/Quantum_Mechanics/Advanced_Quantum_Mechanics_(Kok)/10:_Pauli_Spin_Matrices/10.2:_Expectation_Values>) — the naive approach materializes an $N$-dimensional intermediate vector $Cb$, but NumKong's typed `nk_bilinear_*` kernels stream through rows of $C$ with nested compensated dot products, never allocating beyond registers.
|
|
276
333
|
For complex-valued quantum states, where the intermediate would be a 2N-element complex vector, the savings double.
|
|
277
334
|
- __MaxSim scoring__ for [ColBERT-style late-interaction retrieval](https://github.com/stanford-futuredata/ColBERT) computes $\sum_i \min_j \text{angular}(q_i, d_j)$ — a sum-of-min-distances across token pairs.
|
|
278
|
-
A GEMM would produce the full $M \times N$ similarity matrix, but NumKong's typed `nk_maxsim_packed_*` kernels fuse a coarse Int8-quantized screening with full-precision angular refinement on winning pairs only,
|
|
279
|
-
[PLAID](https://ar5iv.labs.arxiv.org/html/2205.09707) and [maxsim-cpu](https://www.mixedbread.com/blog/maxsim-cpu) have independently shown that dedicated MaxSim kernels outperform the GEMM decomposition by 5–10x.
|
|
335
|
+
A GEMM would produce the full $M \times N$ similarity matrix, but NumKong's typed `nk_maxsim_packed_*` kernels fuse a coarse Int8-quantized screening with full-precision angular refinement on winning pairs only, packing both query and document matrices to use all 4 SME tiles as accumulators.
|
|
336
|
+
[PLAID](https://ar5iv.labs.arxiv.org/html/2205.09707) and [maxsim-cpu](https://www.mixedbread.com/blog/maxsim-cpu) have independently shown that dedicated MaxSim kernels can outperform the GEMM decomposition by 5–10x.
|
|
280
337
|
|
|
281
338
|
NumKong treats these as first-class operations — `dots_packed`, `euclideans_packed`, `angulars_packed`, typed `nk_bilinear_*` kernels, and typed `nk_maxsim_packed_*` kernels — rather than decomposing everything into GEMM + postprocessing.
|
|
282
339
|
|
|
283
340
|
### Precision by Design: Saturation, Rounding, & Float6 Over Float8
|
|
284
341
|
|
|
285
|
-
Floating-point arithmetic on computers [is not associative](https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html): $(a + b) + c \neq a + (b + c)$ in general, and
|
|
286
|
-
NumKong makes
|
|
342
|
+
Floating-point arithmetic on computers [is not associative](https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html): $(a + b) + c \neq a + (b + c)$ in general, and upcasting to wider types is not always sufficient.
|
|
343
|
+
NumKong makes operation-specific decisions about where to spend precision and where to economize, rather than applying one rule uniformly.
|
|
287
344
|
|
|
288
345
|
__Saturation depends on the operation.__
|
|
289
346
|
A reduction over a 4 GB array of `i8` values contains ~4 billion elements — but [Int32 wrapping overflow](https://cedardb.com/blog/overflow_handling/) occurs after just ~17 million Int8 summands ($127 \times 16.9\text{M} > 2^{31}$).
|
|
@@ -294,7 +351,7 @@ x86 provides no saturating 32-bit SIMD add ([only byte/word variants](https://ww
|
|
|
294
351
|
__Square roots & special math ops are platform-specific.__
|
|
295
352
|
Angular distance requires $1/\sqrt{\|a\|^2 \cdot \|b\|^2}$ — but the cost of computing this normalization varies dramatically across hardware.
|
|
296
353
|
x86 `VSQRTPS` takes [~12 cycles](https://uops.info/html-lat/SKX/VSQRTPS_XMM_XMM-Measurements.html), followed by `VDIVPS` at ~11 cycles — totalling ~23 cycles for a precise `1/sqrt(x)`.
|
|
297
|
-
The `VRSQRT14PS` alternative starts with a [14-bit estimate in ~4 cycles](https://www.intel.com/content/www/us/en/developer/articles/code-sample/reference-implementations-for-ia-approximation-instructions-vrcp14-vrsqrt14-vrcp28-vrsqrt28-vexp2.html), then one Newton-Raphson iteration ($y = y \cdot (1.5 - 0.5 x y^2)$, ~4 more cycles) reaches full Float32 precision —
|
|
354
|
+
The `VRSQRT14PS` alternative starts with a [14-bit estimate in ~4 cycles](https://www.intel.com/content/www/us/en/developer/articles/code-sample/reference-implementations-for-ia-approximation-instructions-vrcp14-vrsqrt14-vrcp28-vrsqrt28-vexp2.html), then one Newton-Raphson iteration ($y = y \cdot (1.5 - 0.5 x y^2)$, ~4 more cycles) reaches full Float32 precision — roughly 3x faster.
|
|
298
355
|
ARM's `FRSQRTE` provides only [~8 bits](https://github.com/DLTcollab/sse2neon/issues/526), requiring __two__ Newton-Raphson iterations to match.
|
|
299
356
|
NumKong selects the iteration count per platform so the final ULP bound is consistent across ISAs, rather than exposing different precision to different users.
|
|
300
357
|
|
|
@@ -347,10 +404,10 @@ The first call to `nk_capabilities()` initializes the dispatch table; all subseq
|
|
|
347
404
|
|
|
348
405
|
### Float64 & Float32: IEEE Precision
|
|
349
406
|
|
|
350
|
-
__Float64__ — NumKong
|
|
351
|
-
On serial paths, we use
|
|
407
|
+
__Float64__ — NumKong uses __compensated summation__ that tracks numerical errors separately.
|
|
408
|
+
On serial paths, we use __[Neumaier's algorithm](https://en.wikipedia.org/wiki/Kahan_summation_algorithm#Further_enhancements)__ (1974), an improvement over Kahan-Babuška that correctly handles cases where added terms are larger than the running sum, achieving $O(1)$ error growth instead of $O(n)$.
|
|
352
409
|
On SIMD paths with FMA support, we implement the __Dot2 algorithm__ (Ogita-Rump-Oishi, 2005), maintaining separate error compensators for both multiplication and accumulation via `TwoProd` and `TwoSum` operations.
|
|
353
|
-
The accuracy
|
|
410
|
+
The accuracy differences are visible in the [benchmark tables above](#latency-throughput--numerical-stability) — compensated Float64 suits scientific computing where numerical stability matters more than raw speed.
|
|
354
411
|
|
|
355
412
|
__Float32__ — SIMD implementations load Float32 values, upcast to Float64 for full-precision multiplication and accumulation, then downcast only during finalization.
|
|
356
413
|
This avoids catastrophic cancellation at minimal cost since modern CPUs have dedicated Float64 vector units operating at nearly the same throughput as Float32.
|
|
@@ -368,7 +425,7 @@ e = (sum - t) + product; // Compensator term
|
|
|
368
425
|
|
|
369
426
|
### BFloat16 & Float16: Half Precision
|
|
370
427
|
|
|
371
|
-
__BFloat16__ — not an IEEE 754 standard type, but
|
|
428
|
+
__BFloat16__ — not an IEEE 754 standard type, but widely adopted for AI workloads.
|
|
372
429
|
BFloat16 shares Float32's 8-bit exponent but truncates the mantissa to 7 bits, prioritizing __dynamic range over precision__ (±3.4×10³⁸ with coarser granularity).
|
|
373
430
|
On old CPUs, upcasting BFloat16 to Float32 requires just an unpack and left-shift by 16 bits (essentially free); on newer CPUs, both Arm and x86 provide widening mixed-precision dot products via __DPBF16PS__ (AVX-512 on Genoa/Sapphire Rapids) and __BFDOT__ (NEON on ARMv8.6-A Graviton 3+).
|
|
374
431
|
NumKong's Float8 types (E4M3/E5M2) upcast to BFloat16 before using DPBF16PS, creating a three-tier precision hierarchy: Float8 for storage, BFloat16 for compute, Float32 for accumulation.
|
|
@@ -378,36 +435,43 @@ Float16 prioritizes __precision over range__ (10 vs 7 mantissa bits), making it
|
|
|
378
435
|
On x86, older CPUs use __F16C extensions__ (Ivy Bridge+) for fast Float16 → Float32 conversion; Sapphire Rapids+ adds native __AVX-512-FP16__ with dedicated Float16 arithmetic.
|
|
379
436
|
On Arm, ARMv8.4-A adds __FMLAL/FMLAL2__ instructions for fused Float16 → Float32 widening multiply-accumulate, reducing the total latency from 7 cycles to 4 cycles and achieving 20–48% speedup over the separate convert-then-FMA path.
|
|
380
437
|
|
|
381
|
-
| Platform | BFloat16 Path
|
|
382
|
-
| ---------------------- |
|
|
383
|
-
| __x86__ |
|
|
384
|
-
|
|
|
385
|
-
|
|
|
386
|
-
|
|
|
387
|
-
|
|
|
388
|
-
|
|
|
389
|
-
|
|
|
390
|
-
|
|
|
391
|
-
| Apple
|
|
392
|
-
|
|
|
393
|
-
| Graviton
|
|
438
|
+
| Platform | BFloat16 Path | Elem/Op | Float16 Path | Elem/Op |
|
|
439
|
+
| ---------------------- | -------------------------- | ------: | ---------------------- | ------: |
|
|
440
|
+
| __x86__ | | | | |
|
|
441
|
+
| Diamond Rapids (2025) | ↓ Genoa | 32 | `VDPPHPS` widening dot | 32 |
|
|
442
|
+
| Sapphire Rapids (2023) | ↓ Genoa | 32 | ↓ Skylake | 16 |
|
|
443
|
+
| Genoa (2022) | `VDPBF16PS` widening dot | 32 | ↓ Skylake | 16 |
|
|
444
|
+
| Skylake (2015) | `SLLI` + `VFMADD` | 16 | `VCVTPH2PS` + `VFMADD` | 16 |
|
|
445
|
+
| Haswell (2013) | `SLLI` + `VFMADD` | 8 | `VCVTPH2PS` + `VFMADD` | 8 |
|
|
446
|
+
| __Arm__ | | | | |
|
|
447
|
+
| Graviton 3 (2021) | `SVBFDOT` widening dot | 4–32 | `SVCVT` → `SVFMLA` | 4–32 |
|
|
448
|
+
| Apple M2+ (2022) | `BFDOT` widening dot | 8 | ↓ FP16FML | 8 |
|
|
449
|
+
| Apple M1 (2020) | ↓ NEON | 8 | `FMLAL` widening FMA | 8 |
|
|
450
|
+
| Graviton 2 (2019) | ↓ NEON | 8 | `FCVTL` + `FMLA` | 4 |
|
|
451
|
+
| Graviton 1 (2018) | `SHLL` + `FMLA` | 8 | bit-manip → `FMLA` | 8 |
|
|
452
|
+
| __RISC-V__ | | | | |
|
|
453
|
+
| RVV + Zvfbfwma | `VFWMACCBF16` widening FMA | 4–32 | ↓ RVV | 4–32 |
|
|
454
|
+
| RVV + Zvfh | ↓ RVV | 4–32 | `VFWMACC` widening FMA | 4–32 |
|
|
455
|
+
| RVV | shift + `VFMACC` | 4–32 | convert + `VFMACC` | 4–32 |
|
|
394
456
|
|
|
395
457
|
> BFloat16 shares Float32's 8-bit exponent, so upcasting is a 16-bit left shift (`SLLI` on x86, `SHLL` on Arm) that zero-pads the truncated mantissa — essentially free.
|
|
396
458
|
> Float16 has a different exponent width (5 vs 8 bits), requiring a dedicated convert: `VCVTPH2PS` (x86 F16C) or `FCVTL` (Arm NEON).
|
|
397
459
|
> Widening dot products (`VDPBF16PS`, `BFDOT`, `FMLAL`) fuse the conversion and multiply-accumulate into one instruction.
|
|
398
460
|
> Sapphire Rapids has native `VFMADDPH` for Float16 arithmetic, but NumKong does not use it for general dot products — Float16 accumulation loses precision.
|
|
399
461
|
> It is only used for mini-float (E2M3/E3M2) paths where periodic flush-to-Float32 windows keep error bounded.
|
|
462
|
+
> The table above covers only vector dot-product paths — GEMMs also leverage Arm SME and Intel AMX instructions.
|
|
463
|
+
> Beyond x86, Arm, and RISC-V, NumKong also ships LoongArch, WebAssembly, and PowerPC backends, which are likewise excluded from the table.
|
|
400
464
|
|
|
401
465
|
### Mini-Floats: E4M3, E5M2, E3M2, & E2M3
|
|
402
466
|
|
|
403
|
-
| Format | Bits | Range | NumKong Promotion
|
|
404
|
-
| ------------------------- | ----: | -----: |
|
|
405
|
-
| E5M2FN | 8 | ±57344 | BFloat16 → Float32
|
|
406
|
-
| E4M3FN | 8 | ±448 | BFloat16 → Float32
|
|
407
|
-
| E3M2FN | 6 → 8 | ±28 | BFloat16 & Float16 → Float32
|
|
408
|
-
| E2M3FN | 6 → 8 | ±7.5 | BFloat16 & Float16 → Float32
|
|
409
|
-
| Block-scaled NVFP4 | 4 | ±6 | —
|
|
410
|
-
| Block-scaled MXFP4 / E2M1 | 4 | ±6 | —
|
|
467
|
+
| Format | Bits | Range | NumKong Promotion Rules | Support in GPUs |
|
|
468
|
+
| ------------------------- | ----: | -----: | ----------------------------------------------- | ----------------- |
|
|
469
|
+
| E5M2FN | 8 | ±57344 | BFloat16 → Float32 | H100+, MI300+ |
|
|
470
|
+
| E4M3FN | 8 | ±448 | BFloat16 → Float32 | H100+, MI300+ |
|
|
471
|
+
| E3M2FN | 6 → 8 | ±28 | BFloat16 & Float16 → Float32,<br/>Int16 → Int32 | only block-scaled |
|
|
472
|
+
| E2M3FN | 6 → 8 | ±7.5 | BFloat16 & Float16 → Float32,<br/>Int8 → Int32 | only block-scaled |
|
|
473
|
+
| Block-scaled NVFP4 | 4 | ±6 | — | B200+ |
|
|
474
|
+
| Block-scaled MXFP4 / E2M1 | 4 | ±6 | — | B200+, MI325+ |
|
|
411
475
|
|
|
412
476
|
> __Block scaling.__
|
|
413
477
|
> NumKong does not implement block-scaled variants (MXFP4, NVFP4, or block-scaled E3M2/E2M3).
|
|
@@ -424,33 +488,72 @@ E4M3FN (no infinities, NaN only) is preferred for __training__ where precision n
|
|
|
424
488
|
On x86 Genoa/Sapphire Rapids, E4M3/E5M2 values upcast to BFloat16 via lookup tables, then use native __DPBF16PS__ for 2-per-lane dot products accumulating to Float32.
|
|
425
489
|
On Arm Graviton 3+, the same BFloat16 upcast happens via NEON table lookups, then __BFDOT__ instructions complete the computation.
|
|
426
490
|
|
|
491
|
+
| Platform | E5M2 Path | Elem/Op | E4M3 Path | Elem/Op |
|
|
492
|
+
| -------------------------- | ------------------------------ | ------: | ------------------------------ | ------: |
|
|
493
|
+
| __x86__ | | | | |
|
|
494
|
+
| Diamond Rapids (2025) | `VCVTBF82PH` → F16 + `VDPPHPS` | 32 | `VCVTHF82PH` → F16 + `VDPPHPS` | 32 |
|
|
495
|
+
| Genoa (2022) | → BF16 + `VDPBF16PS` | 32 | ↓ Ice Lake | 64 |
|
|
496
|
+
| Ice Lake (2019) | ↓ Skylake | 16 | octave LUT + `VPDPBUSD` | 64 |
|
|
497
|
+
| Skylake (2015) | rebias → F32 FMA | 16 | rebias → F32 FMA | 16 |
|
|
498
|
+
| Haswell (2013) | rebias → F32 FMA | 8 | rebias → F32 FMA | 8 |
|
|
499
|
+
| __Arm__ | | | | |
|
|
500
|
+
| NEON + FP8DOT (Olympus) | native `FDOT` | 16 | native `FDOT` | 16 |
|
|
501
|
+
| NEON + FP16FML (Apple M1+) | SHL → F16 + `FMLAL` | 16 | LUT → F16 + `FMLAL` | 16 |
|
|
502
|
+
| NEON (Graviton 1+) | SHL + `FCVTL` + FMA | 8 | → F16 + `FCVTL` + FMA | 8 |
|
|
503
|
+
| __RISC-V__ | | | | |
|
|
504
|
+
| RVV + Zvfbfwma | rebias → BF16 + `VFWMACCBF16` | 4–32 | LUT → BF16 + `VFWMACCBF16` | 4–32 |
|
|
505
|
+
| RVV + Zvfh | SHL → F16 + `VFWMACC` | 4–32 | LUT → F16 + `VFWMACC` | 4–32 |
|
|
506
|
+
| RVV | rebias → F32 + `VFMACC` | 4–32 | LUT → F32 + `VFMACC` | 4–32 |
|
|
507
|
+
|
|
508
|
+
> E5M2 shares Float16's exponent bias (15), so E5M2 → Float16 conversion is a single left-shift by 8 bits (`SHL 8`).
|
|
509
|
+
> E4M3 on Ice Lake uses "octave decomposition": the 4-bit exponent splits into 2 octave + 2 remainder bits, yielding 7 integer accumulators post-scaled by powers of 2.
|
|
510
|
+
|
|
427
511
|
__6-bit floats (E3M2 & E2M3)__ follow the [OCP MX v1.0 standard](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf).
|
|
428
512
|
Their smaller range allows scaling to exact integers that fit in `i8`/`i16`, enabling integer `VPDPBUSD`/`SDOT` accumulation instead of the floating-point pipeline.
|
|
429
513
|
Float16 can also serve as an accumulator, accurately representing ~50 products of E3M2FN pairs or ~20 products of E2M3FN pairs before overflow.
|
|
430
514
|
On Arm, NEON FHM extensions bring widening `FMLAL` dot-products for Float16 — both faster and more widely available than `BFDOT` for BFloat16.
|
|
431
515
|
|
|
516
|
+
| Platform | E3M2 Path | Elem/Op | E2M3 Path | Elem/Op |
|
|
517
|
+
| ---------------------------- | -------------------------- | ------: | ---------------------------- | ------: |
|
|
518
|
+
| __x86__ | | | | |
|
|
519
|
+
| Ice Lake (2019) | `VPERMW` LUT + `VPMADDWD` | 32 | `VPERMB` LUT + `VPDPBUSD` | 64 |
|
|
520
|
+
| Sierra Forest (2024) | ↓ Haswell | 32 | `VPSHUFB` LUT + `VPDPBSSD` | 32 |
|
|
521
|
+
| Alder Lake (2021) | ↓ Haswell | 32 | `VPSHUFB` LUT + `VPDPBUSD` | 32 |
|
|
522
|
+
| Skylake (2015) | `VPSHUFB` LUT + `VPMADDWD` | 64 | `VPSHUFB` LUT + `VPMADDUBSW` | 64 |
|
|
523
|
+
| Haswell (2013) | `VPSHUFB` LUT + `VPMADDWD` | 32 | `VPSHUFB` LUT + `VPMADDUBSW` | 32 |
|
|
524
|
+
| __Arm__ | | | | |
|
|
525
|
+
| NEON + FP8DOT (Olympus) | → E5M2 + `FDOT` | 16 | → E4M3 + `FDOT` | 16 |
|
|
526
|
+
| NEON + DotProd (Graviton 2+) | `VQTBL2` LUT + `SMLAL` | 16 | `VQTBL2` LUT + `SDOT` | 16 |
|
|
527
|
+
| NEON (Graviton 1+) | → F16 + `FCVTL` + FMA | 16 | → F16 + `FCVTL` + FMA | 16 |
|
|
528
|
+
| __RISC-V__ | | | | |
|
|
529
|
+
| RVV | I16 gather LUT + `VWMACC` | 4–32 | U8 gather LUT + `VWMACC` | 4–32 |
|
|
530
|
+
|
|
531
|
+
> E3M2/E2M3 values map to exact integers via 32-entry LUTs (magnitudes up to 448 for E3M2, 120 for E2M3), enabling integer accumulation with no rounding error.
|
|
532
|
+
> On NEON + FP8DOT, E3M2 is first promoted to E5M2 and E2M3 to E4M3 before the hardware `FDOT` instruction.
|
|
533
|
+
> Sierra Forest and Alder Lake use native `VPDPBSSD` (signed×signed) and `VPDPBUSD` (unsigned×signed) respectively for E2M3.
|
|
534
|
+
|
|
432
535
|
E4M3 and E5M2 cannot use the integer path.
|
|
433
536
|
E4M3 scaled by 16 reaches 7,680 — too large for Int8, barely fitting Int16 with a 128-entry table.
|
|
434
537
|
E5M2's range (±57,344) makes the scaled product exceed Int32 entirely.
|
|
435
538
|
Without the integer path, E5M2 falls back to Float32 accumulation — where its [2-bit mantissa (only 4 values per binade)](https://developer.nvidia.com/blog/floating-point-8-an-introduction-to-efficient-lower-precision-ai-training/) creates a [catastrophic cancellation risk](https://www.ac.uma.es/arith2024/papers/Fused%20FP8%204-Way%20Dot%20Product%20with%20Scaling%20and%20FP32%20Accumulation.pdf) that E2M3's integer path avoids completely:
|
|
436
539
|
|
|
437
|
-
| | _i_ = 0 | _i_ = 1 | _i_ = 2 | _i_ = 3 | _i_ = 4 | _i_ = 5 | _i_ = 6 |
|
|
438
|
-
| ------- | -------: | ------: | -------: | --------: | -------: | -------: | -------: |
|
|
439
|
-
| _aᵢ_ | 0.00122 | 20480 | −0.00122 | 1.5 |
|
|
440
|
-
| _bᵢ_ | −40 | 320 | −1280 | −7.63e⁻⁵ |
|
|
441
|
-
| _aᵢ·bᵢ_ | −0.04883 | 6553600 | 1.5625 | −0.000114 |
|
|
540
|
+
| | _i_ = 0 | _i_ = 1 | _i_ = 2 | _i_ = 3 | _i_ = 4 | _i_ = 5 | _i_ = 6 |
|
|
541
|
+
| ------- | -------: | ------: | -------: | --------: | -------: | -------: | -------: |
|
|
542
|
+
| _aᵢ_ | 0.00122 | 20480 | −0.00122 | 1.5 | −3072 | −640 | 0.00146 |
|
|
543
|
+
| _bᵢ_ | −40 | 320 | −1280 | −7.63e⁻⁵ | 0.000427 | 10240 | −4.58e⁻⁵ |
|
|
544
|
+
| _aᵢ·bᵢ_ | −0.04883 | 6553600 | 1.5625 | −0.000114 | −1.3125 | −6553600 | ≈ 0 |
|
|
442
545
|
|
|
443
546
|
> __Why Float32 accumulation fails here.__
|
|
444
|
-
> The accurate sum of these
|
|
445
|
-
>
|
|
446
|
-
> At that magnitude the Float32 ULP is 0.5 — so the small meaningful terms (−0.049, 1.563, −1.313, −0.0001) are all below one ULP and get absorbed during
|
|
547
|
+
> The accurate sum of these 7 products is ≈ 0.201.
|
|
548
|
+
> A `vfmaq_f32` call accumulates 4 lanes at a time; the first batch already carries values around ±6.5 M.
|
|
549
|
+
> At that magnitude the Float32 ULP is 0.5 — so the small meaningful terms (−0.049, 1.563, −1.313, −0.0001) are all below one ULP and get absorbed during lane reduction.
|
|
447
550
|
> The large terms then cancel exactly to zero, and the information is gone.
|
|
448
551
|
> Final Float32 result: __0.0__ instead of __0.201__.
|
|
449
552
|
|
|
450
553
|
### Int8 & Int4: Integer Types
|
|
451
554
|
|
|
452
555
|
Both signed and unsigned 8-bit and 4-bit integers are supported with __Int32 accumulation__ to prevent overflow.
|
|
453
|
-
|
|
556
|
+
A notable optimization is the __VNNI algebraic transform__: on Ice Lake+ with AVX-512 VNNI, the native __DPBUSD__ instruction is asymmetric (unsigned × signed → signed), but NumKong uses it for both Int8×Int8 and UInt8×UInt8.
|
|
454
557
|
For __signed Int8×Int8__, we convert the signed operand to unsigned via XOR with `0x80`, compute `DPBUSD(a⊕0x80, b) = (a+128)×b`, then subtract a correction term `128×sum(b)` to recover the true result.
|
|
455
558
|
For __unsigned UInt8×UInt8__, we XOR the second operand to make it signed, compute `DPBUSD(a, b⊕0x80) = a×(b-128)`, then add correction `128×sum(a)` via the fast SAD instruction.
|
|
456
559
|
|
|
@@ -480,7 +583,7 @@ Complex types are essential in quantum simulation (state vectors, density matric
|
|
|
480
583
|
The `dot` operation computes the unconjugated dot product $\sum a_k b_k$, while `vdot` computes the conjugated inner product $\sum \bar{a}_k b_k$ standard in physics and signal processing.
|
|
481
584
|
|
|
482
585
|
For complex dot products, NumKong defers sign flips until after the accumulation loop: instead of using separate FMA and FMS (fused multiply-subtract) instructions for the real component, we compute $a_r b_r + a_i b_i$ treating all products as positive, then apply a single bitwise XOR with `0x80000000` to flip the sign bits.
|
|
483
|
-
This
|
|
586
|
+
This avoids execution port contention between FMA and FMS, letting dual FMA units stay occupied.
|
|
484
587
|
|
|
485
588
|
```c
|
|
486
589
|
for (...) { // Complex multiply optimization: XOR sign flip after the loop
|
|
@@ -490,6 +593,20 @@ for (...) { // Complex multiply optimization: XOR sign flip after the loop
|
|
|
490
593
|
sum_real = xor(sum_real, 0x80000000); // Single XOR after loop
|
|
491
594
|
```
|
|
492
595
|
|
|
596
|
+
## Reading Materials
|
|
597
|
+
|
|
598
|
+
Beyond the READMEs in this repository, there are several standalone articles covering different evolution steps and features of this library.
|
|
599
|
+
|
|
600
|
+
- [NumKong: 2'000 Mixed Precision Kernels For All](https://ashvardanian.com/posts/numkong/)
|
|
601
|
+
- [Hiding x86 Port Latency for 330 GB/s/core Reductions](https://ashvardanian.com/posts/cpu-ports/)
|
|
602
|
+
- [Understanding SIMD: Infinite Complexity of Trivial Problems](https://ashvardanian.com/posts/understanding-simd-complexity/)
|
|
603
|
+
- [NumPy vs BLAS: Losing 90% of Throughput](https://ashvardanian.com/posts/numpy-vs-blas-costs/)
|
|
604
|
+
- [5x Faster Set Intersections: SVE2, AVX-512, & NEON](https://ashvardanian.com/posts/simd-set-intersections-sve2-avx512/)
|
|
605
|
+
- [Python, C, Assembly - 2'500x Faster Cosine Similarity](https://ashvardanian.com/posts/python-c-assembly-comparison/)
|
|
606
|
+
- [GCC Compiler vs Human - 119x Faster Assembly](https://ashvardanian.com/posts/gcc-12-vs-avx512fp16/)
|
|
607
|
+
- [Accelerating JavaScript arrays by 10x for Vector Search](https://ashvardanian.com/posts/javascript-ai-vector-search/)
|
|
608
|
+
- [SciPy distances... up to 200x faster with AVX-512 & SVE](https://ashvardanian.com/posts/simsimd-faster-scipy/)
|
|
609
|
+
|
|
493
610
|
## License
|
|
494
611
|
|
|
495
612
|
Feel free to use the project under Apache 2.0 or the Three-clause BSD license at your preference.
|