numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315) hide show
  1. package/README.md +197 -124
  2. package/binding.gyp +34 -484
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
package/README.md CHANGED
@@ -1,20 +1,19 @@
1
1
  # NumKong: Mixed Precision for All
2
2
 
3
- NumKong (previously SimSIMD) delivers mixed-precision numerics that are often faster _and_ more accurate than standard BLAS libraries in a 5 MB binary, across C, C++, Rust, Python, Go, JavaScript, and Swift.
4
- Over 1500 hand-tuned SIMD kernels for x86, Arm, RISC-V, and WASM power [Unum](https://www.unum.cloud/)'s open-source [USearch](https://github.com/unum-cloud/usearch) search engine and the DBMS & AI products built on it.
3
+ Portable mixed-precision math, linear-algebra, & retrieval library with 2'000+ SIMD kernels for x86, Arm, RISC-V, LoongArch, Power, & WebAssembly, leveraging rare algebraic transforms with both 1D & 2D registers like AMX & SME, covering 15+ numeric types from 4-bit integers & 6-bit floats to 128-bit complex numbers, validated against 118-bit extended-precision baselines with saturation, casting, & rounding edge-case coverage, in a 5-100x smaller binary than other BLAS-like alternatives, co-designed with Tensor abstractions in C++, Python, Rust, JavaScript, GoLang, & Swift.
5
4
 
6
5
  ![NumKong banner](https://github.com/ashvardanian/ashvardanian/blob/master/repositories/NumKong-v7.png?raw=true)
7
6
 
8
- ## Latency, Throughput, & Numerical Stability Together in a Tiny Package
7
+ ## Latency, Throughput, & Numerical Stability
9
8
 
10
9
  Most libraries return dot products in the __same type as the input__ — Float16 × Float16 → Float16, Int8 × Int8 → Int8.
11
- That's a recipe for silent data corruption: a 2048-dimensional `i8` dot product can reach ±10 million, but `i8` maxes out at 127.
12
- NumKong promotes to wider accumulators — Float16 → Float32, BFloat16 → Float32, Int8 → Int32, Float32 → Float64 — so results never overflow, and it's still faster.
10
+ This leads to quiet overflow: a 2048-dimensional `i8` dot product can reach ±10 million, but `i8` maxes out at 127.
11
+ NumKong promotes to wider accumulators — Float16 → Float32, BFloat16 → Float32, Int8 → Int32, Float32 → Float64 — so results stay in range.
13
12
 
14
- > Single 2048-d dot product on Intel Sapphire Rapids (Xeon 8468), single-threaded, CPU-only packages.
13
+ > Single 2048-d dot product on Intel [Sapphire Rapids](https://en.wikipedia.org/wiki/Sapphire_Rapids), single-threaded.
15
14
  > Each cell shows __gso/s, mean relative error__ vs a higher-precision reference.
16
15
  > gso/s = Giga Scalar Operations per Second — a more suitable name than GFLOP/s when counting both integer and floating-point work.
17
- > Median of 5 runs × 500 K calls each. NumPy 2.4, PyTorch 2.10, JAX 0.9.
16
+ > NumPy 2.4, PyTorch 2.10, JAX 0.9.
18
17
 
19
18
  | Input | NumPy + OpenBLAS | PyTorch + MKL | JAX | NumKong |
20
19
  | :----- | ----------------------: | ----------------------: | ----------------------: | --------------------: |
@@ -27,12 +26,12 @@ NumKong promotes to wider accumulators — Float16 → Float32, BFloat16 → Flo
27
26
  | `i8` | 1.1 gso/s, __overflow__ | 0.5 gso/s, __overflow__ | 0.5 gso/s, __overflow__ | 14.8 gso/s, 0% err |
28
27
 
29
28
  A fair objection: PyTorch and JAX are designed for throughput, not single-call latency.
30
- They lower execution graphs through XLA or vendored BLAS libraries like Intel MKL and Nvidia cuBLAS.
29
+ They lower execution graphs through [XLA](https://openxla.org/) or vendored BLAS libraries like [Intel MKL](https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl.html) and Nvidia [cuBLAS](https://developer.nvidia.com/cublas).
31
30
  So here's the same comparison on a throughput-oriented workload — matrix multiplication:
32
31
 
33
- > Matrix multiplication (2048 × 2048) × (2048 × 2048), single-threaded, same machine.
34
- > JAX/XLA numbers divided by 16 cores (XLA ignores thread restrictions).
35
- > NumKong uses `dots_packed` (pre-packed GEMM). Same format: __gso/s, mean relative error__.
32
+ > Matrix multiplication (2048 × 2048) × (2048 × 2048) on Intel Sapphire Rapids, single-threaded.
33
+ > gso/s = Giga Scalar Operations per Second, same format.
34
+ > NumPy 2.4, PyTorch 2.10, JAX 0.9, same versions.
36
35
 
37
36
  | Input | NumPy + OpenBLAS | PyTorch + MKL | JAX | NumKong |
38
37
  | :----- | ----------------------: | -----------------------: | -----------------------: | -------------------: |
@@ -44,73 +43,87 @@ So here's the same comparison on a throughput-oriented workload — matrix multi
44
43
  | `e5m2` | — | 0.4 gso/s, 4.6% err | ~26.4 gso/s, 4.6% err | 398 gso/s, 0% err |
45
44
  | `i8` | 0.4 gso/s, __overflow__ | 50.0 gso/s, __overflow__ | ~0.0 gso/s, __overflow__ | 1279 gso/s, 0% err |
46
45
 
47
- For `f64`, NumKong's compensated "Dot2" summation is __10–50× more accurate__ than naive Float64 accumulation, depending on vector length.
48
- For `f32`, widening to Float64 gives __5–10× lower error__.
49
- For smaller types and especially integers, the gap is even more dramatic.
50
- And all of that fits into one of the smallest binaries in the industry:
51
-
52
- | Package | Size | Parallelism & Memory | Available For |
53
- | :--------------------- | -----: | :-------------------------------------------------- | :---------------------------------- |
54
- | PyTorch + MKL + oneDNN | 705 MB | Vector & Tile SIMD, OpenMP Threads, Internal Allocs | Python, C++, Java |
55
- | JAX + jaxlib | 357 MB | Vector SIMD, XLA Threads, Internal Allocs | Python |
56
- | NumPy + OpenBLAS | 30 MB | Vector SIMD, Built-in Threads, Internal Allocs | Python |
57
- | mathjs | 9 MB | No SIMD, No Threads, Countless Allocs | JS |
58
- | NumKong | 5 MB | Vector & Tile SIMD, Your Threads, Your Allocs | C, C++, Rust, Python, Go, JS, Swift |
59
-
60
- But kernels and precision are only part of the story — the larger investment is test coverage: every kernel is validated against 118-bit extended-precision baselines with per-type ULP budgets across log-normal, uniform, and Cauchy input distributions, enforcing triangle inequality, Cauchy-Schwarz bounds, NaN propagation, overflow detection, and probability-simplex constraints for every ISA variant in the table above, cross-validated against OpenBLAS, Intel MKL, and Apple Accelerate to catch regressions that no single reference can.
46
+ For `f64`, compensated "Dot2" summation reduces error by 10–50× compared to naive Float64 accumulation, depending on vector length.
47
+ For `f32`, widening to Float64 gives 5–10× lower error.
48
+ The library ships as a relatively small binary:
49
+
50
+ | Package | Size | Parallelism & Memory | Available For |
51
+ | :--------------- | -----: | :------------------------------------------------ | :---------------- |
52
+ | PyTorch + MKL | 705 MB | Vector & Tile SIMD, OpenMP Threads, Hidden Allocs | Python, C++, Java |
53
+ | JAX + jaxlib | 357 MB | Vector SIMD, XLA Threads, Hidden Allocs | Python |
54
+ | NumPy + OpenBLAS | 30 MB | Vector SIMD, Built-in Threads, Hidden Allocs | Python |
55
+ | mathjs | 9 MB | No SIMD, No Threads, Many Allocs | JS |
56
+ | NumKong | 5 MB | Vector & Tile SIMD, Your Threads, Your Allocs | 7 languages |
57
+
58
+ Every kernel is validated against 118-bit extended-precision baselines with per-type ULP budgets across log-normal, uniform, and Cauchy input distributions.
59
+ Tests check triangle inequality, Cauchy-Schwarz bounds, NaN propagation, overflow detection, and probability-simplex constraints for each ISA variant.
60
+ Results are cross-validated against OpenBLAS, Intel MKL, and Apple Accelerate.
61
61
  A broader throughput comparison is maintained in [NumWars](https://github.com/ashvardanian/NumWars).
62
62
 
63
63
  ## Quick Start
64
64
 
65
- | Language | Install | Compatible with | Guide |
66
- | :--------- | :------------------------------- | :------------------------------- | :------------------------------------------- |
67
- | C / C++ | CMake, headers, or prebuilt | Linux, macOS, Windows, Android | [include/README.md](include/README.md) |
68
- | Python | `pip install` | Linux, macOS, Windows | [python/README.md](python/README.md) |
69
- | Rust | `cargo add` | Linux, macOS, Windows | [rust/README.md](rust/README.md) |
70
- | JavaScript | `npm install` or `import` remote | Node.js, Bun, Deno & any browser | [javascript/README.md](javascript/README.md) |
71
- | Swift | Swift Package Manager | macOS, iOS, tvOS, watchOS | [swift/README.md](swift/README.md) |
72
- | Go | `go get` | Linux, macOS, Windows via cGo | [golang/README.md](golang/README.md) |
65
+ | Language | Install | Compatible with | Guide |
66
+ | :------- | :------------------------- | :----------------------------- | :------------------------------------------- |
67
+ | C / C++ | CMake, headers, & prebuilt | Linux, macOS, Windows, Android | [include/README.md](include/README.md) |
68
+ | Python | `pip install` | Linux, macOS, Windows | [python/README.md](python/README.md) |
69
+ | Rust | `cargo add` | Linux, macOS, Windows | [rust/README.md](rust/README.md) |
70
+ | JS | `npm install` & `import` | Node.js, Bun, Deno & browsers | [javascript/README.md](javascript/README.md) |
71
+ | Swift | Swift Package Manager | macOS, iOS, tvOS, watchOS | [swift/README.md](swift/README.md) |
72
+ | Go | `go get` | Linux, macOS, Windows via cGo | [golang/README.md](golang/README.md) |
73
73
 
74
74
  ## What's Inside
75
75
 
76
- NumKong spans 16 numeric types — from exotic GPU-only 6-bit floats to 64-bit complex numbers — across dozens of operations and 30+ SIMD backends, with hardware-aware defaults: Arm prioritizes `f16`, x86 prioritizes `bf16`.
77
-
78
- <div align="center">
79
- <pre><code>
80
- ┌──────────────────────────────┬────────────────┬───────────────────────────┬────────────┐
81
- │ Operations │ Datatypes │ Backends │ Ecosystems
82
- ├──────────────────────────────┼────────────────┼───────────────────────────┼────────────┤
83
- Vector-Vector │ <a href="#numeric-types">Bits &amp; Ints</a> │ <a href="#compile-time-and-run-time-dispatch">x86</a> │ Core
84
- <a href="include/README.md#dot-products">dot</a> · <a href="include/README.md#dense-distances">angular</a> · <a href="include/README.md#dense-distances">euclidean</a> │ u1 · u4 · u8 │ Haswell · Alder Lake │ <a href="include/README.md#the-c-abi">C 99</a>
85
- hamming · kld · jsd · … │ i4 · i8 │ Sierra Forest · Skylake │ │
86
- │ │ │ Ice Lake · Genoa · Turin │ Primary │
87
- <a href="include/README.md#packed-matrix-kernels-for-gemm-like-workloads">Matrix-Matrix</a> │ <a href="#mini-floats-e4m3-e5m2-e3m2--e2m3">Mini-floats</a> │ Sapphire Rapids · │ <a href="include/README.md#the-c-layer">C++ 23</a> │
88
- <a href="include/README.md#packed-matrix-kernels-for-gemm-like-workloads">dots_packed</a> · <a href="include/README.md#symmetric-kernels-for-syrk-like-workloads">dots_symmetric</a> │ e2m3 · e3m2 │ Granite Rapids │ <a href="python/README.md">Python 3</a> │
89
- <a href="include/README.md#packed-matrix-kernels-for-gemm-like-workloads">euclideans_packed</a> · │ e4m3 · e5m2 │ │ <a href="rust/README.md">Rust</a>
90
- │ │ │ <a href="#compile-time-and-run-time-dispatch">Arm</a> │ │
91
- Quadratic │ <a href="#float16--bfloat16-half-precision">Half &amp; Classic</a> │ NEON · NEONHalf · NEONFhm │ Additional │
92
- <a href="include/README.md#curved-metrics">bilinear</a> · mahalanobis │ f16 · bf16 │ NEONBFDot · NEONSDot │ <a href="swift/README.md">Swift</a> · <a href="javascript/README.md">JS</a> │
93
- │ │ f32 · f64 │ SVE · SVEHalf · SVEBfDot │ <a href="golang/README.md">Go</a>
94
- <a href="include/README.md#geospatial-metrics">Geospatial</a> &amp; <a href="include/README.md#geometric-mesh-alignment">Geometric</a> │ │ SVESDot · SVE2 │ │
95
- haversine · vincenty │ <a href="#complex-types">Complex</a> │ SME · SMEF64 · SMEBI32 │ <a href="CONTRIBUTING.md">Tools</a>
96
- rmsd · kabsch · umeyama · … │ f16c · bf16c │ │ <a href="test/README.md">Tests</a> │
97
- │ │ f32c · f64c │ <a href="#compile-time-and-run-time-dispatch">RISC-V</a> │ <a href="bench/README.md">Benchmarks</a> │
98
- Bespoke │ │ RVV · RVVHalf │ <a href="https://github.com/ashvardanian/NumWars">NumWars</a> │
99
- <a href="include/numkong/each/README.md">fma</a> · blend · <a href="include/numkong/trigonometry/README.md">sin</a> · <a href="include/numkong/cast/README.md">cast</a> │ │ RVVBf16 · RVVBB │ │
100
- <a href="include/numkong/reduce/README.md">reduce_moments</a> · <a href="include/numkong/sparse/README.md">sparse_dot</a> │ │ │ │
101
- │ <a href="include/README.md#maxsim-and-late-interaction">maxsim</a> · intersect · … │ │ <a href="CONTRIBUTING.md#cross-compilation">WASM</a> │ │
102
- │ │ │ V128Relaxed │ │
103
- └──────────────────────────────┴────────────────┴───────────────────────────┴────────────┘
104
- </code></pre>
105
- </div>
106
-
107
- Not every combination is implemented — only the ones that unlock interesting new opportunities.
108
- The `icelake` level doesn't get a `dot_bf16` variant, for example, and falls through to `dot_bf16_skylake`.
109
- Every operation has a `serial` fallback, but even types no CPU supports today get optimized via lookup tables and bit-twiddling hacks rather than scalar loops.
76
+ NumKong covers 17 numeric types — from 6-bit floats to 64-bit complex numbers — across dozens of operations and 30+ SIMD backends, with hardware-aware defaults: Arm prioritizes `f16`, x86 prioritizes `bf16`.
77
+
78
+ ### Language Bindings
79
+
80
+ | Operation | [C and C++][c] | [Python][py] | [Rust][rs] | [JavaScript][js] | [Swift][swift] | [GoLang][go] |
81
+ | :-------------------------- | :------------: | :----------: | :--------: | :--------------: | :------------: | :----------: |
82
+ | __Vector Ops__ | | | | | | |
83
+ | [Dot] Product | ● | ● | ● | ● | ● | ● |
84
+ | [Spatial] Metric | ● | | ● | ● | ● | ● |
85
+ | [Set] Similarity | ● | ● | ● | ● | ● | ● |
86
+ | [Geo]spatial | ● | ● | ● | · | ● | ● |
87
+ | [Mesh] Alignment | ● | ● | ● | · | · | · |
88
+ | [Sparse] Products | ● | ● | ● | · | · | · |
89
+ | [Probability] Divergences | ● | ● | ● | ● | · | ● |
90
+ | [Curved] Spaces | ● | ● | ● | · | · | · |
91
+ | __Many-to-Many Vector Ops__ | | | | | | |
92
+ | "[Dots]" Products | ● | ● | | ● | ● | ● |
93
+ | "[Spatials]" Metrics | ● | ● | ● | ● | ● | ● |
94
+ | "[Sets]" Similarities | ● | ● | ● | · | ● | ● |
95
+ | [MaxSim] Scoring | ● | ● | ● | · | ● | ● |
96
+ | __Scalar Ops__ | | | | | | |
97
+ | [Cast] | ● | ● | ● | ● | · | · |
98
+ | [Reduce] | ● | ● | ● | · | · | · |
99
+ | [Each] | ● | ● | ● | · | · | · |
100
+ | [Trigonometry] | ● | ● | ● | · | · | · |
101
+
102
+ [Dot]: include/numkong/dot/README.md
103
+ [Dots]: include/numkong/dots/README.md
104
+ [Spatial]: include/numkong/spatial/README.md
105
+ [Spatials]: include/numkong/spatials/README.md
106
+ [Set]: include/numkong/set/README.md
107
+ [Sets]: include/numkong/sets/README.md
108
+ [Cast]: include/numkong/cast/README.md
109
+ [Reduce]: include/numkong/reduce/README.md
110
+ [Trigonometry]: include/numkong/trigonometry/README.md
111
+ [MaxSim]: include/numkong/maxsim/README.md
112
+ [Mesh]: include/numkong/mesh/README.md
113
+ [Each]: include/numkong/each/README.md
114
+ [Sparse]: include/numkong/sparse/README.md
115
+ [Probability]: include/numkong/probability/README.md
116
+ [Curved]: include/numkong/curved/README.md
117
+ [Geo]: include/numkong/geospatial/README.md
118
+ [c]: include/README.md
119
+ [py]: python/README.md
120
+ [js]: javascript/README.md
121
+ [rs]: rust/README.md
122
+ [swift]: swift/README.md
123
+ [go]: golang/README.md
110
124
 
111
- ## Design Decisions
112
125
 
113
- In general there are a few principles that NumKong follows:
126
+ ## Design Decisions
114
127
 
115
128
  - Avoid loop unrolling and scalar tails.
116
129
  - Don't manage threads and be compatible with any parallelism models.
@@ -140,17 +153,17 @@ float boring_dot_product_f32(float const *a, float const *b, size_t n) {
140
153
  }
141
154
  ```
142
155
 
143
- This kind of unrolling has been historically the most commonly requested optimization for NumKong, and it's intentionally avoided.
156
+ This kind of unrolling has been a common request for NumKong, but the library avoids it by design.
144
157
 
145
158
  __Modern CPUs already "unroll" in hardware.__
146
159
  Out-of-order engines with reorder buffers of 320–630 entries (Zen 4: 320, Golden Cove: 512, Apple Firestorm: ~630) can keep a dozen loop iterations in-flight simultaneously.
147
160
  The physical register file is much larger than the ISA-visible architectural registers — Skylake has ~180 physical integer registers behind 16 architectural GPRs, and ~168 physical vector registers behind 32 architectural ZMMs.
148
161
  The register renaming unit maps the same `zmm0` in iteration N and iteration N+1 to different physical registers, extracting cross-iteration parallelism automatically — exactly the benefit that source-level unrolling was historically supposed to provide.
149
162
 
150
- __Unrolling actively hurts at NumKong's scale.__
163
+ __Unrolling works against NumKong's goals.__
151
164
  Every unrolled copy is a distinct instruction in the binary.
152
165
  With 1,500+ kernel endpoints across 30+ backends, even 2x unrolling would inflate the `.text` section by megabytes — directly impacting install size for Python wheels, NPM packages, and Rust crates.
153
- Larger loop bodies also increase instruction-cache and micro-op-cache pressure; Agner Fog also recommends:
166
+ Larger loop bodies also increase instruction-cache and micro-op-cache pressure; [Agner Fog](https://www.agner.org/optimize/) recommends:
154
167
 
155
168
  > _"avoid loop unrolling where possible in order to economize the use of the micro-op cache"_.
156
169
 
@@ -162,9 +175,9 @@ The leftover elements after the last full SIMD chunk run through a scalar loop t
162
175
  NumKong often uses masked loads instead (`_mm512_maskz_loadu_ps` on AVX-512, predicated `svld1_f32` on SVE), processing every element through the same arithmetic path regardless of alignment.
163
176
  It's not strictly orthogonal to loop unrolling, but it enables a kernel layout where every element follows the same code path.
164
177
 
165
- __The real performance gap is elsewhere.__
178
+ __The gains come from elsewhere.__
166
179
  On Intel Sapphire Rapids, NumKong was benchmarked against auto-vectorized code compiled with GCC 12.
167
- GCC handles single-precision `float` competently, but struggles with `_Float16` and other mixed-precision paths:
180
+ GCC handles single-precision `float` well, but struggles with `_Float16` and other mixed-precision paths:
168
181
 
169
182
  | Kind | GCC 12 `f32` | GCC 12 `f16` | NumKong `f16` | `f16` improvement |
170
183
  | :------------------------ | -----------: | -----------: | ------------: | ----------------: |
@@ -173,20 +186,20 @@ GCC handles single-precision `float` competently, but struggles with `_Float16`
173
186
  | Euclidean Distance ² | 4,620 K/s | 147 K/s | 5,320 K/s | __36 x__ |
174
187
  | Jensen-Shannon Divergence | 1,180 K/s | 18 K/s | 2,140 K/s | __118 x__ |
175
188
 
176
- NumKong's `f16` kernels are faster than GCC's `f32` output — not because of unrolling, but because they use F16C conversion instructions, widening FMA pipelines, and compensated accumulation that no compiler will synthesize from a plain `for` loop.
177
- The same story repeats for `bf16`, `e4m3`, `i8`, and `i4`: these types require algorithmic transformations — lookup tables, algebraic domain shifts, asymmetric VNNI tricks — that live beyond the reach of auto-vectorization.
189
+ NumKong's `f16` kernels are faster than GCC's `f32` output — not because of unrolling, but because they use [F16C](https://en.wikipedia.org/wiki/F16C) conversion instructions, widening FMA pipelines, and compensated accumulation that compilers do not synthesize from a plain `for` loop.
190
+ The same story repeats for `bf16`, `e4m3`, `i8`, and `i4`: these types require algorithmic transformations — lookup tables, algebraic domain shifts, asymmetric [VNNI](https://en.wikipedia.org/wiki/AVX-512#VNNI) tricks — that live beyond the reach of auto-vectorization.
178
191
 
179
192
  ### Parallelism & Multi-Threading
180
193
 
181
194
  BLAS libraries traditionally manage their own thread pools.
182
- [OpenBLAS](https://github.com/OpenMathLib/OpenBLAS/blob/develop/USAGE.md) spawns threads controlled by `OPENBLAS_NUM_THREADS`, [Intel MKL](https://www.intel.com/content/www/us/en/docs/onemkl/developer-guide-linux/2025-1/techniques-to-set-the-number-of-threads.html) forks its own OpenMP runtime via `MKL_NUM_THREADS`, and [Apple Accelerate](https://developer.apple.com/documentation/accelerate/blas) delegates to GCD.
195
+ [OpenBLAS](https://github.com/OpenMathLib/OpenBLAS/blob/develop/USAGE.md) spawns threads controlled by `OPENBLAS_NUM_THREADS`, [Intel MKL](https://www.intel.com/content/www/us/en/docs/onemkl/developer-guide-linux/2025-1/techniques-to-set-the-number-of-threads.html) forks its own OpenMP runtime via `MKL_NUM_THREADS`, and [Apple Accelerate](https://developer.apple.com/documentation/accelerate/blas) delegates to [GCD](https://developer.apple.com/documentation/dispatch) (Grand Central Dispatch).
183
196
  This works in isolation — but the moment your application adds its own parallelism (joblib, std::thread, Tokio, GCD, OpenMP), you get __thread oversubscription__: MKL spawns 8 threads inside each of your 8 joblib workers, producing 64 threads on 8 cores, thrashing caches and stalling on context switches.
184
197
  The Python ecosystem has built [entire libraries](https://github.com/joblib/threadpoolctl) just to work around this problem, and [scikit-learn's documentation](https://scikit-learn.org/stable/computing/parallelism.html) devotes a full page to managing the interaction between joblib parallelism and BLAS thread pools.
185
198
 
186
199
  NumKong takes a different position: __the numerics layer should not own threads__.
187
200
  Modern hardware makes the "spawn N threads and split evenly" model increasingly untenable:
188
201
 
189
- - __Server-grade CPUs__ have hundreds of cores split across sockets, chiplets, and tiles, resulting in dozens of physical NUMA domains with vastly different memory access latencies.
202
+ - __Server-grade CPUs__ have hundreds of cores split across sockets, chiplets, and tiles, resulting in dozens of physical [NUMA](https://en.wikipedia.org/wiki/Non-uniform_memory_access) domains with vastly different memory access latencies.
190
203
  A thread pool that ignores NUMA topology will spend more time on remote memory stalls than on arithmetic.
191
204
  - __Consumer-grade CPUs__ pack heterogeneous Quality-of-Service core types on the same die — Intel P-cores and E-cores run at different frequencies and sometimes support different ISA extensions.
192
205
  A naive work-split gives equal chunks to fast and slow cores, and the whole task stalls waiting for the slowest partition.
@@ -196,7 +209,7 @@ Modern hardware makes the "spawn N threads and split evenly" model increasingly
196
209
  Instead, NumKong exposes __row-range parameters__ that let the caller partition work across any threading model.
197
210
  For GEMM-shaped `dots_packed`, this is straightforward — pass a slice of A's rows and the full packed B to compute the corresponding slice of C.
198
211
  For SYRK-shaped `dots_symmetric`, explicit `start_row` / `end_row` parameters control which rows of the symmetric output matrix a given thread computes.
199
- The GIL is released around every kernel call, making NumKong compatible with `concurrent.futures`, `multiprocessing`, or any other parallelism model:
212
+ The [GIL](https://docs.python.org/3/glossary.html#term-global-interpreter-lock) (Global Interpreter Lock) is released around every kernel call, making NumKong compatible with `concurrent.futures`, `multiprocessing`, or any other parallelism model:
200
213
 
201
214
  ```python
202
215
  import concurrent.futures, numkong as nk, numpy as np
@@ -213,12 +226,12 @@ with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as pool:
213
226
  list(pool.map(compute_slice, range(num_threads)))
214
227
  ```
215
228
 
216
- For users who want a ready-made low-latency thread pool without the oversubscription baggage of OpenMP, we built [Fork Union](https://github.com/ashvardanian/ForkUnion) — a minimalist fork-join library for C, C++, and Rust that avoids mutexes, CAS atomics, and dynamic allocations on the critical path, with optional NUMA pinning on Linux.
229
+ For users who want a ready-made low-latency thread pool without the oversubscription baggage of OpenMP, we built [ForkUnion](https://github.com/ashvardanian/ForkUnion) — a minimalist fork-join library for C, C++, and Rust that avoids mutexes, CAS atomics, and dynamic allocations on the critical path, with optional NUMA pinning on Linux.
217
230
 
218
231
  ### Memory Allocation & Management
219
232
 
220
233
  BLAS libraries typically allocate internal buffers during GEMM — [OpenBLAS](https://github.com/OpenMathLib/OpenBLAS) packs matrices into L2/L3-sized panels via per-thread buffer pools backed by `mmap` or `shmget`.
221
- This hidden allocation has caused real problems: [14 lock/unlock pairs per small GEMM call](https://github.com/xianyi/OpenBLAS/issues/478) throttling 12-thread scaling to 2x, [silently incorrect results](https://github.com/xianyi/OpenBLAS/issues/1844) from thread-unsafe allocation in `np.dot`, and [deadlocks after `fork()`](https://github.com/numpy/numpy/issues/30092) due to mutex state not being reset in child processes.
234
+ This hidden allocation has caused real problems: [14 lock/unlock pairs per small GEMM call](https://github.com/OpenMathLib/OpenBLAS/issues/478) throttling 12-thread scaling to 2x, [silently incorrect results](https://github.com/OpenMathLib/OpenBLAS/issues/1844) from thread-unsafe allocation in `np.dot`, and [deadlocks after `fork()`](https://github.com/numpy/numpy/issues/30092) due to mutex state not being reset in child processes.
222
235
  The [BLASFEO](https://github.com/giaf/blasfeo) library was created specifically for embedded model-predictive control where `malloc` during computation is unacceptable.
223
236
 
224
237
  NumKong __never allocates memory__.
@@ -241,14 +254,14 @@ NumKong's `nk_dots_pack_*` family performs five transformations beyond simple re
241
254
  import numkong as nk, numpy as np
242
255
 
243
256
  right_matrix = np.random.randn(1000, 768).astype(np.float16)
244
- right_packed = nk.dots_pack(right_matrix, dtype="float16") # pack once
257
+ right_packed = nk.dots_pack(right_matrix, dtype=nk.float16) # pack once
245
258
  for query_batch in stream: results = nk.dots_packed(query_batch, right_packed) # reuse many times
246
259
  ```
247
260
 
248
261
  ### Why Not Just GEMM? The Evolution of Matrix Multiplication APIs
249
262
 
250
263
  The classic BLAS GEMM computes $C = \alpha A B + \beta C$ for Float32/Float64 matrices.
251
- It's a powerful primitive, but the workloads that dominate modern compute — LLM inference, vector search, quantum simulation expose three ways in which the traditional GEMM interface falls short.
264
+ It covers many use cases, but LLM inference, vector search, and quantum simulation expose three ways in which the traditional interface falls short.
252
265
 
253
266
  __Frozen weights justify separating packing from computation.__
254
267
  During LLM inference, a very large share of GEMM calls use a static weight matrix — weights don't change after loading.
@@ -272,18 +285,18 @@ The standard BLAS interface was never designed for sub-byte types either — [no
272
285
  __Some operations need more than GEMM + postprocessing.__
273
286
  NumKong implements several GEMM-shaped operations where the "epilogue" is too complex for a simple addition:
274
287
 
275
- - __Bilinear forms__ ($a^T C b$) in quantum computing compute a [scalar expectation value](https://phys.libretexts.org/Bookshelves/Quantum_Mechanics/Advanced_Quantum_Mechanics_(Kok)/10:_Pauli_Spin_Matrices/10.2:_Expectation_Values) — the naive approach materializes an $N$-dimensional intermediate vector $Cb$, but NumKong's typed `nk_bilinear_*` kernels stream through rows of $C$ with nested compensated dot products, never allocating beyond registers.
288
+ - __Bilinear forms__ ($a^T C b$) in quantum computing compute a [scalar expectation value](<https://phys.libretexts.org/Bookshelves/Quantum_Mechanics/Advanced_Quantum_Mechanics_(Kok)/10:_Pauli_Spin_Matrices/10.2:_Expectation_Values>) — the naive approach materializes an $N$-dimensional intermediate vector $Cb$, but NumKong's typed `nk_bilinear_*` kernels stream through rows of $C$ with nested compensated dot products, never allocating beyond registers.
276
289
  For complex-valued quantum states, where the intermediate would be a 2N-element complex vector, the savings double.
277
290
  - __MaxSim scoring__ for [ColBERT-style late-interaction retrieval](https://github.com/stanford-futuredata/ColBERT) computes $\sum_i \min_j \text{angular}(q_i, d_j)$ — a sum-of-min-distances across token pairs.
278
- A GEMM would produce the full $M \times N$ similarity matrix, but NumKong's typed `nk_maxsim_packed_*` kernels fuse a coarse Int8-quantized screening with full-precision angular refinement on winning pairs only, __packing both query and document matrices__ to enable all 4 SME tiles as accumulators (+33% throughput vs `dots_packed`).
279
- [PLAID](https://ar5iv.labs.arxiv.org/html/2205.09707) and [maxsim-cpu](https://www.mixedbread.com/blog/maxsim-cpu) have independently shown that dedicated MaxSim kernels outperform the GEMM decomposition by 5–10x.
291
+ A GEMM would produce the full $M \times N$ similarity matrix, but NumKong's typed `nk_maxsim_packed_*` kernels fuse a coarse Int8-quantized screening with full-precision angular refinement on winning pairs only, packing both query and document matrices to use all 4 SME tiles as accumulators.
292
+ [PLAID](https://ar5iv.labs.arxiv.org/html/2205.09707) and [maxsim-cpu](https://www.mixedbread.com/blog/maxsim-cpu) have independently shown that dedicated MaxSim kernels can outperform the GEMM decomposition by 5–10x.
280
293
 
281
294
  NumKong treats these as first-class operations — `dots_packed`, `euclideans_packed`, `angulars_packed`, typed `nk_bilinear_*` kernels, and typed `nk_maxsim_packed_*` kernels — rather than decomposing everything into GEMM + postprocessing.
282
295
 
283
296
  ### Precision by Design: Saturation, Rounding, & Float6 Over Float8
284
297
 
285
- Floating-point arithmetic on computers [is not associative](https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html): $(a + b) + c \neq a + (b + c)$ in general, and the standard advice — "upcast to wider types" often isn't enough, and always costs performance.
286
- NumKong makes opinionated, operation-specific decisions about where to spend precision and where to economize, rather than applying one IEEE rule uniformly.
298
+ Floating-point arithmetic on computers [is not associative](https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html): $(a + b) + c \neq a + (b + c)$ in general, and upcasting to wider types is not always sufficient.
299
+ NumKong makes operation-specific decisions about where to spend precision and where to economize, rather than applying one rule uniformly.
287
300
 
288
301
  __Saturation depends on the operation.__
289
302
  A reduction over a 4 GB array of `i8` values contains ~4 billion elements — but [Int32 wrapping overflow](https://cedardb.com/blog/overflow_handling/) occurs after just ~17 million Int8 summands ($127 \times 16.9\text{M} > 2^{31}$).
@@ -294,7 +307,7 @@ x86 provides no saturating 32-bit SIMD add ([only byte/word variants](https://ww
294
307
  __Square roots & special math ops are platform-specific.__
295
308
  Angular distance requires $1/\sqrt{\|a\|^2 \cdot \|b\|^2}$ — but the cost of computing this normalization varies dramatically across hardware.
296
309
  x86 `VSQRTPS` takes [~12 cycles](https://uops.info/html-lat/SKX/VSQRTPS_XMM_XMM-Measurements.html), followed by `VDIVPS` at ~11 cycles — totalling ~23 cycles for a precise `1/sqrt(x)`.
297
- The `VRSQRT14PS` alternative starts with a [14-bit estimate in ~4 cycles](https://www.intel.com/content/www/us/en/developer/articles/code-sample/reference-implementations-for-ia-approximation-instructions-vrcp14-vrsqrt14-vrcp28-vrsqrt28-vexp2.html), then one Newton-Raphson iteration ($y = y \cdot (1.5 - 0.5 x y^2)$, ~4 more cycles) reaches full Float32 precision — a __~3x speedup__.
310
+ The `VRSQRT14PS` alternative starts with a [14-bit estimate in ~4 cycles](https://www.intel.com/content/www/us/en/developer/articles/code-sample/reference-implementations-for-ia-approximation-instructions-vrcp14-vrsqrt14-vrcp28-vrsqrt28-vexp2.html), then one Newton-Raphson iteration ($y = y \cdot (1.5 - 0.5 x y^2)$, ~4 more cycles) reaches full Float32 precision — roughly 3x faster.
298
311
  ARM's `FRSQRTE` provides only [~8 bits](https://github.com/DLTcollab/sse2neon/issues/526), requiring __two__ Newton-Raphson iterations to match.
299
312
  NumKong selects the iteration count per platform so the final ULP bound is consistent across ISAs, rather than exposing different precision to different users.
300
313
 
@@ -347,10 +360,10 @@ The first call to `nk_capabilities()` initializes the dispatch table; all subseq
347
360
 
348
361
  ### Float64 & Float32: IEEE Precision
349
362
 
350
- __Float64__ — NumKong deviates from most BLAS-like libraries by leveraging __compensated summation__ that tracks numerical errors separately.
351
- On serial paths, we use __Neumaier's algorithm__ (1974), an improvement over Kahan-Babuška that correctly handles cases where added terms are larger than the running sum, achieving $O(1)$ error growth instead of $O(n)$.
363
+ __Float64__ — NumKong uses __compensated summation__ that tracks numerical errors separately.
364
+ On serial paths, we use __[Neumaier's algorithm](https://en.wikipedia.org/wiki/Kahan_summation_algorithm#Further_enhancements)__ (1974), an improvement over Kahan-Babuška that correctly handles cases where added terms are larger than the running sum, achieving $O(1)$ error growth instead of $O(n)$.
352
365
  On SIMD paths with FMA support, we implement the __Dot2 algorithm__ (Ogita-Rump-Oishi, 2005), maintaining separate error compensators for both multiplication and accumulation via `TwoProd` and `TwoSum` operations.
353
- The accuracy gains are visible in the [benchmark tables above](#latency-throughput--numerical-stability-together-in-a-tiny-package) — compensated Float64 is ideal for scientific computing where numerical stability matters more than raw speed.
366
+ The accuracy differences are visible in the [benchmark tables above](#latency-throughput--numerical-stability) — compensated Float64 suits scientific computing where numerical stability matters more than raw speed.
354
367
 
355
368
  __Float32__ — SIMD implementations load Float32 values, upcast to Float64 for full-precision multiplication and accumulation, then downcast only during finalization.
356
369
  This avoids catastrophic cancellation at minimal cost since modern CPUs have dedicated Float64 vector units operating at nearly the same throughput as Float32.
@@ -368,7 +381,7 @@ e = (sum - t) + product; // Compensator term
368
381
 
369
382
  ### BFloat16 & Float16: Half Precision
370
383
 
371
- __BFloat16__ — not an IEEE 754 standard type, but the __universal recommendation__ for AI workloads.
384
+ __BFloat16__ — not an IEEE 754 standard type, but widely adopted for AI workloads.
372
385
  BFloat16 shares Float32's 8-bit exponent but truncates the mantissa to 7 bits, prioritizing __dynamic range over precision__ (±3.4×10³⁸ with coarser granularity).
373
386
  On old CPUs, upcasting BFloat16 to Float32 requires just an unpack and left-shift by 16 bits (essentially free); on newer CPUs, both Arm and x86 provide widening mixed-precision dot products via __DPBF16PS__ (AVX-512 on Genoa/Sapphire Rapids) and __BFDOT__ (NEON on ARMv8.6-A Graviton 3+).
374
387
  NumKong's Float8 types (E4M3/E5M2) upcast to BFloat16 before using DPBF16PS, creating a three-tier precision hierarchy: Float8 for storage, BFloat16 for compute, Float32 for accumulation.
@@ -378,36 +391,43 @@ Float16 prioritizes __precision over range__ (10 vs 7 mantissa bits), making it
378
391
  On x86, older CPUs use __F16C extensions__ (Ivy Bridge+) for fast Float16 → Float32 conversion; Sapphire Rapids+ adds native __AVX-512-FP16__ with dedicated Float16 arithmetic.
379
392
  On Arm, ARMv8.4-A adds __FMLAL/FMLAL2__ instructions for fused Float16 → Float32 widening multiply-accumulate, reducing the total latency from 7 cycles to 4 cycles and achieving 20–48% speedup over the separate convert-then-FMA path.
380
393
 
381
- | Platform | BFloat16 Path | Elem/Op | Float16 Path | Elem/Op |
382
- | ---------------------- | ------------------------ | ------: | ---------------------- | ------: |
383
- | __x86__ | | | | |
384
- | Sapphire Rapids (2023) | ↓ Genoa | 32 | Skylake | 16 |
385
- | Genoa (2022) | `VDPBF16PS` widening dot | 32 | ↓ Skylake | 16 |
386
- | Skylake (2015) | `SLLI` + `VFMADD` | 16 | `VCVTPH2PS` + `VFMADD` | 16 |
387
- | Haswell (2013) | `SLLI` + `VFMADD` | 8 | `VCVTPH2PS` + `VFMADD` | 8 |
388
- | __Arm__ | | | | |
389
- | Graviton 3 (2021) | `SVBFDOT` widening dot | 4–32 | `SVCVT` → `SVFMLA` | 4–32 |
390
- | Apple M2+ (2022) | `BFDOT` widening dot | 8 | FP16FML | 8 |
391
- | Apple M1 (2020) | NEON | 8 | `FMLAL` widening FMA | 8 |
392
- | Graviton 2 (2019) | ↓ NEON | 8 | `FCVTL` + `FMLA` | 4 |
393
- | Graviton 1 (2018) | `SHLL` + `FMLA` | 8 | bit-manip `FMLA` | 8 |
394
+ | Platform | BFloat16 Path | Elem/Op | Float16 Path | Elem/Op |
395
+ | ---------------------- | -------------------------- | ------: | ---------------------- | ------: |
396
+ | __x86__ | | | | |
397
+ | Diamond Rapids (2025) | ↓ Genoa | 32 | `VDPPHPS` widening dot | 32 |
398
+ | Sapphire Rapids (2023) | Genoa | 32 | ↓ Skylake | 16 |
399
+ | Genoa (2022) | `VDPBF16PS` widening dot | 32 | Skylake | 16 |
400
+ | Skylake (2015) | `SLLI` + `VFMADD` | 16 | `VCVTPH2PS` + `VFMADD` | 16 |
401
+ | Haswell (2013) | `SLLI` + `VFMADD` | 8 | `VCVTPH2PS` + `VFMADD` | 8 |
402
+ | __Arm__ | | | | |
403
+ | Graviton 3 (2021) | `SVBFDOT` widening dot | 4–32 | `SVCVT` → `SVFMLA` | 4–32 |
404
+ | Apple M2+ (2022) | `BFDOT` widening dot | 8 | FP16FML | 8 |
405
+ | Apple M1 (2020) | ↓ NEON | 8 | `FMLAL` widening FMA | 8 |
406
+ | Graviton 2 (2019) | NEON | 8 | `FCVTL` + `FMLA` | 4 |
407
+ | Graviton 1 (2018) | `SHLL` + `FMLA` | 8 | bit-manip → `FMLA` | 8 |
408
+ | __RISC-V__ | | | | |
409
+ | RVV + Zvfbfwma | `VFWMACCBF16` widening FMA | 4–32 | ↓ RVV | 4–32 |
410
+ | RVV + Zvfh | ↓ RVV | 4–32 | `VFWMACC` widening FMA | 4–32 |
411
+ | RVV | shift + `VFMACC` | 4–32 | convert + `VFMACC` | 4–32 |
394
412
 
395
413
  > BFloat16 shares Float32's 8-bit exponent, so upcasting is a 16-bit left shift (`SLLI` on x86, `SHLL` on Arm) that zero-pads the truncated mantissa — essentially free.
396
414
  > Float16 has a different exponent width (5 vs 8 bits), requiring a dedicated convert: `VCVTPH2PS` (x86 F16C) or `FCVTL` (Arm NEON).
397
415
  > Widening dot products (`VDPBF16PS`, `BFDOT`, `FMLAL`) fuse the conversion and multiply-accumulate into one instruction.
398
416
  > Sapphire Rapids has native `VFMADDPH` for Float16 arithmetic, but NumKong does not use it for general dot products — Float16 accumulation loses precision.
399
417
  > It is only used for mini-float (E2M3/E3M2) paths where periodic flush-to-Float32 windows keep error bounded.
418
+ > The table above covers only vector dot-product paths — GEMMs also leverage Arm SME and Intel AMX instructions.
419
+ > Beyond x86, Arm, and RISC-V, NumKong also ships LoongArch, WebAssembly, and PowerPC backends, which are likewise excluded from the table.
400
420
 
401
421
  ### Mini-Floats: E4M3, E5M2, E3M2, & E2M3
402
422
 
403
- | Format | Bits | Range | NumKong Promotion Strategy | Support in GPUs |
404
- | ------------------------- | ----: | -----: | ------------------------------------------- | ------------------------- |
405
- | E5M2FN | 8 | ±57344 | BFloat16 → Float32 | H100, B200, MI300, MI325 |
406
- | E4M3FN | 8 | ±448 | BFloat16 → Float32 | H100, B200, MI300, MI325 |
407
- | E3M2FN | 6 → 8 | ±28 | BFloat16 & Float16 → Float32, Int16 → Int32 | only block-scaled support |
408
- | E2M3FN | 6 → 8 | ±7.5 | BFloat16 & Float16 → Float32, Int8 → Int32 | only block-scaled support |
409
- | Block-scaled NVFP4 | 4 | ±6 | — | B200 |
410
- | Block-scaled MXFP4 / E2M1 | 4 | ±6 | — | B200, MI325 |
423
+ | Format | Bits | Range | NumKong Promotion Rules | Support in GPUs |
424
+ | ------------------------- | ----: | -----: | ----------------------------------------------- | ----------------- |
425
+ | E5M2FN | 8 | ±57344 | BFloat16 → Float32 | H100+, MI300+ |
426
+ | E4M3FN | 8 | ±448 | BFloat16 → Float32 | H100+, MI300+ |
427
+ | E3M2FN | 6 → 8 | ±28 | BFloat16 & Float16 → Float32,<br/>Int16 → Int32 | only block-scaled |
428
+ | E2M3FN | 6 → 8 | ±7.5 | BFloat16 & Float16 → Float32,<br/>Int8 → Int32 | only block-scaled |
429
+ | Block-scaled NVFP4 | 4 | ±6 | — | B200+ |
430
+ | Block-scaled MXFP4 / E2M1 | 4 | ±6 | — | B200+, MI325+ |
411
431
 
412
432
  > __Block scaling.__
413
433
  > NumKong does not implement block-scaled variants (MXFP4, NVFP4, or block-scaled E3M2/E2M3).
@@ -424,33 +444,72 @@ E4M3FN (no infinities, NaN only) is preferred for __training__ where precision n
424
444
  On x86 Genoa/Sapphire Rapids, E4M3/E5M2 values upcast to BFloat16 via lookup tables, then use native __DPBF16PS__ for 2-per-lane dot products accumulating to Float32.
425
445
  On Arm Graviton 3+, the same BFloat16 upcast happens via NEON table lookups, then __BFDOT__ instructions complete the computation.
426
446
 
447
+ | Platform | E5M2 Path | Elem/Op | E4M3 Path | Elem/Op |
448
+ | -------------------------- | ------------------------------ | ------: | ------------------------------ | ------: |
449
+ | __x86__ | | | | |
450
+ | Diamond Rapids (2025) | `VCVTBF82PH` → F16 + `VDPPHPS` | 32 | `VCVTHF82PH` → F16 + `VDPPHPS` | 32 |
451
+ | Genoa (2022) | → BF16 + `VDPBF16PS` | 32 | ↓ Ice Lake | 64 |
452
+ | Ice Lake (2019) | ↓ Skylake | 16 | octave LUT + `VPDPBUSD` | 64 |
453
+ | Skylake (2015) | rebias → F32 FMA | 16 | rebias → F32 FMA | 16 |
454
+ | Haswell (2013) | rebias → F32 FMA | 8 | rebias → F32 FMA | 8 |
455
+ | __Arm__ | | | | |
456
+ | NEON + FP8DOT (Olympus) | native `FDOT` | 16 | native `FDOT` | 16 |
457
+ | NEON + FP16FML (Apple M1+) | SHL → F16 + `FMLAL` | 16 | LUT → F16 + `FMLAL` | 16 |
458
+ | NEON (Graviton 1+) | SHL + `FCVTL` + FMA | 8 | → F16 + `FCVTL` + FMA | 8 |
459
+ | __RISC-V__ | | | | |
460
+ | RVV + Zvfbfwma | rebias → BF16 + `VFWMACCBF16` | 4–32 | LUT → BF16 + `VFWMACCBF16` | 4–32 |
461
+ | RVV + Zvfh | SHL → F16 + `VFWMACC` | 4–32 | LUT → F16 + `VFWMACC` | 4–32 |
462
+ | RVV | rebias → F32 + `VFMACC` | 4–32 | LUT → F32 + `VFMACC` | 4–32 |
463
+
464
+ > E5M2 shares Float16's exponent bias (15), so E5M2 → Float16 conversion is a single left-shift by 8 bits (`SHL 8`).
465
+ > E4M3 on Ice Lake uses "octave decomposition": the 4-bit exponent splits into 2 octave + 2 remainder bits, yielding 7 integer accumulators post-scaled by powers of 2.
466
+
427
467
  __6-bit floats (E3M2 & E2M3)__ follow the [OCP MX v1.0 standard](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf).
428
468
  Their smaller range allows scaling to exact integers that fit in `i8`/`i16`, enabling integer `VPDPBUSD`/`SDOT` accumulation instead of the floating-point pipeline.
429
469
  Float16 can also serve as an accumulator, accurately representing ~50 products of E3M2FN pairs or ~20 products of E2M3FN pairs before overflow.
430
470
  On Arm, NEON FHM extensions bring widening `FMLAL` dot-products for Float16 — both faster and more widely available than `BFDOT` for BFloat16.
431
471
 
472
+ | Platform | E3M2 Path | Elem/Op | E2M3 Path | Elem/Op |
473
+ | ---------------------------- | -------------------------- | ------: | ---------------------------- | ------: |
474
+ | __x86__ | | | | |
475
+ | Ice Lake (2019) | `VPERMW` LUT + `VPMADDWD` | 32 | `VPERMB` LUT + `VPDPBUSD` | 64 |
476
+ | Sierra Forest (2024) | ↓ Haswell | 32 | `VPSHUFB` LUT + `VPDPBSSD` | 32 |
477
+ | Alder Lake (2021) | ↓ Haswell | 32 | `VPSHUFB` LUT + `VPDPBUSD` | 32 |
478
+ | Skylake (2015) | `VPSHUFB` LUT + `VPMADDWD` | 64 | `VPSHUFB` LUT + `VPMADDUBSW` | 64 |
479
+ | Haswell (2013) | `VPSHUFB` LUT + `VPMADDWD` | 32 | `VPSHUFB` LUT + `VPMADDUBSW` | 32 |
480
+ | __Arm__ | | | | |
481
+ | NEON + FP8DOT (Olympus) | → E5M2 + `FDOT` | 16 | → E4M3 + `FDOT` | 16 |
482
+ | NEON + DotProd (Graviton 2+) | `VQTBL2` LUT + `SMLAL` | 16 | `VQTBL2` LUT + `SDOT` | 16 |
483
+ | NEON (Graviton 1+) | → F16 + `FCVTL` + FMA | 16 | → F16 + `FCVTL` + FMA | 16 |
484
+ | __RISC-V__ | | | | |
485
+ | RVV | I16 gather LUT + `VWMACC` | 4–32 | U8 gather LUT + `VWMACC` | 4–32 |
486
+
487
+ > E3M2/E2M3 values map to exact integers via 32-entry LUTs (magnitudes up to 448 for E3M2, 120 for E2M3), enabling integer accumulation with no rounding error.
488
+ > On NEON + FP8DOT, E3M2 is first promoted to E5M2 and E2M3 to E4M3 before the hardware `FDOT` instruction.
489
+ > Sierra Forest and Alder Lake use native `VPDPBSSD` (signed×signed) and `VPDPBUSD` (unsigned×signed) respectively for E2M3.
490
+
432
491
  E4M3 and E5M2 cannot use the integer path.
433
492
  E4M3 scaled by 16 reaches 7,680 — too large for Int8, barely fitting Int16 with a 128-entry table.
434
493
  E5M2's range (±57,344) makes the scaled product exceed Int32 entirely.
435
494
  Without the integer path, E5M2 falls back to Float32 accumulation — where its [2-bit mantissa (only 4 values per binade)](https://developer.nvidia.com/blog/floating-point-8-an-introduction-to-efficient-lower-precision-ai-training/) creates a [catastrophic cancellation risk](https://www.ac.uma.es/arith2024/papers/Fused%20FP8%204-Way%20Dot%20Product%20with%20Scaling%20and%20FP32%20Accumulation.pdf) that E2M3's integer path avoids completely:
436
495
 
437
- | | _i_ = 0 | _i_ = 1 | _i_ = 2 | _i_ = 3 | _i_ = 4 | _i_ = 5 | _i_ = 6 | _i_ = 7 |
438
- | ------- | -------: | ------: | -------: | --------: | -------: | -------: | -------: | -------: |
439
- | _aᵢ_ | 0.00122 | 20480 | −0.00122 | 1.5 | −0.00586 | −3072 | −640 | 0.00146 |
440
- | _bᵢ_ | −40 | 320 | −1280 | −7.63e⁻⁵ | 0 | 0.000427 | 10240 | −4.58e⁻⁵ |
441
- | _aᵢ·bᵢ_ | −0.04883 | 6553600 | 1.5625 | −0.000114 | 0 | −1.3125 | −6553600 | ≈ 0 |
496
+ | | _i_ = 0 | _i_ = 1 | _i_ = 2 | _i_ = 3 | _i_ = 4 | _i_ = 5 | _i_ = 6 |
497
+ | ------- | -------: | ------: | -------: | --------: | -------: | -------: | -------: |
498
+ | _aᵢ_ | 0.00122 | 20480 | −0.00122 | 1.5 | −3072 | −640 | 0.00146 |
499
+ | _bᵢ_ | −40 | 320 | −1280 | −7.63e⁻⁵ | 0.000427 | 10240 | −4.58e⁻⁵ |
500
+ | _aᵢ·bᵢ_ | −0.04883 | 6553600 | 1.5625 | −0.000114 | −1.3125 | −6553600 | ≈ 0 |
442
501
 
443
502
  > __Why Float32 accumulation fails here.__
444
- > The accurate sum of these 8 products is ≈ 0.201.
445
- > After two `vfmaq_f32` calls, the 4 accumulator lanes hold pairwise products: lanes 1 and 2 carry values around ±6.5 M.
446
- > At that magnitude the Float32 ULP is 0.5 — so the small meaningful terms (−0.049, 1.563, −1.313, −0.0001) are all below one ULP and get absorbed during pairwise reduction.
503
+ > The accurate sum of these 7 products is ≈ 0.201.
504
+ > A `vfmaq_f32` call accumulates 4 lanes at a time; the first batch already carries values around ±6.5 M.
505
+ > At that magnitude the Float32 ULP is 0.5 — so the small meaningful terms (−0.049, 1.563, −1.313, −0.0001) are all below one ULP and get absorbed during lane reduction.
447
506
  > The large terms then cancel exactly to zero, and the information is gone.
448
507
  > Final Float32 result: __0.0__ instead of __0.201__.
449
508
 
450
509
  ### Int8 & Int4: Integer Types
451
510
 
452
511
  Both signed and unsigned 8-bit and 4-bit integers are supported with __Int32 accumulation__ to prevent overflow.
453
- The most sophisticated optimization is the __VNNI algebraic transform__: on Ice Lake+ with AVX-512 VNNI, the native __DPBUSD__ instruction is asymmetric (unsigned × signed → signed), yet NumKong exploits it for both Int8×Int8 and UInt8×UInt8.
512
+ A notable optimization is the __VNNI algebraic transform__: on Ice Lake+ with AVX-512 VNNI, the native __DPBUSD__ instruction is asymmetric (unsigned × signed → signed), but NumKong uses it for both Int8×Int8 and UInt8×UInt8.
454
513
  For __signed Int8×Int8__, we convert the signed operand to unsigned via XOR with `0x80`, compute `DPBUSD(a⊕0x80, b) = (a+128)×b`, then subtract a correction term `128×sum(b)` to recover the true result.
455
514
  For __unsigned UInt8×UInt8__, we XOR the second operand to make it signed, compute `DPBUSD(a, b⊕0x80) = a×(b-128)`, then add correction `128×sum(a)` via the fast SAD instruction.
456
515
 
@@ -480,7 +539,7 @@ Complex types are essential in quantum simulation (state vectors, density matric
480
539
  The `dot` operation computes the unconjugated dot product $\sum a_k b_k$, while `vdot` computes the conjugated inner product $\sum \bar{a}_k b_k$ standard in physics and signal processing.
481
540
 
482
541
  For complex dot products, NumKong defers sign flips until after the accumulation loop: instead of using separate FMA and FMS (fused multiply-subtract) instructions for the real component, we compute $a_r b_r + a_i b_i$ treating all products as positive, then apply a single bitwise XOR with `0x80000000` to flip the sign bits.
483
- This eliminates execution port contention, allowing dual FMA units to run at full capacity.
542
+ This avoids execution port contention between FMA and FMS, letting dual FMA units stay occupied.
484
543
 
485
544
  ```c
486
545
  for (...) { // Complex multiply optimization: XOR sign flip after the loop
@@ -490,6 +549,20 @@ for (...) { // Complex multiply optimization: XOR sign flip after the loop
490
549
  sum_real = xor(sum_real, 0x80000000); // Single XOR after loop
491
550
  ```
492
551
 
552
+ ## Reading Materials
553
+
554
+ Beyond the READMEs in this repository, there are several standalone articles covering different evolution steps and features of this library.
555
+
556
+ - [NumKong: 2'000 Mixed Precision Kernels For All](https://ashvardanian.com/posts/numkong/)
557
+ - [Hiding x86 Port Latency for 330 GB/s/core Reductions](https://ashvardanian.com/posts/cpu-ports/)
558
+ - [Understanding SIMD: Infinite Complexity of Trivial Problems](https://ashvardanian.com/posts/understanding-simd-complexity/)
559
+ - [NumPy vs BLAS: Losing 90% of Throughput](https://ashvardanian.com/posts/numpy-vs-blas-costs/)
560
+ - [5x Faster Set Intersections: SVE2, AVX-512, & NEON](https://ashvardanian.com/posts/simd-set-intersections-sve2-avx512/)
561
+ - [Python, C, Assembly - 2'500x Faster Cosine Similarity](https://ashvardanian.com/posts/python-c-assembly-comparison/)
562
+ - [GCC Compiler vs Human - 119x Faster Assembly](https://ashvardanian.com/posts/gcc-12-vs-avx512fp16/)
563
+ - [Accelerating JavaScript arrays by 10x for Vector Search](https://ashvardanian.com/posts/javascript-ai-vector-search/)
564
+ - [SciPy distances... up to 200x faster with AVX-512 & SVE](https://ashvardanian.com/posts/simsimd-faster-scipy/)
565
+
493
566
  ## License
494
567
 
495
568
  Feel free to use the project under Apache 2.0 or the Three-clause BSD license at your preference.