numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,573 @@
1
+ # NumKong for C and C++
2
+
3
+ NumKong's native SDK is the reference surface for the project.
4
+ The plain C ABI exposes every kernel family directly: dot products, dense distances, binary metrics, probability divergences, geospatial solvers, curved-space kernels, sparse intersections, mesh alignment, packed matrix multiplication, symmetric self-similarity, and late-interaction scoring.
5
+ The ABI is stable, versioned, and callable from any language that can load a shared library.
6
+ There is no runtime overhead: no hidden thread pool, no implicit allocation, no garbage collector interaction.
7
+ The C++ layer stays thin, typed, allocator-aware, and close enough to inline through, adding type-level result promotion and owning containers without hiding the dispatch model or the mixed-precision policy.
8
+
9
+ ## Quickstart
10
+
11
+ ```c
12
+ #include <numkong/numkong.h>
13
+ #include <stdio.h>
14
+
15
+ int main(void) {
16
+ nk_f32_t a[] = {1, 2, 3};
17
+ nk_f32_t b[] = {4, 5, 6};
18
+ nk_f64_t dot = 0;
19
+ nk_configure_thread(nk_capabilities());
20
+ nk_dot_f32(a, b, 3, &dot); // widened f32 → f64 output
21
+ printf("dot=%f\n", dot);
22
+ return 0;
23
+ }
24
+ ```
25
+
26
+ ## Highlights
27
+
28
+ This is the most complete SDK in the project.
29
+ It is the right layer if you want exact control over dtypes, allocators, packed buffers, dispatch, and host-side partitioning.
30
+
31
+ __Full kernel surface.__
32
+ All public operation families are reachable from native code.
33
+ __No hidden threading.__
34
+ NumKong does not own a thread pool.
35
+ __No hidden allocation.__
36
+ C APIs take caller-owned buffers, and C++ wrappers make ownership explicit.
37
+ __Mixed precision by default.__
38
+ Small storage types widen into safer accumulator and output types.
39
+ __Allocator-aware containers.__
40
+ `vector`, `tensor`, `packed_matrix`, and `packed_maxsim` accept custom allocators.
41
+ __Unaligned inputs are fine.__
42
+ Packing handles internal layout itself and does not require caller-side alignment.
43
+
44
+ ## Ecosystem Comparison
45
+
46
+ | Feature | NumKong | [OpenBLAS](https://github.com/OpenMathLib/OpenBLAS) | [Eigen](https://gitlab.com/libeigen/eigen) |
47
+ | ---------------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------- | --------------------------------------------------------------------------------------- |
48
+ | Operation families | dots, distances, binary, probability, geospatial, curved, mesh, sparse, MaxSim, elementwise, reductions, cast, trig | dense linear algebra only | dense LA, some reductions and elementwise |
49
+ | Precision | Sub-byte to Float64 dtypes; automatic widening per scalar type; Kahan-compensated summation; 0 ULP Float32/Float64 where applicable | Float32, Float64 only; same-type in/out; no compensated summation | Float16/BFloat16 partial; no Float8 or sub-byte; manual casts; no compensated summation |
50
+ | Runtime SIMD dispatch | per-thread at runtime across x86, ARM, RISC-V | load-time CPU detection; one kernel set per process | compile-time ISA flags only |
51
+ | Packed matrix, GEMM-like | `packed_matrix` — pack once, reuse across query batches | internal opaque packing per GEMM call; no persistent packed form | no equivalent packed reuse abstraction |
52
+ | Symmetric kernels, SYRK-like | skips duplicate pairs, up to 2x speedup for self-distance | `SSYRK`/`DSYRK` for rank-k updates | `.selfadjointView` for rank-k updates |
53
+ | Memory model | Caller-owned buffers; C++ adds `tensor<T,A>` with per-container allocators | Caller-managed buffers; no container abstraction | Lazy expression templates avoid most temporaries; `aligned_allocator` provided |
54
+
55
+
56
+
57
+ ## Installation
58
+
59
+ With CMake `FetchContent`:
60
+
61
+ ```cmake
62
+ include(FetchContent)
63
+
64
+ FetchContent_Declare(
65
+ numkong
66
+ GIT_REPOSITORY https://github.com/ashvardanian/NumKong.git
67
+ GIT_SHALLOW TRUE
68
+ )
69
+ FetchContent_MakeAvailable(numkong)
70
+
71
+ target_link_libraries(my_target PRIVATE numkong)
72
+ ```
73
+
74
+ Vendored:
75
+
76
+ ```cmake
77
+ add_subdirectory(external/NumKong)
78
+ target_link_libraries(my_target PRIVATE numkong)
79
+ ```
80
+
81
+ Header-only C++ usage also works for direct template wrappers.
82
+ Most applications should still build the library once and keep `NK_DYNAMIC_DISPATCH=1`.
83
+
84
+ ## The C ABI
85
+
86
+ The C ABI keeps the operation family, input dtype, and output policy visible in the symbol name.
87
+ That makes widening obvious at the call site.
88
+
89
+ ```c
90
+ #include <numkong/numkong.h>
91
+
92
+ nk_i8_t a[1536];
93
+ nk_i8_t b[1536];
94
+ nk_i32_t dot = 0; // widened from int8 storage
95
+ nk_f32_t l2 = 0; // widened from int8 storage
96
+
97
+ nk_dot_i8(a, b, 1536, &dot);
98
+ nk_euclidean_i8(a, b, 1536, &l2);
99
+ ```
100
+
101
+ If you want runtime-selected kernels without naming a specific ISA, use the punned dispatch layer:
102
+
103
+ ```c
104
+ nk_metric_dense_punned_t angular = 0;
105
+ nk_capability_t used = nk_cap_serial_k;
106
+ nk_find_kernel_punned(nk_kernel_angular_k, nk_f32_k,
107
+ nk_capabilities(), (nk_kernel_punned_t *)&angular, &used);
108
+
109
+ nk_f32_t a[768], b[768], result = 0;
110
+ angular(a, b, 768, &result);
111
+ ```
112
+
113
+ That is the lowest-level dynamic path.
114
+ The typed C++ wrappers usually read better unless you are building your own dispatch layer.
115
+
116
+ ## The C++ Layer
117
+
118
+ The C++ wrappers add three things.
119
+ They add type-level result promotion.
120
+ They add explicit owning and non-owning containers.
121
+ They add allocator-aware packed objects for repeated matrix workloads.
122
+
123
+ ```cpp
124
+ #include <numkong/numkong.hpp>
125
+
126
+ namespace nk = ashvardanian::numkong;
127
+
128
+ int main() {
129
+ nk::f32_t a[3] = {1, 2, 3}, b[3] = {4, 5, 6};
130
+ nk::f64_t dot {};
131
+ nk::dot(a, b, 3, &dot); // default result type is nk::f32_t::dot_result_t == nk::f64_t
132
+ }
133
+ ```
134
+
135
+ The API is intentionally not STL-shaped.
136
+ `vector_view`, `tensor_view`, and `matrix_view` prioritize signed strides, sub-byte storage, and kernel compatibility over resizable-container ergonomics.
137
+
138
+ ## Scalar Types and Promotions
139
+
140
+ The scalar wrappers in `include/numkong/types.hpp` are storage-first types.
141
+ They encode raw layout, default output types, and the kernel function pointer signatures for each family.
142
+
143
+ | Type | Layout | Bytes | Range | Inf | NaN |
144
+ | ----------- | ---------------- | ----- | --------------- | --- | --- |
145
+ | `nk_f16_t` | 1+5+10 | 2 | ±65504 | yes | yes |
146
+ | `nk_bf16_t` | 1+8+7 | 2 | ±3.4×10³⁸ | yes | yes |
147
+ | `nk_e4m3_t` | 1+4+3 | 1 | ±448 | no | yes |
148
+ | `nk_e5m2_t` | 1+5+2 | 1 | ±57344 | yes | yes |
149
+ | `nk_e2m3_t` | 1+2+3 | 1 | ±7.5 | no | no |
150
+ | `nk_e3m2_t` | 1+3+2 | 1 | ±28 | no | no |
151
+ | `nk_u1x8_t` | 8 packed bits | 1 | 0 or 1 per bit | — | — |
152
+ | `nk_u4x2_t` | 2x4-bit unsigned | 1 | 0-15 per nibble | — | — |
153
+ | `nk_i4x2_t` | 2x4-bit signed | 1 | -8 to 7 per nibble | — | — |
154
+
155
+ The layout column shows sign, exponent, and mantissa bit counts for floating-point types.
156
+ For `nk_f16_t`, 1+5+10 means one sign bit, five exponent bits, and ten mantissa bits, totaling 16 bits stored in 2 bytes.
157
+ For `nk_bf16_t`, the wider exponent field (8 bits) gives the same dynamic range as IEEE 754 single precision but with reduced mantissa precision.
158
+ The Float8 types `nk_e4m3_t` and `nk_e5m2_t` follow the OFP8 specification.
159
+ The narrower `nk_e2m3_t` and `nk_e3m2_t` types are MX-compatible micro-floats.
160
+ Sub-byte types `nk_u1x8_t`, `nk_u4x2_t`, and `nk_i4x2_t` pack multiple logical values into a single byte.
161
+
162
+ Default promotions are encoded on the type.
163
+ For example, `f32_t::dot_result_t` is wider than `f32_t`.
164
+ `i8_t::dot_result_t` is `i32_t`.
165
+ `u1x8_t::dot_result_t` is `u32_t`.
166
+
167
+ The higher-level templates use `result_type_ = typename in_type_::dot_result_t` and similar defaults.
168
+ The fast typed overloads are constrained so that overriding the result type away from the native policy can disable the specialized path and fall back to the more generic one.
169
+
170
+ ## Dot Products
171
+
172
+ Dot products are one of the broadest parts of the native SDK.
173
+ They include real, complex, packed-binary, mini-float, and mixed-precision forms.
174
+
175
+ ```c
176
+ nk_f32c_t a[384];
177
+ nk_f32c_t b[384];
178
+ nk_f32_t out[2] = {0, 0};
179
+
180
+ nk_dot_f32c(a, b, 384, out); // complex inner product
181
+ nk_vdot_f32c(a, b, 384, out); // conjugated variant, like numpy.vdot
182
+ ```
183
+
184
+ For quantized retrieval pipelines, the storage format often matters more than the nominal math family.
185
+ The native SDK lets you keep the compact representation and still get a widened output.
186
+
187
+ ## Dense Distances
188
+
189
+ The dense spatial kernels cover the SciPy-style `sqeuclidean`, `euclidean`, and `angular` family.
190
+ The important difference is that storage type and output type are not forced to match.
191
+
192
+ ```c
193
+ nk_f16_t a[768];
194
+ nk_f16_t b[768];
195
+ nk_f32_t sqeuclidean = 0, euclidean = 0, angular = 0;
196
+
197
+ // `_Float16` support varies across compilers, and
198
+ // auto-vectorization targets `f32` — not `f16`.
199
+ nk_sqeuclidean_f16(a, b, 768, &sqeuclidean);
200
+ nk_euclidean_f16(a, b, 768, &euclidean);
201
+ nk_angular_f16(a, b, 768, &angular);
202
+ ```
203
+
204
+ For `i8`, `u8`, `i4`, `u4`, and `u1`, the widening is even more important.
205
+ The output type is chosen to avoid the obvious overflow trap of same-width accumulation.
206
+
207
+ ## Set Similarity
208
+
209
+ Packed-binary metrics operate on packed words, not on byte-wise booleans.
210
+ That is why `u1x8_t` exists as a storage type instead of pretending that `bool[8]` is the right primitive.
211
+
212
+ ```c
213
+ nk_u1x8_t a[128], b[128];
214
+ nk_u32_t hamming = 0;
215
+ nk_f32_t jaccard = 0;
216
+ nk_hamming_u1(a, b, 128 * 8, &hamming);
217
+ nk_jaccard_u1(a, b, 128 * 8, &jaccard);
218
+ ```
219
+
220
+ Integer set Jaccard works on sorted arrays of integer identifiers.
221
+
222
+ ```c
223
+ nk_u32_t set_a[] = {1, 3, 5, 7, 9}, set_b[] = {3, 5, 8, 9, 10};
224
+ nk_f32_t jaccard_sets = 0;
225
+ nk_jaccard_u32(set_a, set_b, 5, &jaccard_sets); // |A ∩ B| / |A ∪ B|
226
+ assert(jaccard_sets > 0.0f && jaccard_sets < 1.0f && "|A ∩ B| / |A ∪ B| should be in (0, 1)");
227
+ ```
228
+
229
+ ## Probability Metrics
230
+
231
+ Probability kernels target divergences directly instead of making you rebuild them from scalar loops.
232
+
233
+ ```c
234
+ nk_f32_t p[] = {0.2f, 0.3f, 0.5f}, q[] = {0.1f, 0.3f, 0.6f};
235
+ nk_f64_t kl_forward = 0, kl_reverse = 0, js_forward = 0, js_reverse = 0;
236
+
237
+ nk_kld_f32(p, q, 3, &kl_forward);
238
+ nk_kld_f32(q, p, 3, &kl_reverse);
239
+ assert(kl_forward != kl_reverse && "KLD is asymmetric");
240
+
241
+ nk_jsd_f32(p, q, 3, &js_forward);
242
+ nk_jsd_f32(q, p, 3, &js_reverse);
243
+ assert(js_forward == js_reverse && "JSD is symmetric");
244
+ ```
245
+
246
+ These paths are especially valuable once you move below `f64`.
247
+ Naive implementations are usually dominated by repeated scalar transcendental calls and weak accumulation policy.
248
+
249
+ ## Geospatial Metrics
250
+
251
+ The native SDK exposes both the fast spherical approximation and the more accurate ellipsoidal solver.
252
+ Inputs are in radians.
253
+ Outputs are in meters.
254
+
255
+ ```c
256
+ // Statue of Liberty (40.6892°N, 74.0445°W) → Big Ben (51.5007°N, 0.1246°W)
257
+ nk_f64_t liberty_lat[] = {0.7101605100}, liberty_lon[] = {-1.2923203180};
258
+ nk_f64_t big_ben_lat[] = {0.8988567821}, big_ben_lon[] = {-0.0021746802};
259
+
260
+ nk_f64_t distance[1];
261
+ nk_vincenty_f64(liberty_lat, liberty_lon, big_ben_lat, big_ben_lon, 1, distance); // ≈ 5,589,857 m (ellipsoidal, baseline)
262
+ nk_haversine_f64(liberty_lat, liberty_lon, big_ben_lat, big_ben_lon, 1, distance); // ≈ 5,543,723 m (spherical, ~46 km less)
263
+
264
+ // Vincenty in f32 — drifts ~2 m from f64
265
+ nk_f32_t liberty_lat32[] = {0.7101605100f}, liberty_lon32[] = {-1.2923203180f};
266
+ nk_f32_t big_ben_lat32[] = {0.8988567821f}, big_ben_lon32[] = {-0.0021746802f};
267
+ nk_f32_t distance_f32[1];
268
+ nk_vincenty_f32(liberty_lat32, liberty_lon32, big_ben_lat32, big_ben_lon32, 1, distance_f32); // ≈ 5,589,859 m (+2 m drift)
269
+ ```
270
+
271
+ ## Curved Metrics
272
+
273
+ Curved-space kernels are separate from the flat Euclidean family because their dataflow is different.
274
+ They combine vectors with an extra metric tensor or covariance inverse.
275
+
276
+ ```c
277
+ // Complex bilinear form: aᴴ M b
278
+ nk_f32c_t a[32], b[32], metric[32 * 32];
279
+ nk_f64c_t result = {0, 0};
280
+ nk_bilinear_f32c(a, b, metric, 32, &result);
281
+
282
+ // Real Mahalanobis distance: √((a−b)ᵀ M⁻¹ (a−b))
283
+ nk_f32_t x[64], y[64], inv_cov[64 * 64];
284
+ nk_f64_t distance = 0;
285
+ nk_mahalanobis_f32(x, y, inv_cov, 64, &distance);
286
+ ```
287
+
288
+ ## Tensors, Views, and Memory Layout
289
+
290
+ The native containers are where most integration mistakes happen.
291
+ They need to be documented explicitly.
292
+
293
+ - `vector<T, A>` owns storage and defaults to `aligned_allocator<T, 64>`.
294
+ - `vector_view<T>` is a const strided non-owning view.
295
+ - `vector_span<T>` is a mutable strided non-owning view.
296
+ - `tensor<T, A, R>` owns rank-`R` storage and also defaults to aligned allocation.
297
+ - `tensor_view<T>` and `tensor_span<T>` are the view forms.
298
+ - `matrix`, `matrix_view`, and `matrix_span` are rank-2 aliases.
299
+
300
+ The important layout rules are:
301
+
302
+ - Signed strides are supported by the view types.
303
+ - Reversed and sliced views are valid for many elementwise and reduction kernels.
304
+ - `reshape` and `flatten` require contiguous layout.
305
+ - Matrix-style kernels care about _row contiguity_, not just total tensor contiguity.
306
+ - Negative strides are conceptually valid views, but matrix packing and packed matmul workflows are not written around them.
307
+
308
+ Memory ownership is explicit.
309
+ `vector` and `tensor` deallocate through their allocator.
310
+ `vector_view`, `tensor_view`, `matrix_view`, and spans never own memory.
311
+ And heterogenous index types for `operator[]` enable more interesting access patterns:
312
+
313
+
314
+ ```cpp
315
+ #include <numkong/numkong.hpp>
316
+
317
+ namespace nk = ashvardanian::numkong;
318
+ using nk::slice, nk::all, nk::f32_t, nk::tensor, nk::tensor_view;
319
+
320
+ auto t = tensor<f32_t>::try_from({
321
+ {1, 2, 3},
322
+ {4, 5, 6},
323
+ {7, 8, 9},
324
+ });
325
+
326
+ f32_t scalar_at_2d_coordinate = t[1, -1];
327
+ f32_t scalar_at_global_offset = t[5];
328
+ assert(scalar_at_2d_coordinate == scalar_at_global_offset && "same value");
329
+
330
+ tensor_view<f32_t> scalar_as_tensor = t[1, 1, slice];
331
+ tensor_view<f32_t> second_row = t[1, slice];
332
+ tensor_view<f32_t> second_column = t[all, 1, slice];
333
+ assert(second_row[1] == second_column[1] && "same value");
334
+ ```
335
+
336
+ You can also use a more traditional syntax with member functions, also leveraging built-in functionality for hardware-accelerated strided reductions and elementwise operations along any axis combination.
337
+ Similar to NumPy, but statically typed:
338
+
339
+ ```cpp
340
+ auto second_column = t[all, 1, slice]; // strided column view → {2, 5, 8}
341
+ auto minimum_index = nk::argmin(second_column); // index of the minimum in the second column
342
+ ```
343
+
344
+ The view types are conceptually close to `std::mdspan` from C++23.
345
+ The main differences are sub-byte element support, signed strides, and the kernel dispatch integration that `std::mdspan` does not provide.
346
+ If your codebase already uses `std::mdspan`, converting at the NumKong call boundary is straightforward:
347
+
348
+ ```cpp
349
+ #include <numkong/numkong.hpp>
350
+ #include <mdspan>
351
+
352
+ namespace nk = ashvardanian::numkong;
353
+
354
+ // Existing std::mdspan from your codebase
355
+ float data[] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
356
+ auto md = std::mdspan<float, std::extents<std::size_t, 3, 3>>(data);
357
+
358
+ // Wrap into a NumKong matrix_view — the data pointer and extents map directly
359
+ auto view = nk::matrix_view<nk::f32_t>(
360
+ reinterpret_cast<nk::f32_t const *>(md.data_handle()),
361
+ md.extent(0), md.extent(1));
362
+
363
+ // Now use any NumKong kernel on it
364
+ nk::f64_t dot {};
365
+ nk::dot(view.row(0), view.row(1), md.extent(1), &dot);
366
+ ```
367
+
368
+ ## Packed Matrix Kernels for GEMM-Like Workloads
369
+
370
+ This is the most distinctive native subsystem outside the raw vector kernels.
371
+ It is the right tool when the right-hand side is reused many times.
372
+
373
+ ```cpp
374
+ #include <numkong/numkong.hpp>
375
+
376
+ namespace nk = ashvardanian::numkong;
377
+
378
+ auto a = nk::tensor<nk::f32_t>::try_full({2, 4}, nk::f32_t {1});
379
+ auto b = nk::tensor<nk::f32_t>::try_full({3, 4}, nk::f32_t {2});
380
+ auto packed = nk::packed_matrix<nk::f32_t>::try_pack(b.as_matrix_view());
381
+
382
+ // Dot products, angular distances, and Euclidean distances all reuse the same packed B
383
+ auto dots = nk::try_dots_packed(a.as_matrix_view(), packed);
384
+ auto angulars = nk::try_angulars_packed(a.as_matrix_view(), packed);
385
+ auto euclideans = nk::try_euclideans_packed(a.as_matrix_view(), packed);
386
+ ```
387
+
388
+ This is GEMM-like in the workload shape, not in the strict BLAS API.
389
+ The useful economics are:
390
+
391
+ - one-time packing of `B`
392
+ - one-time type preconversion where needed
393
+ - depth padding handled internally
394
+ - per-column norm reuse for `angulars_packed` and `euclideans_packed`
395
+ - repeated reuse of the same packed RHS across many `A` batches
396
+
397
+ Caller-side alignment is not required.
398
+ Owned `packed_matrix` storage uses its allocator.
399
+ The C ABI also exposes `nk_dots_packed_size_*` so you can `malloc` the exact external buffer yourself.
400
+
401
+ ## Symmetric Kernels for SYRK-Like Workloads
402
+
403
+ The symmetric kernels solve a different problem.
404
+ They compute self-similarity or self-distance without paying for both triangles independently.
405
+
406
+ ```cpp
407
+ auto vectors = nk::tensor<nk::f32_t>::try_full({100, 768}, nk::f32_t {1});
408
+ auto gram = nk::try_dots_symmetric(vectors.as_matrix_view());
409
+ auto angular_dists = nk::try_angulars_symmetric(vectors.as_matrix_view());
410
+ auto euclidean_dists = nk::try_euclideans_symmetric(vectors.as_matrix_view());
411
+ ```
412
+
413
+ This is SYRK-like in the sense that the output is square and symmetric.
414
+ The important difference from packed GEMM-style work is the partitioning model.
415
+ You typically split by output row windows, not by distinct left batches against a shared packed right-hand side.
416
+
417
+ The arithmetic advantage is direct and honest.
418
+ The symmetric kernels avoid recomputing both `(i, j)` and `(j, i)` pairs.
419
+ That cuts the pair count almost in half before any micro-kernel details matter.
420
+
421
+ ## Sparse Operations and Intersections
422
+
423
+ Sparse helpers cover sorted-index intersection and weighted sparse dot products.
424
+
425
+ ```c
426
+ nk_u32_t a_idx[] = {1, 3, 5, 7}, b_idx[] = {3, 4, 5, 8};
427
+ nk_u32_t intersection[4];
428
+ nk_size_t count = 0;
429
+ nk_sparse_intersect_u32(a_idx, b_idx, 4, 4, intersection, &count);
430
+ assert(count == 2 && "indices 3 and 5");
431
+
432
+ nk_f32_t a_weights[] = {1.0f, 2.0f, 3.0f, 4.0f};
433
+ nk_f32_t b_weights[] = {5.0f, 6.0f, 7.0f, 8.0f};
434
+ nk_f64_t result = 0;
435
+ nk_sparse_dot_u32f32(a_idx, b_idx, a_weights, b_weights, 4, 4, &result);
436
+ assert(result > 0 && "weighted dot over shared indices");
437
+ ```
438
+
439
+ This family deserves explicit mention because it is not just sparse dot.
440
+ Set intersection itself is often the workload.
441
+
442
+ ## Geometric Mesh Alignment
443
+
444
+ Mesh alignment returns structured outputs, not just one scalar.
445
+ The native API covers `rmsd`, `kabsch`, and `umeyama`.
446
+
447
+ ```c
448
+ // Three 3D points, target is source scaled by 2x
449
+ nk_f32_t source[] = {0, 0, 0, 1, 0, 0, 0, 1, 0};
450
+ nk_f32_t target[] = {0, 0, 0, 2, 0, 0, 0, 2, 0};
451
+ nk_f32_t a_centroid[3], b_centroid[3], rotation[9];
452
+ nk_f32_t scale = 0, rmsd = 0;
453
+
454
+ nk_umeyama_f32(source, target, 3, a_centroid, b_centroid, rotation, &scale, &rmsd);
455
+ assert(rmsd < 1e-6f && "umeyama should recover exact alignment");
456
+ assert(scale > 1.99f && scale < 2.01f && "umeyama should recover 2x scale");
457
+ ```
458
+
459
+ This family is separate from curved metrics because the output is a transform, not just a distance.
460
+
461
+ ## MaxSim and Late Interaction
462
+
463
+ MaxSim is the late-interaction primitive used by systems such as [ColBERT](https://arxiv.org/abs/2004.12832).
464
+ It is not generic matrix multiplication.
465
+ It packs query and document token vectors into a scoring-specific layout and computes a late-interaction score.
466
+
467
+ ```cpp
468
+ auto queries = nk::tensor<nk::bf16_t>::try_full({32, 128}, nk::bf16_t::one());
469
+ auto docs = nk::tensor<nk::bf16_t>::try_full({192, 128}, nk::bf16_t::one());
470
+
471
+ auto q = nk::packed_maxsim<nk::bf16_t>::try_pack(queries.as_matrix_view());
472
+ auto d = nk::packed_maxsim<nk::bf16_t>::try_pack(docs.as_matrix_view());
473
+ auto score = nk::maxsim(q, d);
474
+ ```
475
+
476
+ `packed_maxsim` is allocator-aware in the same way as `packed_matrix`.
477
+ Its footprint is exposed through `size_bytes()`.
478
+
479
+ ## Runtime Dispatch and Capabilities
480
+
481
+ Dynamic dispatch is the default recommendation for shipping one binary across many CPU generations.
482
+ `nk_configure_thread` configures rounding behavior and enables CPU-specific acceleration features such as Intel AMX.
483
+ It must be called once per thread before any kernel invocation and returns 1 on success, 0 on failure.
484
+
485
+ ```c
486
+ nk_capability_t caps = nk_capabilities();
487
+ nk_configure_thread(caps);
488
+ if (caps & nk_cap_sapphireamx_k) { /* AMX available */ }
489
+ ```
490
+
491
+ For exact register-level details, see `capabilities.h`.
492
+ The C++ wrappers can also call directly into named backends if you want to pin a path for testing or benchmarking.
493
+
494
+ ## Parallelism and Fork Union
495
+
496
+ NumKong does not manage its own threads.
497
+ That is deliberate.
498
+ The library is designed to sit inside a larger scheduler.
499
+
500
+ GEMM-like packed work is usually partitioned across row ranges of `A` against one shared packed `B`:
501
+
502
+ ```cpp
503
+ using nk::range, nk::all, nk::slice;
504
+ fork_union.parallel_for(0, worker_count, [&](std::size_t t) {
505
+ auto start = t * rows_per_worker;
506
+ auto stop = std::min(start + rows_per_worker, total_rows);
507
+ auto a_slice = a[range(start, stop), all, slice].as_matrix_view();
508
+ auto c_slice = c[range(start, stop), all, slice].as_matrix_span();
509
+ nk::dots_packed<value_type_>(a_slice, packed, c_slice);
510
+ });
511
+ ```
512
+
513
+ SYRK-like symmetric work is partitioned by output row windows on one matrix:
514
+
515
+ ```cpp
516
+ fork_union.parallel_for(0, worker_count, [&](std::size_t t) {
517
+ auto start = t * rows_per_worker;
518
+ auto count = std::min(rows_per_worker, total_rows - start);
519
+ nk::dots_symmetric<value_type_>(vectors.as_matrix_view(), gram.as_matrix_span(), start, count);
520
+ nk::angulars_symmetric<value_type_>(vectors.as_matrix_view(), angular_dists.as_matrix_span(), start, count);
521
+ });
522
+ ```
523
+
524
+ We recommend [Fork Union](https://github.com/ashvardanian/ForkUnion) for that host-side orchestration.
525
+ OpenMP is still a reasonable fit if the rest of your application already uses it.
526
+ Manual thread pools and task systems also work well because the kernels have explicit row-range interfaces.
527
+
528
+ The C++26 `std::execution` framework (adopted from P2300) is a natural fit here.
529
+ NumKong kernels take explicit row-range parameters and do not own threads, so they compose directly with `std::execution::bulk` or any sender/receiver scheduler.
530
+ When executors ship in your toolchain, replacing the `parallel_for` lambda above with a `bulk` sender is a one-line change.
531
+
532
+ ## Integration Notes
533
+
534
+ - The C ABI is the easiest place to integrate with foreign runtimes and custom allocators.
535
+ - The C++ layer is the easiest place to express typed packed workflows, tensor slicing, and allocator-aware ownership.
536
+ - `aligned_allocator` defaults to 64-byte alignment for owned containers, but unaligned caller inputs are still valid for the kernels that accept raw pointers or views.
537
+ - If you override result types away from the scalar defaults, document that choice carefully because it can change both performance and numerical policy.
538
+
539
+ ## CMake Configuration
540
+
541
+ The main user-facing CMake options are:
542
+
543
+ - `NK_BUILD_SHARED` builds a shared library, ON by default for standalone builds and OFF when included as a subdirectory.
544
+ - `NK_BUILD_TEST` and `NK_BUILD_BENCH` enable precision tests and benchmarks respectively, both OFF by default.
545
+ - `NK_DYNAMIC_DISPATCH=1` compiles all backends into one binary and selects at runtime via `nk_capabilities()`, recommended for shipping one binary across CPU generations.
546
+ - `NK_COMPARE_TO_BLAS` and `NK_COMPARE_TO_MKL` link benchmarks against a system BLAS or Intel MKL, each accepting `AUTO`, `ON`, or `OFF` with `AUTO` as the default.
547
+
548
+ The build enforces C99 for the C layer and C++23 for the C++ layer.
549
+
550
+ ```sh
551
+ cmake -B build -D CMAKE_BUILD_TYPE=Release -D NK_BUILD_TEST=ON
552
+ cmake -B build -D NK_DYNAMIC_DISPATCH=1 -D NK_BUILD_BENCH=ON -D NK_COMPARE_TO_MKL=ON
553
+ ```
554
+
555
+ ## Cross-Compilation
556
+
557
+ Toolchain files for cross-compilation live in `cmake/`:
558
+
559
+ - `cmake/toolchain-aarch64-gnu.cmake` for ARM64 Linux with the GNU toolchain.
560
+ - `cmake/toolchain-riscv64-gnu.cmake` for RISC-V 64 Linux with the GNU toolchain.
561
+ - `cmake/toolchain-android-arm64.cmake` for Android ARM64 via the NDK.
562
+ - `cmake/toolchain-x86_64-llvm.cmake` and `cmake/toolchain-riscv64-llvm.cmake` for Clang/LLD builds.
563
+ - `cmake/toolchain-wasm.cmake`, `toolchain-wasm64.cmake`, and `toolchain-wasi.cmake` for WebAssembly targets.
564
+
565
+ ```sh
566
+ cmake -B build -D CMAKE_TOOLCHAIN_FILE=cmake/toolchain-aarch64-gnu.cmake
567
+ ```
568
+
569
+ ## Threading Model
570
+
571
+ NumKong does not use OpenMP and does not create a hidden thread pool.
572
+ Standard pthreads are linked via CMake's `Threads` package.
573
+ Parallelism is host-controlled: partition work across row ranges and dispatch through Fork Union, `std::thread`, or any external scheduler.
@@ -0,0 +1,129 @@
1
+ // Hierarchical Clang C Module Map for NumKong
2
+ //
3
+ // Layout for fast partial recompilation:
4
+ // CNumKongCore → CNumKongKernels (per-header submodules) → CNumKongCapabilities → CNumKong
5
+ //
6
+ // `config_macros` documents which macros affect the ABI of each module.
7
+ // SPM's `cSettings: [.define(...)]` only applies when compiling .c sources,
8
+ // NOT when Swift imports these headers — but listing the macros here prevents
9
+ // silent miscompilation if flags ever do differ between TUs.
10
+
11
+ // Layer 1: Core types (changes rarely, no dependencies)
12
+ module CNumKongCore {
13
+ config_macros [exhaustive] NK_DYNAMIC_DISPATCH, NK_NATIVE_F16, NK_NATIVE_BF16
14
+
15
+ header "numkong/types.h"
16
+ header "numkong/scalar.h"
17
+ export *
18
+ }
19
+
20
+ // Layer 2: Kernel families (independent, change frequently during development)
21
+ module CNumKongKernels {
22
+ config_macros [exhaustive] NK_DYNAMIC_DISPATCH, NK_NATIVE_F16, NK_NATIVE_BF16
23
+
24
+ use CNumKongCore
25
+
26
+ explicit module Cast {
27
+ header "numkong/cast.h"
28
+ export *
29
+ }
30
+
31
+ explicit module Dot {
32
+ header "numkong/dot.h"
33
+ export *
34
+ }
35
+
36
+ explicit module Dots {
37
+ header "numkong/dots.h"
38
+ export *
39
+ }
40
+
41
+ explicit module Spatial {
42
+ header "numkong/spatial.h"
43
+ export *
44
+ }
45
+
46
+ explicit module Spatials {
47
+ header "numkong/spatials.h"
48
+ export *
49
+ }
50
+
51
+ explicit module Set {
52
+ header "numkong/set.h"
53
+ export *
54
+ }
55
+
56
+ explicit module Sets {
57
+ header "numkong/sets.h"
58
+ export *
59
+ }
60
+
61
+ explicit module Geospatial {
62
+ header "numkong/geospatial.h"
63
+ export *
64
+ }
65
+
66
+ explicit module Each {
67
+ header "numkong/each.h"
68
+ export *
69
+ }
70
+
71
+ explicit module Trigonometry {
72
+ header "numkong/trigonometry.h"
73
+ export *
74
+ }
75
+
76
+ explicit module Reduce {
77
+ header "numkong/reduce.h"
78
+ export *
79
+ }
80
+
81
+ explicit module MaxSim {
82
+ header "numkong/maxsim.h"
83
+ export *
84
+ }
85
+
86
+ explicit module Mesh {
87
+ header "numkong/mesh.h"
88
+ export *
89
+ }
90
+
91
+ explicit module Curved {
92
+ header "numkong/curved.h"
93
+ export *
94
+ }
95
+
96
+ explicit module Probability {
97
+ header "numkong/probability.h"
98
+ export *
99
+ }
100
+
101
+ explicit module Sparse {
102
+ header "numkong/sparse.h"
103
+ export *
104
+ }
105
+
106
+ export *
107
+ }
108
+
109
+ // Layer 3: Capability detection (depends on Core, used by umbrella)
110
+ module CNumKongCapabilities {
111
+ config_macros [exhaustive] NK_DYNAMIC_DISPATCH, NK_NATIVE_F16, NK_NATIVE_BF16
112
+
113
+ use CNumKongCore
114
+
115
+ header "numkong/capabilities.h"
116
+ export *
117
+ }
118
+
119
+ // Layer 4: Unified API (top-level umbrella module)
120
+ module CNumKong {
121
+ config_macros [exhaustive] NK_DYNAMIC_DISPATCH, NK_NATIVE_F16, NK_NATIVE_BF16
122
+
123
+ use CNumKongCore
124
+ use CNumKongKernels
125
+ use CNumKongCapabilities
126
+
127
+ header "numkong/numkong.h"
128
+ export *
129
+ }