numkong 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +495 -0
- package/binding.gyp +540 -0
- package/c/dispatch.h +512 -0
- package/c/dispatch_bf16.c +389 -0
- package/c/dispatch_bf16c.c +52 -0
- package/c/dispatch_e2m3.c +263 -0
- package/c/dispatch_e3m2.c +243 -0
- package/c/dispatch_e4m3.c +276 -0
- package/c/dispatch_e5m2.c +272 -0
- package/c/dispatch_f16.c +376 -0
- package/c/dispatch_f16c.c +58 -0
- package/c/dispatch_f32.c +378 -0
- package/c/dispatch_f32c.c +99 -0
- package/c/dispatch_f64.c +296 -0
- package/c/dispatch_f64c.c +98 -0
- package/c/dispatch_i16.c +96 -0
- package/c/dispatch_i32.c +89 -0
- package/c/dispatch_i4.c +150 -0
- package/c/dispatch_i64.c +86 -0
- package/c/dispatch_i8.c +289 -0
- package/c/dispatch_other.c +330 -0
- package/c/dispatch_u1.c +148 -0
- package/c/dispatch_u16.c +124 -0
- package/c/dispatch_u32.c +118 -0
- package/c/dispatch_u4.c +150 -0
- package/c/dispatch_u64.c +102 -0
- package/c/dispatch_u8.c +303 -0
- package/c/numkong.c +950 -0
- package/include/README.md +573 -0
- package/include/module.modulemap +129 -0
- package/include/numkong/attention/sapphireamx.h +1361 -0
- package/include/numkong/attention/sme.h +2066 -0
- package/include/numkong/attention.h +49 -0
- package/include/numkong/capabilities.h +748 -0
- package/include/numkong/cast/README.md +262 -0
- package/include/numkong/cast/haswell.h +975 -0
- package/include/numkong/cast/icelake.h +470 -0
- package/include/numkong/cast/neon.h +1192 -0
- package/include/numkong/cast/rvv.h +1021 -0
- package/include/numkong/cast/sapphire.h +262 -0
- package/include/numkong/cast/serial.h +2262 -0
- package/include/numkong/cast/skylake.h +856 -0
- package/include/numkong/cast/v128relaxed.h +180 -0
- package/include/numkong/cast.h +230 -0
- package/include/numkong/curved/README.md +223 -0
- package/include/numkong/curved/genoa.h +182 -0
- package/include/numkong/curved/haswell.h +276 -0
- package/include/numkong/curved/neon.h +205 -0
- package/include/numkong/curved/neonbfdot.h +212 -0
- package/include/numkong/curved/neonhalf.h +212 -0
- package/include/numkong/curved/rvv.h +305 -0
- package/include/numkong/curved/serial.h +207 -0
- package/include/numkong/curved/skylake.h +457 -0
- package/include/numkong/curved/smef64.h +506 -0
- package/include/numkong/curved.h +517 -0
- package/include/numkong/curved.hpp +144 -0
- package/include/numkong/dot/README.md +425 -0
- package/include/numkong/dot/alder.h +563 -0
- package/include/numkong/dot/genoa.h +315 -0
- package/include/numkong/dot/haswell.h +1688 -0
- package/include/numkong/dot/icelake.h +883 -0
- package/include/numkong/dot/neon.h +818 -0
- package/include/numkong/dot/neonbfdot.h +244 -0
- package/include/numkong/dot/neonfhm.h +360 -0
- package/include/numkong/dot/neonhalf.h +198 -0
- package/include/numkong/dot/neonsdot.h +508 -0
- package/include/numkong/dot/rvv.h +714 -0
- package/include/numkong/dot/rvvbb.h +72 -0
- package/include/numkong/dot/rvvbf16.h +123 -0
- package/include/numkong/dot/rvvhalf.h +129 -0
- package/include/numkong/dot/sapphire.h +141 -0
- package/include/numkong/dot/serial.h +838 -0
- package/include/numkong/dot/sierra.h +405 -0
- package/include/numkong/dot/skylake.h +1084 -0
- package/include/numkong/dot/sve.h +379 -0
- package/include/numkong/dot/svebfdot.h +74 -0
- package/include/numkong/dot/svehalf.h +123 -0
- package/include/numkong/dot/v128relaxed.h +1258 -0
- package/include/numkong/dot.h +1070 -0
- package/include/numkong/dot.hpp +94 -0
- package/include/numkong/dots/README.md +496 -0
- package/include/numkong/dots/alder.h +114 -0
- package/include/numkong/dots/genoa.h +94 -0
- package/include/numkong/dots/haswell.h +295 -0
- package/include/numkong/dots/icelake.h +171 -0
- package/include/numkong/dots/neon.h +120 -0
- package/include/numkong/dots/neonbfdot.h +58 -0
- package/include/numkong/dots/neonfhm.h +94 -0
- package/include/numkong/dots/neonhalf.h +57 -0
- package/include/numkong/dots/neonsdot.h +108 -0
- package/include/numkong/dots/rvv.h +2486 -0
- package/include/numkong/dots/sapphireamx.h +3973 -0
- package/include/numkong/dots/serial.h +2844 -0
- package/include/numkong/dots/sierra.h +97 -0
- package/include/numkong/dots/skylake.h +196 -0
- package/include/numkong/dots/sme.h +5372 -0
- package/include/numkong/dots/smebi32.h +461 -0
- package/include/numkong/dots/smef64.h +1318 -0
- package/include/numkong/dots/smehalf.h +47 -0
- package/include/numkong/dots/v128relaxed.h +294 -0
- package/include/numkong/dots.h +2804 -0
- package/include/numkong/dots.hpp +639 -0
- package/include/numkong/each/README.md +469 -0
- package/include/numkong/each/haswell.h +1658 -0
- package/include/numkong/each/icelake.h +272 -0
- package/include/numkong/each/neon.h +1104 -0
- package/include/numkong/each/neonbfdot.h +212 -0
- package/include/numkong/each/neonhalf.h +410 -0
- package/include/numkong/each/rvv.h +1121 -0
- package/include/numkong/each/sapphire.h +477 -0
- package/include/numkong/each/serial.h +260 -0
- package/include/numkong/each/skylake.h +1562 -0
- package/include/numkong/each.h +2146 -0
- package/include/numkong/each.hpp +434 -0
- package/include/numkong/geospatial/README.md +147 -0
- package/include/numkong/geospatial/haswell.h +593 -0
- package/include/numkong/geospatial/neon.h +571 -0
- package/include/numkong/geospatial/rvv.h +701 -0
- package/include/numkong/geospatial/serial.h +309 -0
- package/include/numkong/geospatial/skylake.h +577 -0
- package/include/numkong/geospatial/v128relaxed.h +613 -0
- package/include/numkong/geospatial.h +453 -0
- package/include/numkong/geospatial.hpp +235 -0
- package/include/numkong/matrix.hpp +336 -0
- package/include/numkong/maxsim/README.md +187 -0
- package/include/numkong/maxsim/alder.h +511 -0
- package/include/numkong/maxsim/genoa.h +115 -0
- package/include/numkong/maxsim/haswell.h +553 -0
- package/include/numkong/maxsim/icelake.h +480 -0
- package/include/numkong/maxsim/neonsdot.h +394 -0
- package/include/numkong/maxsim/sapphireamx.h +877 -0
- package/include/numkong/maxsim/serial.h +490 -0
- package/include/numkong/maxsim/sme.h +929 -0
- package/include/numkong/maxsim/v128relaxed.h +280 -0
- package/include/numkong/maxsim.h +571 -0
- package/include/numkong/maxsim.hpp +133 -0
- package/include/numkong/mesh/README.md +227 -0
- package/include/numkong/mesh/haswell.h +2235 -0
- package/include/numkong/mesh/neon.h +1329 -0
- package/include/numkong/mesh/neonbfdot.h +842 -0
- package/include/numkong/mesh/neonhalf.h +616 -0
- package/include/numkong/mesh/rvv.h +916 -0
- package/include/numkong/mesh/serial.h +742 -0
- package/include/numkong/mesh/skylake.h +1135 -0
- package/include/numkong/mesh/v128relaxed.h +1052 -0
- package/include/numkong/mesh.h +652 -0
- package/include/numkong/mesh.hpp +762 -0
- package/include/numkong/numkong.h +78 -0
- package/include/numkong/numkong.hpp +57 -0
- package/include/numkong/probability/README.md +173 -0
- package/include/numkong/probability/haswell.h +267 -0
- package/include/numkong/probability/neon.h +225 -0
- package/include/numkong/probability/rvv.h +409 -0
- package/include/numkong/probability/serial.h +169 -0
- package/include/numkong/probability/skylake.h +324 -0
- package/include/numkong/probability.h +383 -0
- package/include/numkong/probability.hpp +120 -0
- package/include/numkong/random.h +50 -0
- package/include/numkong/random.hpp +285 -0
- package/include/numkong/reduce/README.md +547 -0
- package/include/numkong/reduce/alder.h +632 -0
- package/include/numkong/reduce/genoa.h +201 -0
- package/include/numkong/reduce/haswell.h +3783 -0
- package/include/numkong/reduce/icelake.h +549 -0
- package/include/numkong/reduce/neon.h +3841 -0
- package/include/numkong/reduce/neonbfdot.h +353 -0
- package/include/numkong/reduce/neonfhm.h +665 -0
- package/include/numkong/reduce/neonhalf.h +157 -0
- package/include/numkong/reduce/neonsdot.h +357 -0
- package/include/numkong/reduce/rvv.h +3407 -0
- package/include/numkong/reduce/serial.h +757 -0
- package/include/numkong/reduce/sierra.h +338 -0
- package/include/numkong/reduce/skylake.h +3792 -0
- package/include/numkong/reduce/v128relaxed.h +2302 -0
- package/include/numkong/reduce.h +1597 -0
- package/include/numkong/reduce.hpp +633 -0
- package/include/numkong/scalar/README.md +89 -0
- package/include/numkong/scalar/haswell.h +113 -0
- package/include/numkong/scalar/neon.h +122 -0
- package/include/numkong/scalar/neonhalf.h +70 -0
- package/include/numkong/scalar/rvv.h +211 -0
- package/include/numkong/scalar/sapphire.h +63 -0
- package/include/numkong/scalar/serial.h +332 -0
- package/include/numkong/scalar/v128relaxed.h +56 -0
- package/include/numkong/scalar.h +683 -0
- package/include/numkong/set/README.md +179 -0
- package/include/numkong/set/haswell.h +334 -0
- package/include/numkong/set/icelake.h +485 -0
- package/include/numkong/set/neon.h +364 -0
- package/include/numkong/set/rvv.h +226 -0
- package/include/numkong/set/rvvbb.h +117 -0
- package/include/numkong/set/serial.h +174 -0
- package/include/numkong/set/sve.h +185 -0
- package/include/numkong/set/v128relaxed.h +240 -0
- package/include/numkong/set.h +457 -0
- package/include/numkong/set.hpp +114 -0
- package/include/numkong/sets/README.md +149 -0
- package/include/numkong/sets/haswell.h +63 -0
- package/include/numkong/sets/icelake.h +66 -0
- package/include/numkong/sets/neon.h +61 -0
- package/include/numkong/sets/serial.h +43 -0
- package/include/numkong/sets/smebi32.h +1099 -0
- package/include/numkong/sets/v128relaxed.h +58 -0
- package/include/numkong/sets.h +339 -0
- package/include/numkong/sparse/README.md +156 -0
- package/include/numkong/sparse/icelake.h +463 -0
- package/include/numkong/sparse/neon.h +288 -0
- package/include/numkong/sparse/serial.h +117 -0
- package/include/numkong/sparse/sve2.h +507 -0
- package/include/numkong/sparse/turin.h +322 -0
- package/include/numkong/sparse.h +363 -0
- package/include/numkong/sparse.hpp +113 -0
- package/include/numkong/spatial/README.md +435 -0
- package/include/numkong/spatial/alder.h +607 -0
- package/include/numkong/spatial/genoa.h +290 -0
- package/include/numkong/spatial/haswell.h +960 -0
- package/include/numkong/spatial/icelake.h +586 -0
- package/include/numkong/spatial/neon.h +773 -0
- package/include/numkong/spatial/neonbfdot.h +165 -0
- package/include/numkong/spatial/neonhalf.h +118 -0
- package/include/numkong/spatial/neonsdot.h +261 -0
- package/include/numkong/spatial/rvv.h +984 -0
- package/include/numkong/spatial/rvvbf16.h +123 -0
- package/include/numkong/spatial/rvvhalf.h +117 -0
- package/include/numkong/spatial/sapphire.h +343 -0
- package/include/numkong/spatial/serial.h +346 -0
- package/include/numkong/spatial/sierra.h +323 -0
- package/include/numkong/spatial/skylake.h +606 -0
- package/include/numkong/spatial/sve.h +224 -0
- package/include/numkong/spatial/svebfdot.h +122 -0
- package/include/numkong/spatial/svehalf.h +109 -0
- package/include/numkong/spatial/v128relaxed.h +717 -0
- package/include/numkong/spatial.h +1425 -0
- package/include/numkong/spatial.hpp +183 -0
- package/include/numkong/spatials/README.md +580 -0
- package/include/numkong/spatials/alder.h +94 -0
- package/include/numkong/spatials/genoa.h +94 -0
- package/include/numkong/spatials/haswell.h +219 -0
- package/include/numkong/spatials/icelake.h +113 -0
- package/include/numkong/spatials/neon.h +109 -0
- package/include/numkong/spatials/neonbfdot.h +60 -0
- package/include/numkong/spatials/neonfhm.h +92 -0
- package/include/numkong/spatials/neonhalf.h +58 -0
- package/include/numkong/spatials/neonsdot.h +109 -0
- package/include/numkong/spatials/rvv.h +1960 -0
- package/include/numkong/spatials/sapphireamx.h +1149 -0
- package/include/numkong/spatials/serial.h +226 -0
- package/include/numkong/spatials/sierra.h +96 -0
- package/include/numkong/spatials/skylake.h +184 -0
- package/include/numkong/spatials/sme.h +1901 -0
- package/include/numkong/spatials/smef64.h +465 -0
- package/include/numkong/spatials/v128relaxed.h +240 -0
- package/include/numkong/spatials.h +3021 -0
- package/include/numkong/spatials.hpp +508 -0
- package/include/numkong/tensor.hpp +1592 -0
- package/include/numkong/trigonometry/README.md +184 -0
- package/include/numkong/trigonometry/haswell.h +652 -0
- package/include/numkong/trigonometry/neon.h +639 -0
- package/include/numkong/trigonometry/rvv.h +699 -0
- package/include/numkong/trigonometry/serial.h +703 -0
- package/include/numkong/trigonometry/skylake.h +721 -0
- package/include/numkong/trigonometry/v128relaxed.h +666 -0
- package/include/numkong/trigonometry.h +467 -0
- package/include/numkong/trigonometry.hpp +166 -0
- package/include/numkong/types.h +1384 -0
- package/include/numkong/types.hpp +5603 -0
- package/include/numkong/vector.hpp +698 -0
- package/javascript/README.md +246 -0
- package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
- package/javascript/dist/cjs/numkong-wasm.js +617 -0
- package/javascript/dist/cjs/numkong.d.ts +343 -0
- package/javascript/dist/cjs/numkong.js +523 -0
- package/javascript/dist/cjs/package.json +3 -0
- package/javascript/dist/cjs/types.d.ts +284 -0
- package/javascript/dist/cjs/types.js +653 -0
- package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
- package/javascript/dist/esm/numkong-wasm.js +595 -0
- package/javascript/dist/esm/numkong.d.ts +343 -0
- package/javascript/dist/esm/numkong.js +452 -0
- package/javascript/dist/esm/package.json +3 -0
- package/javascript/dist/esm/types.d.ts +284 -0
- package/javascript/dist/esm/types.js +630 -0
- package/javascript/dist-package-cjs.json +3 -0
- package/javascript/dist-package-esm.json +3 -0
- package/javascript/node-gyp-build.d.ts +1 -0
- package/javascript/numkong-wasm.ts +756 -0
- package/javascript/numkong.c +689 -0
- package/javascript/numkong.ts +575 -0
- package/javascript/tsconfig-base.json +39 -0
- package/javascript/tsconfig-cjs.json +8 -0
- package/javascript/tsconfig-esm.json +8 -0
- package/javascript/types.ts +674 -0
- package/package.json +87 -0
|
@@ -0,0 +1,573 @@
|
|
|
1
|
+
# NumKong for C and C++
|
|
2
|
+
|
|
3
|
+
NumKong's native SDK is the reference surface for the project.
|
|
4
|
+
The plain C ABI exposes every kernel family directly: dot products, dense distances, binary metrics, probability divergences, geospatial solvers, curved-space kernels, sparse intersections, mesh alignment, packed matrix multiplication, symmetric self-similarity, and late-interaction scoring.
|
|
5
|
+
The ABI is stable, versioned, and callable from any language that can load a shared library.
|
|
6
|
+
There is no runtime overhead: no hidden thread pool, no implicit allocation, no garbage collector interaction.
|
|
7
|
+
The C++ layer stays thin, typed, allocator-aware, and close enough to inline through, adding type-level result promotion and owning containers without hiding the dispatch model or the mixed-precision policy.
|
|
8
|
+
|
|
9
|
+
## Quickstart
|
|
10
|
+
|
|
11
|
+
```c
|
|
12
|
+
#include <numkong/numkong.h>
|
|
13
|
+
#include <stdio.h>
|
|
14
|
+
|
|
15
|
+
int main(void) {
|
|
16
|
+
nk_f32_t a[] = {1, 2, 3};
|
|
17
|
+
nk_f32_t b[] = {4, 5, 6};
|
|
18
|
+
nk_f64_t dot = 0;
|
|
19
|
+
nk_configure_thread(nk_capabilities());
|
|
20
|
+
nk_dot_f32(a, b, 3, &dot); // widened f32 → f64 output
|
|
21
|
+
printf("dot=%f\n", dot);
|
|
22
|
+
return 0;
|
|
23
|
+
}
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Highlights
|
|
27
|
+
|
|
28
|
+
This is the most complete SDK in the project.
|
|
29
|
+
It is the right layer if you want exact control over dtypes, allocators, packed buffers, dispatch, and host-side partitioning.
|
|
30
|
+
|
|
31
|
+
__Full kernel surface.__
|
|
32
|
+
All public operation families are reachable from native code.
|
|
33
|
+
__No hidden threading.__
|
|
34
|
+
NumKong does not own a thread pool.
|
|
35
|
+
__No hidden allocation.__
|
|
36
|
+
C APIs take caller-owned buffers, and C++ wrappers make ownership explicit.
|
|
37
|
+
__Mixed precision by default.__
|
|
38
|
+
Small storage types widen into safer accumulator and output types.
|
|
39
|
+
__Allocator-aware containers.__
|
|
40
|
+
`vector`, `tensor`, `packed_matrix`, and `packed_maxsim` accept custom allocators.
|
|
41
|
+
__Unaligned inputs are fine.__
|
|
42
|
+
Packing handles internal layout itself and does not require caller-side alignment.
|
|
43
|
+
|
|
44
|
+
## Ecosystem Comparison
|
|
45
|
+
|
|
46
|
+
| Feature | NumKong | [OpenBLAS](https://github.com/OpenMathLib/OpenBLAS) | [Eigen](https://gitlab.com/libeigen/eigen) |
|
|
47
|
+
| ---------------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------- | --------------------------------------------------------------------------------------- |
|
|
48
|
+
| Operation families | dots, distances, binary, probability, geospatial, curved, mesh, sparse, MaxSim, elementwise, reductions, cast, trig | dense linear algebra only | dense LA, some reductions and elementwise |
|
|
49
|
+
| Precision | Sub-byte to Float64 dtypes; automatic widening per scalar type; Kahan-compensated summation; 0 ULP Float32/Float64 where applicable | Float32, Float64 only; same-type in/out; no compensated summation | Float16/BFloat16 partial; no Float8 or sub-byte; manual casts; no compensated summation |
|
|
50
|
+
| Runtime SIMD dispatch | per-thread at runtime across x86, ARM, RISC-V | load-time CPU detection; one kernel set per process | compile-time ISA flags only |
|
|
51
|
+
| Packed matrix, GEMM-like | `packed_matrix` — pack once, reuse across query batches | internal opaque packing per GEMM call; no persistent packed form | no equivalent packed reuse abstraction |
|
|
52
|
+
| Symmetric kernels, SYRK-like | skips duplicate pairs, up to 2x speedup for self-distance | `SSYRK`/`DSYRK` for rank-k updates | `.selfadjointView` for rank-k updates |
|
|
53
|
+
| Memory model | Caller-owned buffers; C++ adds `tensor<T,A>` with per-container allocators | Caller-managed buffers; no container abstraction | Lazy expression templates avoid most temporaries; `aligned_allocator` provided |
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
## Installation
|
|
58
|
+
|
|
59
|
+
With CMake `FetchContent`:
|
|
60
|
+
|
|
61
|
+
```cmake
|
|
62
|
+
include(FetchContent)
|
|
63
|
+
|
|
64
|
+
FetchContent_Declare(
|
|
65
|
+
numkong
|
|
66
|
+
GIT_REPOSITORY https://github.com/ashvardanian/NumKong.git
|
|
67
|
+
GIT_SHALLOW TRUE
|
|
68
|
+
)
|
|
69
|
+
FetchContent_MakeAvailable(numkong)
|
|
70
|
+
|
|
71
|
+
target_link_libraries(my_target PRIVATE numkong)
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Vendored:
|
|
75
|
+
|
|
76
|
+
```cmake
|
|
77
|
+
add_subdirectory(external/NumKong)
|
|
78
|
+
target_link_libraries(my_target PRIVATE numkong)
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Header-only C++ usage also works for direct template wrappers.
|
|
82
|
+
Most applications should still build the library once and keep `NK_DYNAMIC_DISPATCH=1`.
|
|
83
|
+
|
|
84
|
+
## The C ABI
|
|
85
|
+
|
|
86
|
+
The C ABI keeps the operation family, input dtype, and output policy visible in the symbol name.
|
|
87
|
+
That makes widening obvious at the call site.
|
|
88
|
+
|
|
89
|
+
```c
|
|
90
|
+
#include <numkong/numkong.h>
|
|
91
|
+
|
|
92
|
+
nk_i8_t a[1536];
|
|
93
|
+
nk_i8_t b[1536];
|
|
94
|
+
nk_i32_t dot = 0; // widened from int8 storage
|
|
95
|
+
nk_f32_t l2 = 0; // widened from int8 storage
|
|
96
|
+
|
|
97
|
+
nk_dot_i8(a, b, 1536, &dot);
|
|
98
|
+
nk_euclidean_i8(a, b, 1536, &l2);
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
If you want runtime-selected kernels without naming a specific ISA, use the punned dispatch layer:
|
|
102
|
+
|
|
103
|
+
```c
|
|
104
|
+
nk_metric_dense_punned_t angular = 0;
|
|
105
|
+
nk_capability_t used = nk_cap_serial_k;
|
|
106
|
+
nk_find_kernel_punned(nk_kernel_angular_k, nk_f32_k,
|
|
107
|
+
nk_capabilities(), (nk_kernel_punned_t *)&angular, &used);
|
|
108
|
+
|
|
109
|
+
nk_f32_t a[768], b[768], result = 0;
|
|
110
|
+
angular(a, b, 768, &result);
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
That is the lowest-level dynamic path.
|
|
114
|
+
The typed C++ wrappers usually read better unless you are building your own dispatch layer.
|
|
115
|
+
|
|
116
|
+
## The C++ Layer
|
|
117
|
+
|
|
118
|
+
The C++ wrappers add three things.
|
|
119
|
+
They add type-level result promotion.
|
|
120
|
+
They add explicit owning and non-owning containers.
|
|
121
|
+
They add allocator-aware packed objects for repeated matrix workloads.
|
|
122
|
+
|
|
123
|
+
```cpp
|
|
124
|
+
#include <numkong/numkong.hpp>
|
|
125
|
+
|
|
126
|
+
namespace nk = ashvardanian::numkong;
|
|
127
|
+
|
|
128
|
+
int main() {
|
|
129
|
+
nk::f32_t a[3] = {1, 2, 3}, b[3] = {4, 5, 6};
|
|
130
|
+
nk::f64_t dot {};
|
|
131
|
+
nk::dot(a, b, 3, &dot); // default result type is nk::f32_t::dot_result_t == nk::f64_t
|
|
132
|
+
}
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
The API is intentionally not STL-shaped.
|
|
136
|
+
`vector_view`, `tensor_view`, and `matrix_view` prioritize signed strides, sub-byte storage, and kernel compatibility over resizable-container ergonomics.
|
|
137
|
+
|
|
138
|
+
## Scalar Types and Promotions
|
|
139
|
+
|
|
140
|
+
The scalar wrappers in `include/numkong/types.hpp` are storage-first types.
|
|
141
|
+
They encode raw layout, default output types, and the kernel function pointer signatures for each family.
|
|
142
|
+
|
|
143
|
+
| Type | Layout | Bytes | Range | Inf | NaN |
|
|
144
|
+
| ----------- | ---------------- | ----- | --------------- | --- | --- |
|
|
145
|
+
| `nk_f16_t` | 1+5+10 | 2 | ±65504 | yes | yes |
|
|
146
|
+
| `nk_bf16_t` | 1+8+7 | 2 | ±3.4×10³⁸ | yes | yes |
|
|
147
|
+
| `nk_e4m3_t` | 1+4+3 | 1 | ±448 | no | yes |
|
|
148
|
+
| `nk_e5m2_t` | 1+5+2 | 1 | ±57344 | yes | yes |
|
|
149
|
+
| `nk_e2m3_t` | 1+2+3 | 1 | ±7.5 | no | no |
|
|
150
|
+
| `nk_e3m2_t` | 1+3+2 | 1 | ±28 | no | no |
|
|
151
|
+
| `nk_u1x8_t` | 8 packed bits | 1 | 0 or 1 per bit | — | — |
|
|
152
|
+
| `nk_u4x2_t` | 2x4-bit unsigned | 1 | 0-15 per nibble | — | — |
|
|
153
|
+
| `nk_i4x2_t` | 2x4-bit signed | 1 | -8 to 7 per nibble | — | — |
|
|
154
|
+
|
|
155
|
+
The layout column shows sign, exponent, and mantissa bit counts for floating-point types.
|
|
156
|
+
For `nk_f16_t`, 1+5+10 means one sign bit, five exponent bits, and ten mantissa bits, totaling 16 bits stored in 2 bytes.
|
|
157
|
+
For `nk_bf16_t`, the wider exponent field (8 bits) gives the same dynamic range as IEEE 754 single precision but with reduced mantissa precision.
|
|
158
|
+
The Float8 types `nk_e4m3_t` and `nk_e5m2_t` follow the OFP8 specification.
|
|
159
|
+
The narrower `nk_e2m3_t` and `nk_e3m2_t` types are MX-compatible micro-floats.
|
|
160
|
+
Sub-byte types `nk_u1x8_t`, `nk_u4x2_t`, and `nk_i4x2_t` pack multiple logical values into a single byte.
|
|
161
|
+
|
|
162
|
+
Default promotions are encoded on the type.
|
|
163
|
+
For example, `f32_t::dot_result_t` is wider than `f32_t`.
|
|
164
|
+
`i8_t::dot_result_t` is `i32_t`.
|
|
165
|
+
`u1x8_t::dot_result_t` is `u32_t`.
|
|
166
|
+
|
|
167
|
+
The higher-level templates use `result_type_ = typename in_type_::dot_result_t` and similar defaults.
|
|
168
|
+
The fast typed overloads are constrained so that overriding the result type away from the native policy can disable the specialized path and fall back to the more generic one.
|
|
169
|
+
|
|
170
|
+
## Dot Products
|
|
171
|
+
|
|
172
|
+
Dot products are one of the broadest parts of the native SDK.
|
|
173
|
+
They include real, complex, packed-binary, mini-float, and mixed-precision forms.
|
|
174
|
+
|
|
175
|
+
```c
|
|
176
|
+
nk_f32c_t a[384];
|
|
177
|
+
nk_f32c_t b[384];
|
|
178
|
+
nk_f32_t out[2] = {0, 0};
|
|
179
|
+
|
|
180
|
+
nk_dot_f32c(a, b, 384, out); // complex inner product
|
|
181
|
+
nk_vdot_f32c(a, b, 384, out); // conjugated variant, like numpy.vdot
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
For quantized retrieval pipelines, the storage format often matters more than the nominal math family.
|
|
185
|
+
The native SDK lets you keep the compact representation and still get a widened output.
|
|
186
|
+
|
|
187
|
+
## Dense Distances
|
|
188
|
+
|
|
189
|
+
The dense spatial kernels cover the SciPy-style `sqeuclidean`, `euclidean`, and `angular` family.
|
|
190
|
+
The important difference is that storage type and output type are not forced to match.
|
|
191
|
+
|
|
192
|
+
```c
|
|
193
|
+
nk_f16_t a[768];
|
|
194
|
+
nk_f16_t b[768];
|
|
195
|
+
nk_f32_t sqeuclidean = 0, euclidean = 0, angular = 0;
|
|
196
|
+
|
|
197
|
+
// `_Float16` support varies across compilers, and
|
|
198
|
+
// auto-vectorization targets `f32` — not `f16`.
|
|
199
|
+
nk_sqeuclidean_f16(a, b, 768, &sqeuclidean);
|
|
200
|
+
nk_euclidean_f16(a, b, 768, &euclidean);
|
|
201
|
+
nk_angular_f16(a, b, 768, &angular);
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
For `i8`, `u8`, `i4`, `u4`, and `u1`, the widening is even more important.
|
|
205
|
+
The output type is chosen to avoid the obvious overflow trap of same-width accumulation.
|
|
206
|
+
|
|
207
|
+
## Set Similarity
|
|
208
|
+
|
|
209
|
+
Packed-binary metrics operate on packed words, not on byte-wise booleans.
|
|
210
|
+
That is why `u1x8_t` exists as a storage type instead of pretending that `bool[8]` is the right primitive.
|
|
211
|
+
|
|
212
|
+
```c
|
|
213
|
+
nk_u1x8_t a[128], b[128];
|
|
214
|
+
nk_u32_t hamming = 0;
|
|
215
|
+
nk_f32_t jaccard = 0;
|
|
216
|
+
nk_hamming_u1(a, b, 128 * 8, &hamming);
|
|
217
|
+
nk_jaccard_u1(a, b, 128 * 8, &jaccard);
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
Integer set Jaccard works on sorted arrays of integer identifiers.
|
|
221
|
+
|
|
222
|
+
```c
|
|
223
|
+
nk_u32_t set_a[] = {1, 3, 5, 7, 9}, set_b[] = {3, 5, 8, 9, 10};
|
|
224
|
+
nk_f32_t jaccard_sets = 0;
|
|
225
|
+
nk_jaccard_u32(set_a, set_b, 5, &jaccard_sets); // |A ∩ B| / |A ∪ B|
|
|
226
|
+
assert(jaccard_sets > 0.0f && jaccard_sets < 1.0f && "|A ∩ B| / |A ∪ B| should be in (0, 1)");
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
## Probability Metrics
|
|
230
|
+
|
|
231
|
+
Probability kernels target divergences directly instead of making you rebuild them from scalar loops.
|
|
232
|
+
|
|
233
|
+
```c
|
|
234
|
+
nk_f32_t p[] = {0.2f, 0.3f, 0.5f}, q[] = {0.1f, 0.3f, 0.6f};
|
|
235
|
+
nk_f64_t kl_forward = 0, kl_reverse = 0, js_forward = 0, js_reverse = 0;
|
|
236
|
+
|
|
237
|
+
nk_kld_f32(p, q, 3, &kl_forward);
|
|
238
|
+
nk_kld_f32(q, p, 3, &kl_reverse);
|
|
239
|
+
assert(kl_forward != kl_reverse && "KLD is asymmetric");
|
|
240
|
+
|
|
241
|
+
nk_jsd_f32(p, q, 3, &js_forward);
|
|
242
|
+
nk_jsd_f32(q, p, 3, &js_reverse);
|
|
243
|
+
assert(js_forward == js_reverse && "JSD is symmetric");
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
These paths are especially valuable once you move below `f64`.
|
|
247
|
+
Naive implementations are usually dominated by repeated scalar transcendental calls and weak accumulation policy.
|
|
248
|
+
|
|
249
|
+
## Geospatial Metrics
|
|
250
|
+
|
|
251
|
+
The native SDK exposes both the fast spherical approximation and the more accurate ellipsoidal solver.
|
|
252
|
+
Inputs are in radians.
|
|
253
|
+
Outputs are in meters.
|
|
254
|
+
|
|
255
|
+
```c
|
|
256
|
+
// Statue of Liberty (40.6892°N, 74.0445°W) → Big Ben (51.5007°N, 0.1246°W)
|
|
257
|
+
nk_f64_t liberty_lat[] = {0.7101605100}, liberty_lon[] = {-1.2923203180};
|
|
258
|
+
nk_f64_t big_ben_lat[] = {0.8988567821}, big_ben_lon[] = {-0.0021746802};
|
|
259
|
+
|
|
260
|
+
nk_f64_t distance[1];
|
|
261
|
+
nk_vincenty_f64(liberty_lat, liberty_lon, big_ben_lat, big_ben_lon, 1, distance); // ≈ 5,589,857 m (ellipsoidal, baseline)
|
|
262
|
+
nk_haversine_f64(liberty_lat, liberty_lon, big_ben_lat, big_ben_lon, 1, distance); // ≈ 5,543,723 m (spherical, ~46 km less)
|
|
263
|
+
|
|
264
|
+
// Vincenty in f32 — drifts ~2 m from f64
|
|
265
|
+
nk_f32_t liberty_lat32[] = {0.7101605100f}, liberty_lon32[] = {-1.2923203180f};
|
|
266
|
+
nk_f32_t big_ben_lat32[] = {0.8988567821f}, big_ben_lon32[] = {-0.0021746802f};
|
|
267
|
+
nk_f32_t distance_f32[1];
|
|
268
|
+
nk_vincenty_f32(liberty_lat32, liberty_lon32, big_ben_lat32, big_ben_lon32, 1, distance_f32); // ≈ 5,589,859 m (+2 m drift)
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
## Curved Metrics
|
|
272
|
+
|
|
273
|
+
Curved-space kernels are separate from the flat Euclidean family because their dataflow is different.
|
|
274
|
+
They combine vectors with an extra metric tensor or covariance inverse.
|
|
275
|
+
|
|
276
|
+
```c
|
|
277
|
+
// Complex bilinear form: aᴴ M b
|
|
278
|
+
nk_f32c_t a[32], b[32], metric[32 * 32];
|
|
279
|
+
nk_f64c_t result = {0, 0};
|
|
280
|
+
nk_bilinear_f32c(a, b, metric, 32, &result);
|
|
281
|
+
|
|
282
|
+
// Real Mahalanobis distance: √((a−b)ᵀ M⁻¹ (a−b))
|
|
283
|
+
nk_f32_t x[64], y[64], inv_cov[64 * 64];
|
|
284
|
+
nk_f64_t distance = 0;
|
|
285
|
+
nk_mahalanobis_f32(x, y, inv_cov, 64, &distance);
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
## Tensors, Views, and Memory Layout
|
|
289
|
+
|
|
290
|
+
The native containers are where most integration mistakes happen.
|
|
291
|
+
They need to be documented explicitly.
|
|
292
|
+
|
|
293
|
+
- `vector<T, A>` owns storage and defaults to `aligned_allocator<T, 64>`.
|
|
294
|
+
- `vector_view<T>` is a const strided non-owning view.
|
|
295
|
+
- `vector_span<T>` is a mutable strided non-owning view.
|
|
296
|
+
- `tensor<T, A, R>` owns rank-`R` storage and also defaults to aligned allocation.
|
|
297
|
+
- `tensor_view<T>` and `tensor_span<T>` are the view forms.
|
|
298
|
+
- `matrix`, `matrix_view`, and `matrix_span` are rank-2 aliases.
|
|
299
|
+
|
|
300
|
+
The important layout rules are:
|
|
301
|
+
|
|
302
|
+
- Signed strides are supported by the view types.
|
|
303
|
+
- Reversed and sliced views are valid for many elementwise and reduction kernels.
|
|
304
|
+
- `reshape` and `flatten` require contiguous layout.
|
|
305
|
+
- Matrix-style kernels care about _row contiguity_, not just total tensor contiguity.
|
|
306
|
+
- Negative strides are conceptually valid views, but matrix packing and packed matmul workflows are not written around them.
|
|
307
|
+
|
|
308
|
+
Memory ownership is explicit.
|
|
309
|
+
`vector` and `tensor` deallocate through their allocator.
|
|
310
|
+
`vector_view`, `tensor_view`, `matrix_view`, and spans never own memory.
|
|
311
|
+
And heterogeneous index types for `operator[]` enable more interesting access patterns:
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
```cpp
|
|
315
|
+
#include <numkong/numkong.hpp>
|
|
316
|
+
|
|
317
|
+
namespace nk = ashvardanian::numkong;
|
|
318
|
+
using nk::slice, nk::all, nk::f32_t, nk::tensor, nk::tensor_view;
|
|
319
|
+
|
|
320
|
+
auto t = tensor<f32_t>::try_from({
|
|
321
|
+
{1, 2, 3},
|
|
322
|
+
{4, 5, 6},
|
|
323
|
+
{7, 8, 9},
|
|
324
|
+
});
|
|
325
|
+
|
|
326
|
+
f32_t scalar_at_2d_coordinate = t[1, -1];
|
|
327
|
+
f32_t scalar_at_global_offset = t[5]; // row-major offset 5 == coordinate (1, -1)
|
|
328
|
+
assert(scalar_at_2d_coordinate == scalar_at_global_offset && "same value");
|
|
329
|
+
|
|
330
|
+
tensor_view<f32_t> scalar_as_tensor = t[1, 1, slice];
|
|
331
|
+
tensor_view<f32_t> second_row = t[1, slice];
|
|
332
|
+
tensor_view<f32_t> second_column = t[all, 1, slice];
|
|
333
|
+
assert(second_row[1] == second_column[1] && "same value");
|
|
334
|
+
```
|
|
335
|
+
|
|
336
|
+
You can also use a more traditional syntax with member functions, also leveraging built-in functionality for hardware-accelerated strided reductions and elementwise operations along any axis combination.
|
|
337
|
+
Similar to NumPy, but statically typed:
|
|
338
|
+
|
|
339
|
+
```cpp
|
|
340
|
+
auto second_column = t[all, 1, slice]; // strided column view → {2, 5, 8}
|
|
341
|
+
auto minimum_index = nk::argmin(second_column); // index of the minimum in the second column
|
|
342
|
+
```
|
|
343
|
+
|
|
344
|
+
The view types are conceptually close to `std::mdspan` from C++23.
|
|
345
|
+
The main differences are sub-byte element support, signed strides, and the kernel dispatch integration that `std::mdspan` does not provide.
|
|
346
|
+
If your codebase already uses `std::mdspan`, converting at the NumKong call boundary is straightforward:
|
|
347
|
+
|
|
348
|
+
```cpp
|
|
349
|
+
#include <numkong/numkong.hpp>
|
|
350
|
+
#include <mdspan>
|
|
351
|
+
|
|
352
|
+
namespace nk = ashvardanian::numkong;
|
|
353
|
+
|
|
354
|
+
// Existing std::mdspan from your codebase
|
|
355
|
+
float data[] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
|
|
356
|
+
auto md = std::mdspan<float, std::extents<std::size_t, 3, 3>>(data);
|
|
357
|
+
|
|
358
|
+
// Wrap into a NumKong matrix_view — the data pointer and extents map directly (row-major strides are implied)
|
|
359
|
+
auto view = nk::matrix_view<nk::f32_t>(
|
|
360
|
+
reinterpret_cast<nk::f32_t const *>(md.data_handle()),
|
|
361
|
+
md.extent(0), md.extent(1));
|
|
362
|
+
|
|
363
|
+
// Now use any NumKong kernel on it
|
|
364
|
+
nk::f64_t dot {};
|
|
365
|
+
nk::dot(view.row(0), view.row(1), md.extent(1), &dot);
|
|
366
|
+
```
|
|
367
|
+
|
|
368
|
+
## Packed Matrix Kernels for GEMM-Like Workloads
|
|
369
|
+
|
|
370
|
+
This is the most distinctive native subsystem outside the raw vector kernels.
|
|
371
|
+
It is the right tool when the right-hand side is reused many times.
|
|
372
|
+
|
|
373
|
+
```cpp
|
|
374
|
+
#include <numkong/numkong.hpp>
|
|
375
|
+
|
|
376
|
+
namespace nk = ashvardanian::numkong;
|
|
377
|
+
|
|
378
|
+
auto a = nk::tensor<nk::f32_t>::try_full({2, 4}, nk::f32_t {1});
|
|
379
|
+
auto b = nk::tensor<nk::f32_t>::try_full({3, 4}, nk::f32_t {2});
|
|
380
|
+
auto packed = nk::packed_matrix<nk::f32_t>::try_pack(b.as_matrix_view());
|
|
381
|
+
|
|
382
|
+
// Dot products, angular distances, and Euclidean distances all reuse the same packed B
|
|
383
|
+
auto dots = nk::try_dots_packed(a.as_matrix_view(), packed);
|
|
384
|
+
auto angulars = nk::try_angulars_packed(a.as_matrix_view(), packed);
|
|
385
|
+
auto euclideans = nk::try_euclideans_packed(a.as_matrix_view(), packed);
|
|
386
|
+
```
|
|
387
|
+
|
|
388
|
+
This is GEMM-like in the workload shape, not in the strict BLAS API.
|
|
389
|
+
The useful economics are:
|
|
390
|
+
|
|
391
|
+
- one-time packing of `B`
|
|
392
|
+
- one-time type preconversion where needed
|
|
393
|
+
- depth padding handled internally
|
|
394
|
+
- per-column norm reuse for `angulars_packed` and `euclideans_packed`
|
|
395
|
+
- repeated reuse of the same packed RHS across many `A` batches
|
|
396
|
+
|
|
397
|
+
Caller-side alignment is not required.
|
|
398
|
+
Owned `packed_matrix` storage uses its allocator.
|
|
399
|
+
The C ABI also exposes `nk_dots_packed_size_*` so you can `malloc` the exact external buffer yourself.
|
|
400
|
+
|
|
401
|
+
## Symmetric Kernels for SYRK-Like Workloads
|
|
402
|
+
|
|
403
|
+
The symmetric kernels solve a different problem.
|
|
404
|
+
They compute self-similarity or self-distance without paying for both triangles independently.
|
|
405
|
+
|
|
406
|
+
```cpp
|
|
407
|
+
auto vectors = nk::tensor<nk::f32_t>::try_full({100, 768}, nk::f32_t {1});
|
|
408
|
+
auto gram = nk::try_dots_symmetric(vectors.as_matrix_view());
|
|
409
|
+
auto angular_dists = nk::try_angulars_symmetric(vectors.as_matrix_view());
|
|
410
|
+
auto euclidean_dists = nk::try_euclideans_symmetric(vectors.as_matrix_view());
|
|
411
|
+
```
|
|
412
|
+
|
|
413
|
+
This is SYRK-like in the sense that the output is square and symmetric.
|
|
414
|
+
The important difference from packed GEMM-style work is the partitioning model.
|
|
415
|
+
You typically split by output row windows, not by distinct left batches against a shared packed right-hand side.
|
|
416
|
+
|
|
417
|
+
The arithmetic advantage is direct and honest.
|
|
418
|
+
The symmetric kernels avoid recomputing both `(i, j)` and `(j, i)` pairs.
|
|
419
|
+
That cuts the pair count almost in half before any micro-kernel details matter.
|
|
420
|
+
|
|
421
|
+
## Sparse Operations and Intersections
|
|
422
|
+
|
|
423
|
+
Sparse helpers cover sorted-index intersection and weighted sparse dot products.
|
|
424
|
+
|
|
425
|
+
```c
|
|
426
|
+
nk_u32_t a_idx[] = {1, 3, 5, 7}, b_idx[] = {3, 4, 5, 8};
|
|
427
|
+
nk_u32_t intersection[4];
|
|
428
|
+
nk_size_t count = 0;
|
|
429
|
+
nk_sparse_intersect_u32(a_idx, b_idx, 4, 4, intersection, &count);
|
|
430
|
+
assert(count == 2 && "indices 3 and 5");
|
|
431
|
+
|
|
432
|
+
nk_f32_t a_weights[] = {1.0f, 2.0f, 3.0f, 4.0f};
|
|
433
|
+
nk_f32_t b_weights[] = {5.0f, 6.0f, 7.0f, 8.0f};
|
|
434
|
+
nk_f64_t result = 0;
|
|
435
|
+
nk_sparse_dot_u32f32(a_idx, b_idx, a_weights, b_weights, 4, 4, &result);
|
|
436
|
+
assert(result > 0 && "weighted dot over shared indices");
|
|
437
|
+
```
|
|
438
|
+
|
|
439
|
+
This family deserves explicit mention because it is not just sparse dot.
|
|
440
|
+
Set intersection itself is often the workload.
|
|
441
|
+
|
|
442
|
+
## Geometric Mesh Alignment
|
|
443
|
+
|
|
444
|
+
Mesh alignment returns structured outputs, not just one scalar.
|
|
445
|
+
The native API covers `rmsd`, `kabsch`, and `umeyama`.
|
|
446
|
+
|
|
447
|
+
```c
|
|
448
|
+
// Three 3D points, target is source scaled by 2x
|
|
449
|
+
nk_f32_t source[] = {0, 0, 0, 1, 0, 0, 0, 1, 0};
|
|
450
|
+
nk_f32_t target[] = {0, 0, 0, 2, 0, 0, 0, 2, 0};
|
|
451
|
+
nk_f32_t a_centroid[3], b_centroid[3], rotation[9];
|
|
452
|
+
nk_f32_t scale = 0, rmsd = 0;
|
|
453
|
+
|
|
454
|
+
nk_umeyama_f32(source, target, 3, a_centroid, b_centroid, rotation, &scale, &rmsd);
|
|
455
|
+
assert(rmsd < 1e-6f && "umeyama should recover exact alignment");
|
|
456
|
+
assert(scale > 1.99f && scale < 2.01f && "umeyama should recover 2x scale");
|
|
457
|
+
```
|
|
458
|
+
|
|
459
|
+
This family is separate from curved metrics because the output is a transform, not just a distance.
|
|
460
|
+
|
|
461
|
+
## MaxSim and Late Interaction
|
|
462
|
+
|
|
463
|
+
MaxSim is the late-interaction primitive used by systems such as [ColBERT](https://arxiv.org/abs/2004.12832).
|
|
464
|
+
It is not generic matrix multiplication.
|
|
465
|
+
It packs query and document token vectors into a scoring-specific layout and computes a late-interaction score.
|
|
466
|
+
|
|
467
|
+
```cpp
|
|
468
|
+
auto queries = nk::tensor<nk::bf16_t>::try_full({32, 128}, nk::bf16_t::one());
|
|
469
|
+
auto docs = nk::tensor<nk::bf16_t>::try_full({192, 128}, nk::bf16_t::one());
|
|
470
|
+
|
|
471
|
+
auto q = nk::packed_maxsim<nk::bf16_t>::try_pack(queries.as_matrix_view());
|
|
472
|
+
auto d = nk::packed_maxsim<nk::bf16_t>::try_pack(docs.as_matrix_view());
|
|
473
|
+
auto score = nk::maxsim(q, d);
|
|
474
|
+
```
|
|
475
|
+
|
|
476
|
+
`packed_maxsim` is allocator-aware in the same way as `packed_matrix`.
|
|
477
|
+
Its footprint is exposed through `size_bytes()`.
|
|
478
|
+
|
|
479
|
+
## Runtime Dispatch and Capabilities
|
|
480
|
+
|
|
481
|
+
Dynamic dispatch is the default recommendation for shipping one binary across many CPU generations.
|
|
482
|
+
`nk_configure_thread` configures rounding behavior and enables CPU-specific acceleration features such as Intel AMX.
|
|
483
|
+
It must be called once per thread before any kernel invocation and returns 1 on success, 0 on failure.
|
|
484
|
+
|
|
485
|
+
```c
|
|
486
|
+
nk_capability_t caps = nk_capabilities();
|
|
487
|
+
nk_configure_thread(caps);
|
|
488
|
+
if (caps & nk_cap_sapphireamx_k) { /* AMX available */ }
|
|
489
|
+
```
|
|
490
|
+
|
|
491
|
+
For exact register-level details, see `capabilities.h`.
|
|
492
|
+
The C++ wrappers can also call directly into named backends if you want to pin a path for testing or benchmarking.
|
|
493
|
+
|
|
494
|
+
## Parallelism and Fork Union
|
|
495
|
+
|
|
496
|
+
NumKong does not manage its own threads.
|
|
497
|
+
That is deliberate.
|
|
498
|
+
The library is designed to sit inside a larger scheduler.
|
|
499
|
+
|
|
500
|
+
GEMM-like packed work is usually partitioned across row ranges of `A` against one shared packed `B`:
|
|
501
|
+
|
|
502
|
+
```cpp
|
|
503
|
+
using nk::range, nk::all, nk::slice;
|
|
504
|
+
fork_union.parallel_for(0, worker_count, [&](std::size_t t) {
|
|
505
|
+
auto start = t * rows_per_worker;
|
|
506
|
+
auto stop = std::min(start + rows_per_worker, total_rows);
|
|
507
|
+
auto a_slice = a[range(start, stop), all, slice].as_matrix_view();
|
|
508
|
+
auto c_slice = c[range(start, stop), all, slice].as_matrix_span();
|
|
509
|
+
nk::dots_packed<value_type_>(a_slice, packed, c_slice);
|
|
510
|
+
});
|
|
511
|
+
```
|
|
512
|
+
|
|
513
|
+
SYRK-like symmetric work is partitioned by output row windows on one matrix:
|
|
514
|
+
|
|
515
|
+
```cpp
|
|
516
|
+
fork_union.parallel_for(0, worker_count, [&](std::size_t t) {
|
|
517
|
+
auto start = t * rows_per_worker;
|
|
518
|
+
auto count = std::min(rows_per_worker, total_rows - start);
|
|
519
|
+
nk::dots_symmetric<value_type_>(vectors.as_matrix_view(), gram.as_matrix_span(), start, count);
|
|
520
|
+
nk::angulars_symmetric<value_type_>(vectors.as_matrix_view(), angular_dists.as_matrix_span(), start, count);
|
|
521
|
+
});
|
|
522
|
+
```
|
|
523
|
+
|
|
524
|
+
We recommend [Fork Union](https://github.com/ashvardanian/ForkUnion) for that host-side orchestration.
|
|
525
|
+
OpenMP is still a reasonable fit if the rest of your application already uses it.
|
|
526
|
+
Manual thread pools and task systems also work well because the kernels have explicit row-range interfaces.
|
|
527
|
+
|
|
528
|
+
The C++26 `std::execution` framework (senders and receivers) is a natural fit here.
|
|
529
|
+
NumKong kernels take explicit row-range parameters and do not own threads, so they compose directly with `std::execution::bulk` or any sender/receiver scheduler.
|
|
530
|
+
When executors ship in your toolchain, replacing the `parallel_for` lambda above with a `bulk` sender is a one-line change.
|
|
531
|
+
|
|
532
|
+
## Integration Notes
|
|
533
|
+
|
|
534
|
+
- The C ABI is the easiest place to integrate with foreign runtimes and custom allocators.
|
|
535
|
+
- The C++ layer is the easiest place to express typed packed workflows, tensor slicing, and allocator-aware ownership.
|
|
536
|
+
- `aligned_allocator` defaults to 64-byte alignment for owned containers, but unaligned caller inputs are still valid for the kernels that accept raw pointers or views.
|
|
537
|
+
- If you override result types away from the scalar defaults, document that choice carefully because it can change both performance and numerical policy.
|
|
538
|
+
|
|
539
|
+
## CMake Configuration
|
|
540
|
+
|
|
541
|
+
The main user-facing CMake options are:
|
|
542
|
+
|
|
543
|
+
- `NK_BUILD_SHARED` builds a shared library, ON by default for standalone builds and OFF when included as a subdirectory.
|
|
544
|
+
- `NK_BUILD_TEST` and `NK_BUILD_BENCH` enable precision tests and benchmarks respectively, both OFF by default.
|
|
545
|
+
- `NK_DYNAMIC_DISPATCH=1` compiles all backends into one binary and selects at runtime via `nk_capabilities()`, recommended for shipping one binary across CPU generations.
|
|
546
|
+
- `NK_COMPARE_TO_BLAS` and `NK_COMPARE_TO_MKL` link benchmarks against a system BLAS or Intel MKL, each accepting `AUTO`, `ON`, or `OFF` with `AUTO` as the default.
|
|
547
|
+
|
|
548
|
+
The build enforces C99 for the C layer and C++23 for the C++ layer.
|
|
549
|
+
|
|
550
|
+
```sh
|
|
551
|
+
cmake -B build -D CMAKE_BUILD_TYPE=Release -D NK_BUILD_TEST=ON
|
|
552
|
+
cmake -B build -D NK_DYNAMIC_DISPATCH=1 -D NK_BUILD_BENCH=ON -D NK_COMPARE_TO_MKL=ON
|
|
553
|
+
```
|
|
554
|
+
|
|
555
|
+
## Cross-Compilation
|
|
556
|
+
|
|
557
|
+
Toolchain files for cross-compilation live in `cmake/`:
|
|
558
|
+
|
|
559
|
+
- `cmake/toolchain-aarch64-gnu.cmake` for ARM64 Linux with the GNU toolchain.
|
|
560
|
+
- `cmake/toolchain-riscv64-gnu.cmake` for RISC-V 64 Linux with the GNU toolchain.
|
|
561
|
+
- `cmake/toolchain-android-arm64.cmake` for Android ARM64 via the NDK.
|
|
562
|
+
- `cmake/toolchain-x86_64-llvm.cmake` and `cmake/toolchain-riscv64-llvm.cmake` for Clang/LLD builds.
|
|
563
|
+
- `cmake/toolchain-wasm.cmake`, `toolchain-wasm64.cmake`, and `toolchain-wasi.cmake` for WebAssembly targets.
|
|
564
|
+
|
|
565
|
+
```sh
|
|
566
|
+
cmake -B build -D CMAKE_TOOLCHAIN_FILE=cmake/toolchain-aarch64-gnu.cmake
|
|
567
|
+
```
|
|
568
|
+
|
|
569
|
+
## Threading Model
|
|
570
|
+
|
|
571
|
+
NumKong does not use OpenMP and does not create a hidden thread pool.
|
|
572
|
+
Standard pthreads are linked via CMake's `Threads` package.
|
|
573
|
+
Parallelism is host-controlled: partition work across row ranges and dispatch through Fork Union, `std::thread`, or any external scheduler.
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
// Hierarchical Clang C Module Map for NumKong
|
|
2
|
+
//
|
|
3
|
+
// Layout for fast partial recompilation:
|
|
4
|
+
// CNumKongCore → CNumKongKernels (per-header submodules) → CNumKongCapabilities → CNumKong
|
|
5
|
+
//
|
|
6
|
+
// `config_macros` documents which macros affect the ABI of each module.
|
|
7
|
+
// SPM's `cSettings: [.define(...)]` only applies when compiling .c sources,
|
|
8
|
+
// NOT when Swift imports these headers — but listing the macros here prevents
|
|
9
|
+
// silent miscompilation if flags ever do differ between TUs.
|
|
10
|
+
|
|
11
|
+
// Layer 1: Core types (changes rarely, no dependencies)
|
|
12
|
+
module CNumKongCore {
|
|
13
|
+
config_macros [exhaustive] NK_DYNAMIC_DISPATCH, NK_NATIVE_F16, NK_NATIVE_BF16
|
|
14
|
+
|
|
15
|
+
header "numkong/types.h"
|
|
16
|
+
header "numkong/scalar.h"
|
|
17
|
+
export *
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
// Layer 2: Kernel families (independent, change frequently during development)
|
|
21
|
+
module CNumKongKernels {
|
|
22
|
+
config_macros [exhaustive] NK_DYNAMIC_DISPATCH, NK_NATIVE_F16, NK_NATIVE_BF16
|
|
23
|
+
|
|
24
|
+
use CNumKongCore
|
|
25
|
+
|
|
26
|
+
explicit module Cast {
|
|
27
|
+
header "numkong/cast.h"
|
|
28
|
+
export *
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
explicit module Dot {
|
|
32
|
+
header "numkong/dot.h"
|
|
33
|
+
export *
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
explicit module Dots {
|
|
37
|
+
header "numkong/dots.h"
|
|
38
|
+
export *
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
explicit module Spatial {
|
|
42
|
+
header "numkong/spatial.h"
|
|
43
|
+
export *
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
explicit module Spatials {
|
|
47
|
+
header "numkong/spatials.h"
|
|
48
|
+
export *
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
explicit module Set {
|
|
52
|
+
header "numkong/set.h"
|
|
53
|
+
export *
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
explicit module Sets {
|
|
57
|
+
header "numkong/sets.h"
|
|
58
|
+
export *
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
explicit module Geospatial {
|
|
62
|
+
header "numkong/geospatial.h"
|
|
63
|
+
export *
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
explicit module Each {
|
|
67
|
+
header "numkong/each.h"
|
|
68
|
+
export *
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
explicit module Trigonometry {
|
|
72
|
+
header "numkong/trigonometry.h"
|
|
73
|
+
export *
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
explicit module Reduce {
|
|
77
|
+
header "numkong/reduce.h"
|
|
78
|
+
export *
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
explicit module MaxSim {
|
|
82
|
+
header "numkong/maxsim.h"
|
|
83
|
+
export *
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
explicit module Mesh {
|
|
87
|
+
header "numkong/mesh.h"
|
|
88
|
+
export *
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
explicit module Curved {
|
|
92
|
+
header "numkong/curved.h"
|
|
93
|
+
export *
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
explicit module Probability {
|
|
97
|
+
header "numkong/probability.h"
|
|
98
|
+
export *
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
explicit module Sparse {
|
|
102
|
+
header "numkong/sparse.h"
|
|
103
|
+
export *
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
export *
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
// Layer 3: Capability detection (depends on Core, used by umbrella)
|
|
110
|
+
module CNumKongCapabilities {
|
|
111
|
+
config_macros [exhaustive] NK_DYNAMIC_DISPATCH, NK_NATIVE_F16, NK_NATIVE_BF16
|
|
112
|
+
|
|
113
|
+
use CNumKongCore
|
|
114
|
+
|
|
115
|
+
header "numkong/capabilities.h"
|
|
116
|
+
export *
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
// Layer 4: Unified API (top-level umbrella module)
|
|
120
|
+
module CNumKong {
|
|
121
|
+
config_macros [exhaustive] NK_DYNAMIC_DISPATCH, NK_NATIVE_F16, NK_NATIVE_BF16
|
|
122
|
+
|
|
123
|
+
use CNumKongCore
|
|
124
|
+
use CNumKongKernels
|
|
125
|
+
use CNumKongCapabilities
|
|
126
|
+
|
|
127
|
+
header "numkong/numkong.h"
|
|
128
|
+
export *
|
|
129
|
+
}
|