numkong 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +495 -0
- package/binding.gyp +540 -0
- package/c/dispatch.h +512 -0
- package/c/dispatch_bf16.c +389 -0
- package/c/dispatch_bf16c.c +52 -0
- package/c/dispatch_e2m3.c +263 -0
- package/c/dispatch_e3m2.c +243 -0
- package/c/dispatch_e4m3.c +276 -0
- package/c/dispatch_e5m2.c +272 -0
- package/c/dispatch_f16.c +376 -0
- package/c/dispatch_f16c.c +58 -0
- package/c/dispatch_f32.c +378 -0
- package/c/dispatch_f32c.c +99 -0
- package/c/dispatch_f64.c +296 -0
- package/c/dispatch_f64c.c +98 -0
- package/c/dispatch_i16.c +96 -0
- package/c/dispatch_i32.c +89 -0
- package/c/dispatch_i4.c +150 -0
- package/c/dispatch_i64.c +86 -0
- package/c/dispatch_i8.c +289 -0
- package/c/dispatch_other.c +330 -0
- package/c/dispatch_u1.c +148 -0
- package/c/dispatch_u16.c +124 -0
- package/c/dispatch_u32.c +118 -0
- package/c/dispatch_u4.c +150 -0
- package/c/dispatch_u64.c +102 -0
- package/c/dispatch_u8.c +303 -0
- package/c/numkong.c +950 -0
- package/include/README.md +573 -0
- package/include/module.modulemap +129 -0
- package/include/numkong/attention/sapphireamx.h +1361 -0
- package/include/numkong/attention/sme.h +2066 -0
- package/include/numkong/attention.h +49 -0
- package/include/numkong/capabilities.h +748 -0
- package/include/numkong/cast/README.md +262 -0
- package/include/numkong/cast/haswell.h +975 -0
- package/include/numkong/cast/icelake.h +470 -0
- package/include/numkong/cast/neon.h +1192 -0
- package/include/numkong/cast/rvv.h +1021 -0
- package/include/numkong/cast/sapphire.h +262 -0
- package/include/numkong/cast/serial.h +2262 -0
- package/include/numkong/cast/skylake.h +856 -0
- package/include/numkong/cast/v128relaxed.h +180 -0
- package/include/numkong/cast.h +230 -0
- package/include/numkong/curved/README.md +223 -0
- package/include/numkong/curved/genoa.h +182 -0
- package/include/numkong/curved/haswell.h +276 -0
- package/include/numkong/curved/neon.h +205 -0
- package/include/numkong/curved/neonbfdot.h +212 -0
- package/include/numkong/curved/neonhalf.h +212 -0
- package/include/numkong/curved/rvv.h +305 -0
- package/include/numkong/curved/serial.h +207 -0
- package/include/numkong/curved/skylake.h +457 -0
- package/include/numkong/curved/smef64.h +506 -0
- package/include/numkong/curved.h +517 -0
- package/include/numkong/curved.hpp +144 -0
- package/include/numkong/dot/README.md +425 -0
- package/include/numkong/dot/alder.h +563 -0
- package/include/numkong/dot/genoa.h +315 -0
- package/include/numkong/dot/haswell.h +1688 -0
- package/include/numkong/dot/icelake.h +883 -0
- package/include/numkong/dot/neon.h +818 -0
- package/include/numkong/dot/neonbfdot.h +244 -0
- package/include/numkong/dot/neonfhm.h +360 -0
- package/include/numkong/dot/neonhalf.h +198 -0
- package/include/numkong/dot/neonsdot.h +508 -0
- package/include/numkong/dot/rvv.h +714 -0
- package/include/numkong/dot/rvvbb.h +72 -0
- package/include/numkong/dot/rvvbf16.h +123 -0
- package/include/numkong/dot/rvvhalf.h +129 -0
- package/include/numkong/dot/sapphire.h +141 -0
- package/include/numkong/dot/serial.h +838 -0
- package/include/numkong/dot/sierra.h +405 -0
- package/include/numkong/dot/skylake.h +1084 -0
- package/include/numkong/dot/sve.h +379 -0
- package/include/numkong/dot/svebfdot.h +74 -0
- package/include/numkong/dot/svehalf.h +123 -0
- package/include/numkong/dot/v128relaxed.h +1258 -0
- package/include/numkong/dot.h +1070 -0
- package/include/numkong/dot.hpp +94 -0
- package/include/numkong/dots/README.md +496 -0
- package/include/numkong/dots/alder.h +114 -0
- package/include/numkong/dots/genoa.h +94 -0
- package/include/numkong/dots/haswell.h +295 -0
- package/include/numkong/dots/icelake.h +171 -0
- package/include/numkong/dots/neon.h +120 -0
- package/include/numkong/dots/neonbfdot.h +58 -0
- package/include/numkong/dots/neonfhm.h +94 -0
- package/include/numkong/dots/neonhalf.h +57 -0
- package/include/numkong/dots/neonsdot.h +108 -0
- package/include/numkong/dots/rvv.h +2486 -0
- package/include/numkong/dots/sapphireamx.h +3973 -0
- package/include/numkong/dots/serial.h +2844 -0
- package/include/numkong/dots/sierra.h +97 -0
- package/include/numkong/dots/skylake.h +196 -0
- package/include/numkong/dots/sme.h +5372 -0
- package/include/numkong/dots/smebi32.h +461 -0
- package/include/numkong/dots/smef64.h +1318 -0
- package/include/numkong/dots/smehalf.h +47 -0
- package/include/numkong/dots/v128relaxed.h +294 -0
- package/include/numkong/dots.h +2804 -0
- package/include/numkong/dots.hpp +639 -0
- package/include/numkong/each/README.md +469 -0
- package/include/numkong/each/haswell.h +1658 -0
- package/include/numkong/each/icelake.h +272 -0
- package/include/numkong/each/neon.h +1104 -0
- package/include/numkong/each/neonbfdot.h +212 -0
- package/include/numkong/each/neonhalf.h +410 -0
- package/include/numkong/each/rvv.h +1121 -0
- package/include/numkong/each/sapphire.h +477 -0
- package/include/numkong/each/serial.h +260 -0
- package/include/numkong/each/skylake.h +1562 -0
- package/include/numkong/each.h +2146 -0
- package/include/numkong/each.hpp +434 -0
- package/include/numkong/geospatial/README.md +147 -0
- package/include/numkong/geospatial/haswell.h +593 -0
- package/include/numkong/geospatial/neon.h +571 -0
- package/include/numkong/geospatial/rvv.h +701 -0
- package/include/numkong/geospatial/serial.h +309 -0
- package/include/numkong/geospatial/skylake.h +577 -0
- package/include/numkong/geospatial/v128relaxed.h +613 -0
- package/include/numkong/geospatial.h +453 -0
- package/include/numkong/geospatial.hpp +235 -0
- package/include/numkong/matrix.hpp +336 -0
- package/include/numkong/maxsim/README.md +187 -0
- package/include/numkong/maxsim/alder.h +511 -0
- package/include/numkong/maxsim/genoa.h +115 -0
- package/include/numkong/maxsim/haswell.h +553 -0
- package/include/numkong/maxsim/icelake.h +480 -0
- package/include/numkong/maxsim/neonsdot.h +394 -0
- package/include/numkong/maxsim/sapphireamx.h +877 -0
- package/include/numkong/maxsim/serial.h +490 -0
- package/include/numkong/maxsim/sme.h +929 -0
- package/include/numkong/maxsim/v128relaxed.h +280 -0
- package/include/numkong/maxsim.h +571 -0
- package/include/numkong/maxsim.hpp +133 -0
- package/include/numkong/mesh/README.md +227 -0
- package/include/numkong/mesh/haswell.h +2235 -0
- package/include/numkong/mesh/neon.h +1329 -0
- package/include/numkong/mesh/neonbfdot.h +842 -0
- package/include/numkong/mesh/neonhalf.h +616 -0
- package/include/numkong/mesh/rvv.h +916 -0
- package/include/numkong/mesh/serial.h +742 -0
- package/include/numkong/mesh/skylake.h +1135 -0
- package/include/numkong/mesh/v128relaxed.h +1052 -0
- package/include/numkong/mesh.h +652 -0
- package/include/numkong/mesh.hpp +762 -0
- package/include/numkong/numkong.h +78 -0
- package/include/numkong/numkong.hpp +57 -0
- package/include/numkong/probability/README.md +173 -0
- package/include/numkong/probability/haswell.h +267 -0
- package/include/numkong/probability/neon.h +225 -0
- package/include/numkong/probability/rvv.h +409 -0
- package/include/numkong/probability/serial.h +169 -0
- package/include/numkong/probability/skylake.h +324 -0
- package/include/numkong/probability.h +383 -0
- package/include/numkong/probability.hpp +120 -0
- package/include/numkong/random.h +50 -0
- package/include/numkong/random.hpp +285 -0
- package/include/numkong/reduce/README.md +547 -0
- package/include/numkong/reduce/alder.h +632 -0
- package/include/numkong/reduce/genoa.h +201 -0
- package/include/numkong/reduce/haswell.h +3783 -0
- package/include/numkong/reduce/icelake.h +549 -0
- package/include/numkong/reduce/neon.h +3841 -0
- package/include/numkong/reduce/neonbfdot.h +353 -0
- package/include/numkong/reduce/neonfhm.h +665 -0
- package/include/numkong/reduce/neonhalf.h +157 -0
- package/include/numkong/reduce/neonsdot.h +357 -0
- package/include/numkong/reduce/rvv.h +3407 -0
- package/include/numkong/reduce/serial.h +757 -0
- package/include/numkong/reduce/sierra.h +338 -0
- package/include/numkong/reduce/skylake.h +3792 -0
- package/include/numkong/reduce/v128relaxed.h +2302 -0
- package/include/numkong/reduce.h +1597 -0
- package/include/numkong/reduce.hpp +633 -0
- package/include/numkong/scalar/README.md +89 -0
- package/include/numkong/scalar/haswell.h +113 -0
- package/include/numkong/scalar/neon.h +122 -0
- package/include/numkong/scalar/neonhalf.h +70 -0
- package/include/numkong/scalar/rvv.h +211 -0
- package/include/numkong/scalar/sapphire.h +63 -0
- package/include/numkong/scalar/serial.h +332 -0
- package/include/numkong/scalar/v128relaxed.h +56 -0
- package/include/numkong/scalar.h +683 -0
- package/include/numkong/set/README.md +179 -0
- package/include/numkong/set/haswell.h +334 -0
- package/include/numkong/set/icelake.h +485 -0
- package/include/numkong/set/neon.h +364 -0
- package/include/numkong/set/rvv.h +226 -0
- package/include/numkong/set/rvvbb.h +117 -0
- package/include/numkong/set/serial.h +174 -0
- package/include/numkong/set/sve.h +185 -0
- package/include/numkong/set/v128relaxed.h +240 -0
- package/include/numkong/set.h +457 -0
- package/include/numkong/set.hpp +114 -0
- package/include/numkong/sets/README.md +149 -0
- package/include/numkong/sets/haswell.h +63 -0
- package/include/numkong/sets/icelake.h +66 -0
- package/include/numkong/sets/neon.h +61 -0
- package/include/numkong/sets/serial.h +43 -0
- package/include/numkong/sets/smebi32.h +1099 -0
- package/include/numkong/sets/v128relaxed.h +58 -0
- package/include/numkong/sets.h +339 -0
- package/include/numkong/sparse/README.md +156 -0
- package/include/numkong/sparse/icelake.h +463 -0
- package/include/numkong/sparse/neon.h +288 -0
- package/include/numkong/sparse/serial.h +117 -0
- package/include/numkong/sparse/sve2.h +507 -0
- package/include/numkong/sparse/turin.h +322 -0
- package/include/numkong/sparse.h +363 -0
- package/include/numkong/sparse.hpp +113 -0
- package/include/numkong/spatial/README.md +435 -0
- package/include/numkong/spatial/alder.h +607 -0
- package/include/numkong/spatial/genoa.h +290 -0
- package/include/numkong/spatial/haswell.h +960 -0
- package/include/numkong/spatial/icelake.h +586 -0
- package/include/numkong/spatial/neon.h +773 -0
- package/include/numkong/spatial/neonbfdot.h +165 -0
- package/include/numkong/spatial/neonhalf.h +118 -0
- package/include/numkong/spatial/neonsdot.h +261 -0
- package/include/numkong/spatial/rvv.h +984 -0
- package/include/numkong/spatial/rvvbf16.h +123 -0
- package/include/numkong/spatial/rvvhalf.h +117 -0
- package/include/numkong/spatial/sapphire.h +343 -0
- package/include/numkong/spatial/serial.h +346 -0
- package/include/numkong/spatial/sierra.h +323 -0
- package/include/numkong/spatial/skylake.h +606 -0
- package/include/numkong/spatial/sve.h +224 -0
- package/include/numkong/spatial/svebfdot.h +122 -0
- package/include/numkong/spatial/svehalf.h +109 -0
- package/include/numkong/spatial/v128relaxed.h +717 -0
- package/include/numkong/spatial.h +1425 -0
- package/include/numkong/spatial.hpp +183 -0
- package/include/numkong/spatials/README.md +580 -0
- package/include/numkong/spatials/alder.h +94 -0
- package/include/numkong/spatials/genoa.h +94 -0
- package/include/numkong/spatials/haswell.h +219 -0
- package/include/numkong/spatials/icelake.h +113 -0
- package/include/numkong/spatials/neon.h +109 -0
- package/include/numkong/spatials/neonbfdot.h +60 -0
- package/include/numkong/spatials/neonfhm.h +92 -0
- package/include/numkong/spatials/neonhalf.h +58 -0
- package/include/numkong/spatials/neonsdot.h +109 -0
- package/include/numkong/spatials/rvv.h +1960 -0
- package/include/numkong/spatials/sapphireamx.h +1149 -0
- package/include/numkong/spatials/serial.h +226 -0
- package/include/numkong/spatials/sierra.h +96 -0
- package/include/numkong/spatials/skylake.h +184 -0
- package/include/numkong/spatials/sme.h +1901 -0
- package/include/numkong/spatials/smef64.h +465 -0
- package/include/numkong/spatials/v128relaxed.h +240 -0
- package/include/numkong/spatials.h +3021 -0
- package/include/numkong/spatials.hpp +508 -0
- package/include/numkong/tensor.hpp +1592 -0
- package/include/numkong/trigonometry/README.md +184 -0
- package/include/numkong/trigonometry/haswell.h +652 -0
- package/include/numkong/trigonometry/neon.h +639 -0
- package/include/numkong/trigonometry/rvv.h +699 -0
- package/include/numkong/trigonometry/serial.h +703 -0
- package/include/numkong/trigonometry/skylake.h +721 -0
- package/include/numkong/trigonometry/v128relaxed.h +666 -0
- package/include/numkong/trigonometry.h +467 -0
- package/include/numkong/trigonometry.hpp +166 -0
- package/include/numkong/types.h +1384 -0
- package/include/numkong/types.hpp +5603 -0
- package/include/numkong/vector.hpp +698 -0
- package/javascript/README.md +246 -0
- package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
- package/javascript/dist/cjs/numkong-wasm.js +617 -0
- package/javascript/dist/cjs/numkong.d.ts +343 -0
- package/javascript/dist/cjs/numkong.js +523 -0
- package/javascript/dist/cjs/package.json +3 -0
- package/javascript/dist/cjs/types.d.ts +284 -0
- package/javascript/dist/cjs/types.js +653 -0
- package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
- package/javascript/dist/esm/numkong-wasm.js +595 -0
- package/javascript/dist/esm/numkong.d.ts +343 -0
- package/javascript/dist/esm/numkong.js +452 -0
- package/javascript/dist/esm/package.json +3 -0
- package/javascript/dist/esm/types.d.ts +284 -0
- package/javascript/dist/esm/types.js +630 -0
- package/javascript/dist-package-cjs.json +3 -0
- package/javascript/dist-package-esm.json +3 -0
- package/javascript/node-gyp-build.d.ts +1 -0
- package/javascript/numkong-wasm.ts +756 -0
- package/javascript/numkong.c +689 -0
- package/javascript/numkong.ts +575 -0
- package/javascript/tsconfig-base.json +39 -0
- package/javascript/tsconfig-cjs.json +8 -0
- package/javascript/tsconfig-esm.json +8 -0
- package/javascript/types.ts +674 -0
- package/package.json +87 -0
|
@@ -0,0 +1,571 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief SIMD-accelerated MaxSim (ColBERT Late Interaction).
|
|
3
|
+
* @file include/numkong/maxsim.h
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date February 17, 2026
|
|
6
|
+
*
|
|
7
|
+
* Computes angular distance late-interaction: result = Σᵢ minⱼ angular(qᵢ, dⱼ).
|
|
8
|
+
* Angular distance = 1 - dot(q, d) / sqrt(||q||² × ||d||²), clamped >= 0.
|
|
9
|
+
*
|
|
10
|
+
* Strategy: coarse i8-quantized screening with running argmax (dot as proxy for argmin angular),
|
|
11
|
+
* then full-precision refinement of the winning pairs via nk_dot_* primitives,
|
|
12
|
+
* finalized with angular distance and accumulated with `f64`.
|
|
13
|
+
*
|
|
14
|
+
* Precision policy:
|
|
15
|
+
* - `f32` inputs keep packed payloads and metadata narrow for memory bandwidth.
|
|
16
|
+
* - The refined scores and final late-interaction sum widen to `f64`.
|
|
17
|
+
*
|
|
18
|
+
* It implements several operations:
|
|
19
|
+
*
|
|
20
|
+
* - "maxsim_packed" - computing MaxSim where both Q and D are pre-packed into optimal form
|
|
21
|
+
* - "maxsim_packed_size" - estimating the memory requirements for external malloc
|
|
22
|
+
* - "maxsim_pack" - performing the pre-processing (quantization + original copy)
|
|
23
|
+
*
|
|
24
|
+
* @section maxsim_api Two-Phase API
|
|
25
|
+
*
|
|
26
|
+
* @code{.c}
|
|
27
|
+
* // Pack query and document matrices
|
|
28
|
+
* nk_size_t query_bytes = nk_maxsim_packed_size_bf16(query_count, depth);
|
|
29
|
+
* nk_size_t document_bytes = nk_maxsim_packed_size_bf16(document_count, depth);
|
|
30
|
+
* void *query_packed = malloc(query_bytes);
|
|
31
|
+
* void *document_packed = malloc(document_bytes);
|
|
32
|
+
* nk_maxsim_pack_bf16(queries, query_count, depth, depth * sizeof(nk_bf16_t), query_packed);
|
|
33
|
+
* nk_maxsim_pack_bf16(documents, document_count, depth, depth * sizeof(nk_bf16_t), document_packed);
|
|
34
|
+
*
|
|
35
|
+
* // Compute MaxSim score
|
|
36
|
+
* nk_f32_t score;
|
|
37
|
+
* nk_maxsim_packed_bf16(query_packed, document_packed, query_count, document_count, depth, &score);
|
|
38
|
+
* @endcode
|
|
39
|
+
*
|
|
40
|
+
* @section maxsim_packed_layout Packed Buffer Layout
|
|
41
|
+
*
|
|
42
|
+
* [Header 64B] [i8 vectors, 64B-aligned] [metadata, 64B-aligned] [originals row-major, 64B-aligned]
|
|
43
|
+
*
|
|
44
|
+
* The packed format is backend-specific: different ISAs use different i8 depth padding
|
|
45
|
+
* and clamp ranges. Pack with the matching ISA's pack function.
|
|
46
|
+
*
|
|
47
|
+
* @section maxsim_isa_support ISA Support
|
|
48
|
+
*
|
|
49
|
+
* Currently implemented (see also the sapphireamx, alder, and v128relaxed declarations below):
|
|
50
|
+
* - Serial: scalar reference (all platforms)
|
|
51
|
+
* - Haswell: AVX2 VPMADDUBSW coarse [-79,79] + bias correction (bf16/f32/f16)
|
|
52
|
+
* - Icelake: AVX-512 VNNI VPDPBUSD coarse (f32/f16)
|
|
53
|
+
* - Genoa: AVX-512 VNNI coarse + VDPBF16PS refinement (bf16 only)
|
|
54
|
+
* - NEONSDOT: ARM SDOT (vdotq_s32) coarse, no bias correction (bf16/f32/f16)
|
|
55
|
+
* - SME: ARM fused BFMOPA (existing, unchanged)
|
|
56
|
+
*/
|
|
57
|
+
#ifndef NK_MAXSIM_H
|
|
58
|
+
#define NK_MAXSIM_H
|
|
59
|
+
|
|
60
|
+
#include "numkong/types.h"
|
|
61
|
+
|
|
62
|
+
#if defined(__cplusplus)
|
|
63
|
+
extern "C" {
|
|
64
|
+
#endif
|
|
65
|
+
|
|
66
|
+
/**
|
|
67
|
+
* @brief Returns packed buffer size in bytes for a maxsim vector set.
|
|
68
|
+
* @param[in] vector_count The number of vectors to pack.
|
|
69
|
+
* @param[in] depth The number of dimensions per vector.
|
|
70
|
+
* @note The packed layout is backend-specific and must be produced by the matching pack function.
|
|
71
|
+
*/
|
|
72
|
+
NK_DYNAMIC nk_size_t nk_maxsim_packed_size_bf16(nk_size_t vector_count, nk_size_t depth);
|
|
73
|
+
/** @copydoc nk_maxsim_packed_size_bf16 */
|
|
74
|
+
NK_DYNAMIC nk_size_t nk_maxsim_packed_size_f32(nk_size_t vector_count, nk_size_t depth);
|
|
75
|
+
/** @copydoc nk_maxsim_packed_size_bf16 */
|
|
76
|
+
NK_DYNAMIC nk_size_t nk_maxsim_packed_size_f16(nk_size_t vector_count, nk_size_t depth);
|
|
77
|
+
|
|
78
|
+
/**
|
|
79
|
+
* @brief Packs vectors into a backend-specific layout for maxsim computation.
|
|
80
|
+
* @param[in] vectors The input vectors in row-major order.
|
|
81
|
+
* @param[in] vector_count The number of vectors.
|
|
82
|
+
* @param[in] depth The number of dimensions per vector.
|
|
83
|
+
* @param[in] stride The row stride in bytes for the input vectors.
|
|
84
|
+
* @param[out] packed The output packed buffer, allocated with the size reported by nk_maxsim_packed_size_bf16.
|
|
85
|
+
*/
|
|
86
|
+
NK_DYNAMIC void nk_maxsim_pack_bf16(nk_bf16_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t stride,
|
|
87
|
+
void *packed);
|
|
88
|
+
/** @copydoc nk_maxsim_pack_bf16 */
|
|
89
|
+
NK_DYNAMIC void nk_maxsim_pack_f32(nk_f32_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t stride,
|
|
90
|
+
void *packed);
|
|
91
|
+
/** @copydoc nk_maxsim_pack_bf16 */
|
|
92
|
+
NK_DYNAMIC void nk_maxsim_pack_f16(nk_f16_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t stride,
|
|
93
|
+
void *packed);
|
|
94
|
+
|
|
95
|
+
/**
|
|
96
|
+
* @brief Computes angular distance late-interaction on pre-packed vectors.
|
|
97
|
+
* Writes Σᵢ minⱼ angular(qᵢ, dⱼ) to `result`, where angular = 1 - dot / sqrt(||q||² × ||d||²).
|
|
98
|
+
*
|
|
99
|
+
* @param[in] query_packed Packed query vectors (from nk_maxsim_pack_bf16).
|
|
100
|
+
* @param[in] document_packed Packed document vectors (from nk_maxsim_pack_bf16).
|
|
101
|
+
* @param[in] query_count Number of query vectors.
|
|
102
|
+
* @param[in] document_count Number of document vectors.
|
|
103
|
+
* @param[in] depth Number of dimensions per vector.
|
|
104
|
+
* @param[out] result Pointer to store the sum of per-query minimum angular distances.
|
|
105
|
+
*/
|
|
106
|
+
NK_DYNAMIC void nk_maxsim_packed_bf16(void const *query_packed, void const *document_packed, nk_size_t query_count,
|
|
107
|
+
nk_size_t document_count, nk_size_t depth, nk_f32_t *result);
|
|
108
|
+
/** @copydoc nk_maxsim_packed_bf16 */
|
|
109
|
+
NK_DYNAMIC void nk_maxsim_packed_f32(void const *query_packed, void const *document_packed, nk_size_t query_count,
|
|
110
|
+
nk_size_t document_count, nk_size_t depth, nk_f64_t *result);
|
|
111
|
+
/** @copydoc nk_maxsim_packed_bf16 */
|
|
112
|
+
NK_DYNAMIC void nk_maxsim_packed_f16(void const *query_packed, void const *document_packed, nk_size_t query_count,
|
|
113
|
+
nk_size_t document_count, nk_size_t depth, nk_f32_t *result);
|
|
114
|
+
|
|
115
|
+
/** @copydoc nk_maxsim_packed_size_bf16 */
|
|
116
|
+
NK_PUBLIC nk_size_t nk_maxsim_packed_size_bf16_serial(nk_size_t vector_count, nk_size_t depth);
|
|
117
|
+
/** @copydoc nk_maxsim_packed_size_bf16 */
|
|
118
|
+
NK_PUBLIC nk_size_t nk_maxsim_packed_size_f32_serial(nk_size_t vector_count, nk_size_t depth);
|
|
119
|
+
/** @copydoc nk_maxsim_packed_size_bf16 */
|
|
120
|
+
NK_PUBLIC nk_size_t nk_maxsim_packed_size_f16_serial(nk_size_t vector_count, nk_size_t depth);
|
|
121
|
+
/** @copydoc nk_maxsim_pack_bf16 */
|
|
122
|
+
NK_PUBLIC void nk_maxsim_pack_bf16_serial(nk_bf16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
|
|
123
|
+
nk_size_t stride, void *packed);
|
|
124
|
+
/** @copydoc nk_maxsim_pack_bf16 */
|
|
125
|
+
NK_PUBLIC void nk_maxsim_pack_f32_serial(nk_f32_t const *vectors, nk_size_t vector_count, nk_size_t depth,
|
|
126
|
+
nk_size_t stride, void *packed);
|
|
127
|
+
/** @copydoc nk_maxsim_pack_bf16 */
|
|
128
|
+
NK_PUBLIC void nk_maxsim_pack_f16_serial(nk_f16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
|
|
129
|
+
nk_size_t stride, void *packed);
|
|
130
|
+
/** @copydoc nk_maxsim_packed_bf16 */
|
|
131
|
+
NK_PUBLIC void nk_maxsim_packed_bf16_serial(void const *query_packed, void const *document_packed,
|
|
132
|
+
nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
|
|
133
|
+
nk_f32_t *result);
|
|
134
|
+
/** @copydoc nk_maxsim_packed_bf16 */
|
|
135
|
+
NK_PUBLIC void nk_maxsim_packed_f32_serial(void const *query_packed, void const *document_packed, nk_size_t query_count,
|
|
136
|
+
nk_size_t document_count, nk_size_t depth, nk_f64_t *result);
|
|
137
|
+
/** @copydoc nk_maxsim_packed_bf16 */
|
|
138
|
+
NK_PUBLIC void nk_maxsim_packed_f16_serial(void const *query_packed, void const *document_packed, nk_size_t query_count,
|
|
139
|
+
nk_size_t document_count, nk_size_t depth, nk_f32_t *result);
|
|
140
|
+
|
|
141
|
+
#if NK_TARGET_ICELAKE
|
|
142
|
+
/** @copydoc nk_maxsim_packed_size_bf16 */
|
|
143
|
+
NK_PUBLIC nk_size_t nk_maxsim_packed_size_f32_icelake(nk_size_t vector_count, nk_size_t depth);
|
|
144
|
+
/** @copydoc nk_maxsim_packed_size_bf16 */
|
|
145
|
+
NK_PUBLIC nk_size_t nk_maxsim_packed_size_f16_icelake(nk_size_t vector_count, nk_size_t depth);
|
|
146
|
+
/** @copydoc nk_maxsim_pack_bf16 */
|
|
147
|
+
NK_PUBLIC void nk_maxsim_pack_f32_icelake(nk_f32_t const *vectors, nk_size_t vector_count, nk_size_t depth,
|
|
148
|
+
nk_size_t stride, void *packed);
|
|
149
|
+
/** @copydoc nk_maxsim_pack_bf16 */
|
|
150
|
+
NK_PUBLIC void nk_maxsim_pack_f16_icelake(nk_f16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
|
|
151
|
+
nk_size_t stride, void *packed);
|
|
152
|
+
/** @copydoc nk_maxsim_packed_bf16 */
|
|
153
|
+
NK_PUBLIC void nk_maxsim_packed_f32_icelake(void const *query_packed, void const *document_packed,
|
|
154
|
+
nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
|
|
155
|
+
nk_f64_t *result);
|
|
156
|
+
/** @copydoc nk_maxsim_packed_bf16 */
|
|
157
|
+
NK_PUBLIC void nk_maxsim_packed_f16_icelake(void const *query_packed, void const *document_packed,
|
|
158
|
+
nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
|
|
159
|
+
nk_f32_t *result);
|
|
160
|
+
#endif // NK_TARGET_ICELAKE
|
|
161
|
+
|
|
162
|
+
#if NK_TARGET_GENOA
|
|
163
|
+
/** @copydoc nk_maxsim_packed_size_bf16 */
|
|
164
|
+
NK_PUBLIC nk_size_t nk_maxsim_packed_size_bf16_genoa(nk_size_t vector_count, nk_size_t depth);
|
|
165
|
+
/** @copydoc nk_maxsim_pack_bf16 */
|
|
166
|
+
NK_PUBLIC void nk_maxsim_pack_bf16_genoa(nk_bf16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
|
|
167
|
+
nk_size_t stride, void *packed);
|
|
168
|
+
/** @copydoc nk_maxsim_packed_bf16 */
|
|
169
|
+
NK_PUBLIC void nk_maxsim_packed_bf16_genoa(void const *query_packed, void const *document_packed, nk_size_t query_count,
|
|
170
|
+
nk_size_t document_count, nk_size_t depth, nk_f32_t *result);
|
|
171
|
+
#endif // NK_TARGET_GENOA
|
|
172
|
+
|
|
173
|
+
#if NK_TARGET_SAPPHIREAMX
|
|
174
|
+
/** @copydoc nk_maxsim_packed_size_bf16 */
|
|
175
|
+
NK_PUBLIC nk_size_t nk_maxsim_packed_size_bf16_sapphireamx(nk_size_t vector_count, nk_size_t depth);
|
|
176
|
+
/** @copydoc nk_maxsim_packed_size_bf16 */
|
|
177
|
+
NK_PUBLIC nk_size_t nk_maxsim_packed_size_f32_sapphireamx(nk_size_t vector_count, nk_size_t depth);
|
|
178
|
+
/** @copydoc nk_maxsim_packed_size_bf16 */
|
|
179
|
+
NK_PUBLIC nk_size_t nk_maxsim_packed_size_f16_sapphireamx(nk_size_t vector_count, nk_size_t depth);
|
|
180
|
+
/** @copydoc nk_maxsim_pack_bf16 */
|
|
181
|
+
NK_PUBLIC void nk_maxsim_pack_bf16_sapphireamx(nk_bf16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
|
|
182
|
+
nk_size_t stride, void *packed);
|
|
183
|
+
/** @copydoc nk_maxsim_pack_bf16 */
|
|
184
|
+
NK_PUBLIC void nk_maxsim_pack_f32_sapphireamx(nk_f32_t const *vectors, nk_size_t vector_count, nk_size_t depth,
|
|
185
|
+
nk_size_t stride, void *packed);
|
|
186
|
+
/** @copydoc nk_maxsim_pack_bf16 */
|
|
187
|
+
NK_PUBLIC void nk_maxsim_pack_f16_sapphireamx(nk_f16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
|
|
188
|
+
nk_size_t stride, void *packed);
|
|
189
|
+
/** @copydoc nk_maxsim_packed_bf16 */
|
|
190
|
+
NK_PUBLIC void nk_maxsim_packed_bf16_sapphireamx(void const *query_packed, void const *document_packed,
|
|
191
|
+
nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
|
|
192
|
+
nk_f32_t *result);
|
|
193
|
+
/** @copydoc nk_maxsim_packed_bf16 */
|
|
194
|
+
NK_PUBLIC void nk_maxsim_packed_f32_sapphireamx(void const *query_packed, void const *document_packed,
|
|
195
|
+
nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
|
|
196
|
+
nk_f64_t *result);
|
|
197
|
+
/** @copydoc nk_maxsim_packed_bf16 */
|
|
198
|
+
NK_PUBLIC void nk_maxsim_packed_f16_sapphireamx(void const *query_packed, void const *document_packed,
|
|
199
|
+
nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
|
|
200
|
+
nk_f32_t *result);
|
|
201
|
+
#endif // NK_TARGET_SAPPHIREAMX
|
|
202
|
+
|
|
203
|
+
#if NK_TARGET_HASWELL
|
|
204
|
+
/** @copydoc nk_maxsim_packed_size_bf16 */
|
|
205
|
+
NK_PUBLIC nk_size_t nk_maxsim_packed_size_bf16_haswell(nk_size_t vector_count, nk_size_t depth);
|
|
206
|
+
/** @copydoc nk_maxsim_packed_size_bf16 */
|
|
207
|
+
NK_PUBLIC nk_size_t nk_maxsim_packed_size_f32_haswell(nk_size_t vector_count, nk_size_t depth);
|
|
208
|
+
/** @copydoc nk_maxsim_packed_size_bf16 */
|
|
209
|
+
NK_PUBLIC nk_size_t nk_maxsim_packed_size_f16_haswell(nk_size_t vector_count, nk_size_t depth);
|
|
210
|
+
/** @copydoc nk_maxsim_pack_bf16 */
|
|
211
|
+
NK_PUBLIC void nk_maxsim_pack_bf16_haswell(nk_bf16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
|
|
212
|
+
nk_size_t stride, void *packed);
|
|
213
|
+
/** @copydoc nk_maxsim_pack_bf16 */
|
|
214
|
+
NK_PUBLIC void nk_maxsim_pack_f32_haswell(nk_f32_t const *vectors, nk_size_t vector_count, nk_size_t depth,
|
|
215
|
+
nk_size_t stride, void *packed);
|
|
216
|
+
/** @copydoc nk_maxsim_pack_bf16 */
|
|
217
|
+
NK_PUBLIC void nk_maxsim_pack_f16_haswell(nk_f16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
|
|
218
|
+
nk_size_t stride, void *packed);
|
|
219
|
+
/** @copydoc nk_maxsim_packed_bf16 */
|
|
220
|
+
NK_PUBLIC void nk_maxsim_packed_bf16_haswell(void const *query_packed, void const *document_packed,
|
|
221
|
+
nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
|
|
222
|
+
nk_f32_t *result);
|
|
223
|
+
/** @copydoc nk_maxsim_packed_bf16 */
|
|
224
|
+
NK_PUBLIC void nk_maxsim_packed_f32_haswell(void const *query_packed, void const *document_packed,
|
|
225
|
+
nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
|
|
226
|
+
nk_f64_t *result);
|
|
227
|
+
/** @copydoc nk_maxsim_packed_bf16 */
|
|
228
|
+
NK_PUBLIC void nk_maxsim_packed_f16_haswell(void const *query_packed, void const *document_packed,
|
|
229
|
+
nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
|
|
230
|
+
nk_f32_t *result);
|
|
231
|
+
#endif // NK_TARGET_HASWELL
|
|
232
|
+
|
|
233
|
+
#if NK_TARGET_ALDER
|
|
234
|
+
/** @copydoc nk_maxsim_packed_size_bf16 */
|
|
235
|
+
NK_PUBLIC nk_size_t nk_maxsim_packed_size_bf16_alder(nk_size_t vector_count, nk_size_t depth);
|
|
236
|
+
/** @copydoc nk_maxsim_packed_size_bf16 */
|
|
237
|
+
NK_PUBLIC nk_size_t nk_maxsim_packed_size_f32_alder(nk_size_t vector_count, nk_size_t depth);
|
|
238
|
+
/** @copydoc nk_maxsim_packed_size_bf16 */
|
|
239
|
+
NK_PUBLIC nk_size_t nk_maxsim_packed_size_f16_alder(nk_size_t vector_count, nk_size_t depth);
|
|
240
|
+
/** @copydoc nk_maxsim_pack_bf16 */
|
|
241
|
+
NK_PUBLIC void nk_maxsim_pack_bf16_alder(nk_bf16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
|
|
242
|
+
nk_size_t stride, void *packed);
|
|
243
|
+
/** @copydoc nk_maxsim_pack_bf16 */
|
|
244
|
+
NK_PUBLIC void nk_maxsim_pack_f32_alder(nk_f32_t const *vectors, nk_size_t vector_count, nk_size_t depth,
|
|
245
|
+
nk_size_t stride, void *packed);
|
|
246
|
+
/** @copydoc nk_maxsim_pack_bf16 */
|
|
247
|
+
NK_PUBLIC void nk_maxsim_pack_f16_alder(nk_f16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
|
|
248
|
+
nk_size_t stride, void *packed);
|
|
249
|
+
/** @copydoc nk_maxsim_packed_bf16 */
|
|
250
|
+
NK_PUBLIC void nk_maxsim_packed_bf16_alder(void const *query_packed, void const *document_packed, nk_size_t query_count,
|
|
251
|
+
nk_size_t document_count, nk_size_t depth, nk_f32_t *result);
|
|
252
|
+
/** @copydoc nk_maxsim_packed_bf16 */
|
|
253
|
+
NK_PUBLIC void nk_maxsim_packed_f32_alder(void const *query_packed, void const *document_packed, nk_size_t query_count,
|
|
254
|
+
nk_size_t document_count, nk_size_t depth, nk_f64_t *result);
|
|
255
|
+
/** @copydoc nk_maxsim_packed_bf16 */
|
|
256
|
+
NK_PUBLIC void nk_maxsim_packed_f16_alder(void const *query_packed, void const *document_packed, nk_size_t query_count,
|
|
257
|
+
nk_size_t document_count, nk_size_t depth, nk_f32_t *result);
|
|
258
|
+
#endif // NK_TARGET_ALDER
|
|
259
|
+
|
|
260
|
+
#if NK_TARGET_V128RELAXED
// Prototypes of the `_v128relaxed` kernels; visible only when `NK_TARGET_V128RELAXED` is enabled.
// For each of bf16, f32 and f16 there is a packed-size query, a pack routine, and a packed kernel.

/** @copydoc nk_maxsim_packed_size_bf16 */
NK_PUBLIC nk_size_t nk_maxsim_packed_size_bf16_v128relaxed(nk_size_t vector_count, nk_size_t depth);
/** @copydoc nk_maxsim_packed_size_bf16 */
NK_PUBLIC nk_size_t nk_maxsim_packed_size_f32_v128relaxed(nk_size_t vector_count, nk_size_t depth);
/** @copydoc nk_maxsim_packed_size_bf16 */
NK_PUBLIC nk_size_t nk_maxsim_packed_size_f16_v128relaxed(nk_size_t vector_count, nk_size_t depth);

/** @copydoc nk_maxsim_pack_bf16 */
NK_PUBLIC void nk_maxsim_pack_bf16_v128relaxed(nk_bf16_t const *vectors, nk_size_t vector_count,
                                               nk_size_t depth, nk_size_t stride, void *packed);
/** @copydoc nk_maxsim_pack_bf16 */
NK_PUBLIC void nk_maxsim_pack_f32_v128relaxed(nk_f32_t const *vectors, nk_size_t vector_count,
                                              nk_size_t depth, nk_size_t stride, void *packed);
/** @copydoc nk_maxsim_pack_bf16 */
NK_PUBLIC void nk_maxsim_pack_f16_v128relaxed(nk_f16_t const *vectors, nk_size_t vector_count,
                                              nk_size_t depth, nk_size_t stride, void *packed);

/** @copydoc nk_maxsim_packed_bf16 */
NK_PUBLIC void nk_maxsim_packed_bf16_v128relaxed(void const *query_packed, void const *document_packed,
                                                 nk_size_t query_count, nk_size_t document_count,
                                                 nk_size_t depth, nk_f32_t *result);
/** @copydoc nk_maxsim_packed_bf16 */
NK_PUBLIC void nk_maxsim_packed_f32_v128relaxed(void const *query_packed, void const *document_packed,
                                                nk_size_t query_count, nk_size_t document_count,
                                                nk_size_t depth, nk_f64_t *result);
/** @copydoc nk_maxsim_packed_bf16 */
NK_PUBLIC void nk_maxsim_packed_f16_v128relaxed(void const *query_packed, void const *document_packed,
                                                nk_size_t query_count, nk_size_t document_count,
                                                nk_size_t depth, nk_f32_t *result);
#endif // NK_TARGET_V128RELAXED
|
|
289
|
+
|
|
290
|
+
#if NK_TARGET_NEONSDOT
// Prototypes of the `_neonsdot` kernels; visible only when `NK_TARGET_NEONSDOT` is enabled.
// For each of bf16, f32 and f16 there is a packed-size query, a pack routine, and a packed kernel.

/** @copydoc nk_maxsim_packed_size_bf16 */
NK_PUBLIC nk_size_t nk_maxsim_packed_size_bf16_neonsdot(nk_size_t vector_count, nk_size_t depth);
/** @copydoc nk_maxsim_packed_size_bf16 */
NK_PUBLIC nk_size_t nk_maxsim_packed_size_f32_neonsdot(nk_size_t vector_count, nk_size_t depth);
/** @copydoc nk_maxsim_packed_size_bf16 */
NK_PUBLIC nk_size_t nk_maxsim_packed_size_f16_neonsdot(nk_size_t vector_count, nk_size_t depth);

/** @copydoc nk_maxsim_pack_bf16 */
NK_PUBLIC void nk_maxsim_pack_bf16_neonsdot(nk_bf16_t const *vectors, nk_size_t vector_count,
                                            nk_size_t depth, nk_size_t stride, void *packed);
/** @copydoc nk_maxsim_pack_bf16 */
NK_PUBLIC void nk_maxsim_pack_f32_neonsdot(nk_f32_t const *vectors, nk_size_t vector_count,
                                           nk_size_t depth, nk_size_t stride, void *packed);
/** @copydoc nk_maxsim_pack_bf16 */
NK_PUBLIC void nk_maxsim_pack_f16_neonsdot(nk_f16_t const *vectors, nk_size_t vector_count,
                                           nk_size_t depth, nk_size_t stride, void *packed);

/** @copydoc nk_maxsim_packed_bf16 */
NK_PUBLIC void nk_maxsim_packed_bf16_neonsdot(void const *query_packed, void const *document_packed,
                                              nk_size_t query_count, nk_size_t document_count,
                                              nk_size_t depth, nk_f32_t *result);
/** @copydoc nk_maxsim_packed_bf16 */
NK_PUBLIC void nk_maxsim_packed_f32_neonsdot(void const *query_packed, void const *document_packed,
                                             nk_size_t query_count, nk_size_t document_count,
                                             nk_size_t depth, nk_f64_t *result);
/** @copydoc nk_maxsim_packed_bf16 */
NK_PUBLIC void nk_maxsim_packed_f16_neonsdot(void const *query_packed, void const *document_packed,
                                             nk_size_t query_count, nk_size_t document_count,
                                             nk_size_t depth, nk_f32_t *result);
#endif // NK_TARGET_NEONSDOT
|
|
319
|
+
|
|
320
|
+
#if NK_TARGET_SME
// Prototypes of the `_sme` kernels; visible only when `NK_TARGET_SME` is enabled.
// For each of bf16, f16 and f32 there is a packed-size query, a pack routine, and a packed kernel.

/** @copydoc nk_maxsim_packed_size_bf16 */
NK_PUBLIC nk_size_t nk_maxsim_packed_size_bf16_sme(nk_size_t vector_count, nk_size_t depth);
/** @copydoc nk_maxsim_packed_size_bf16 */
NK_PUBLIC nk_size_t nk_maxsim_packed_size_f16_sme(nk_size_t vector_count, nk_size_t depth);
/** @copydoc nk_maxsim_packed_size_bf16 */
NK_PUBLIC nk_size_t nk_maxsim_packed_size_f32_sme(nk_size_t vector_count, nk_size_t depth);

/** @copydoc nk_maxsim_pack_bf16 */
NK_PUBLIC void nk_maxsim_pack_bf16_sme(nk_bf16_t const *vectors, nk_size_t vector_count,
                                       nk_size_t depth, nk_size_t stride, void *packed);
/** @copydoc nk_maxsim_pack_bf16 */
NK_PUBLIC void nk_maxsim_pack_f16_sme(nk_f16_t const *vectors, nk_size_t vector_count,
                                      nk_size_t depth, nk_size_t stride, void *packed);
/** @copydoc nk_maxsim_pack_bf16 */
NK_PUBLIC void nk_maxsim_pack_f32_sme(nk_f32_t const *vectors, nk_size_t vector_count,
                                      nk_size_t depth, nk_size_t stride, void *packed);

/** @copydoc nk_maxsim_packed_bf16 */
NK_PUBLIC void nk_maxsim_packed_bf16_sme(void const *query_packed, void const *document_packed,
                                         nk_size_t query_count, nk_size_t document_count,
                                         nk_size_t depth, nk_f32_t *result);
/** @copydoc nk_maxsim_packed_bf16 */
NK_PUBLIC void nk_maxsim_packed_f16_sme(void const *query_packed, void const *document_packed,
                                        nk_size_t query_count, nk_size_t document_count,
                                        nk_size_t depth, nk_f32_t *result);
/** @copydoc nk_maxsim_packed_bf16 */
NK_PUBLIC void nk_maxsim_packed_f32_sme(void const *query_packed, void const *document_packed,
                                        nk_size_t query_count, nk_size_t document_count,
                                        nk_size_t depth, nk_f64_t *result);
#endif // NK_TARGET_SME
|
|
346
|
+
|
|
347
|
+
/**
|
|
348
|
+
* @brief Returns the output dtype for MaxSim late-interaction.
|
|
349
|
+
*/
|
|
350
|
+
NK_INTERNAL nk_dtype_t nk_maxsim_output_dtype(nk_dtype_t dtype) {
|
|
351
|
+
switch (dtype) {
|
|
352
|
+
case nk_f32_k: return nk_f64_k;
|
|
353
|
+
case nk_f16_k: return nk_f32_k;
|
|
354
|
+
case nk_bf16_k: return nk_f32_k;
|
|
355
|
+
default: return nk_dtype_unknown_k;
|
|
356
|
+
}
|
|
357
|
+
}
|
|
358
|
+
|
|
359
|
+
#if defined(__cplusplus)
|
|
360
|
+
} // extern "C"
|
|
361
|
+
#endif
|
|
362
|
+
|
|
363
|
+
#include "numkong/maxsim/serial.h"
|
|
364
|
+
#include "numkong/maxsim/haswell.h"
|
|
365
|
+
#include "numkong/maxsim/alder.h"
|
|
366
|
+
#include "numkong/maxsim/icelake.h"
|
|
367
|
+
#include "numkong/maxsim/genoa.h"
|
|
368
|
+
#include "numkong/maxsim/sapphireamx.h"
|
|
369
|
+
#include "numkong/maxsim/neonsdot.h"
|
|
370
|
+
#include "numkong/maxsim/sme.h"
|
|
371
|
+
#include "numkong/maxsim/v128relaxed.h"
|
|
372
|
+
|
|
373
|
+
#if defined(__cplusplus)
|
|
374
|
+
extern "C" {
|
|
375
|
+
#endif
|
|
376
|
+
|
|
377
|
+
#if !NK_DYNAMIC_DISPATCH
|
|
378
|
+
|
|
379
|
+
// Compile-time dispatch: forwards to the first backend whose `NK_TARGET_*` flag is enabled,
// in the order tested below, falling back to the serial implementation when none is.
NK_PUBLIC nk_size_t nk_maxsim_packed_size_bf16(nk_size_t vector_count, nk_size_t depth) {
    nk_size_t packed_size;
#if NK_TARGET_SME
    packed_size = nk_maxsim_packed_size_bf16_sme(vector_count, depth);
#elif NK_TARGET_SAPPHIREAMX
    packed_size = nk_maxsim_packed_size_bf16_sapphireamx(vector_count, depth);
#elif NK_TARGET_GENOA
    packed_size = nk_maxsim_packed_size_bf16_genoa(vector_count, depth);
#elif NK_TARGET_ALDER
    packed_size = nk_maxsim_packed_size_bf16_alder(vector_count, depth);
#elif NK_TARGET_HASWELL
    packed_size = nk_maxsim_packed_size_bf16_haswell(vector_count, depth);
#elif NK_TARGET_NEONSDOT
    packed_size = nk_maxsim_packed_size_bf16_neonsdot(vector_count, depth);
#elif NK_TARGET_V128RELAXED
    packed_size = nk_maxsim_packed_size_bf16_v128relaxed(vector_count, depth);
#else
    packed_size = nk_maxsim_packed_size_bf16_serial(vector_count, depth);
#endif
    return packed_size;
}
|
|
398
|
+
|
|
399
|
+
// Compile-time dispatch: forwards to the first backend whose `NK_TARGET_*` flag is enabled,
// in the order tested below, falling back to the serial implementation when none is.
NK_PUBLIC nk_size_t nk_maxsim_packed_size_f32(nk_size_t vector_count, nk_size_t depth) {
    nk_size_t packed_size;
#if NK_TARGET_SME
    packed_size = nk_maxsim_packed_size_f32_sme(vector_count, depth);
#elif NK_TARGET_SAPPHIREAMX
    packed_size = nk_maxsim_packed_size_f32_sapphireamx(vector_count, depth);
#elif NK_TARGET_ICELAKE
    packed_size = nk_maxsim_packed_size_f32_icelake(vector_count, depth);
#elif NK_TARGET_ALDER
    packed_size = nk_maxsim_packed_size_f32_alder(vector_count, depth);
#elif NK_TARGET_HASWELL
    packed_size = nk_maxsim_packed_size_f32_haswell(vector_count, depth);
#elif NK_TARGET_NEONSDOT
    packed_size = nk_maxsim_packed_size_f32_neonsdot(vector_count, depth);
#elif NK_TARGET_V128RELAXED
    packed_size = nk_maxsim_packed_size_f32_v128relaxed(vector_count, depth);
#else
    packed_size = nk_maxsim_packed_size_f32_serial(vector_count, depth);
#endif
    return packed_size;
}
|
|
418
|
+
|
|
419
|
+
// Compile-time dispatch: forwards to the first backend whose `NK_TARGET_*` flag is enabled,
// in the order tested below, falling back to the serial implementation when none is.
NK_PUBLIC nk_size_t nk_maxsim_packed_size_f16(nk_size_t vector_count, nk_size_t depth) {
    nk_size_t packed_size;
#if NK_TARGET_SME
    packed_size = nk_maxsim_packed_size_f16_sme(vector_count, depth);
#elif NK_TARGET_SAPPHIREAMX
    packed_size = nk_maxsim_packed_size_f16_sapphireamx(vector_count, depth);
#elif NK_TARGET_ICELAKE
    packed_size = nk_maxsim_packed_size_f16_icelake(vector_count, depth);
#elif NK_TARGET_ALDER
    packed_size = nk_maxsim_packed_size_f16_alder(vector_count, depth);
#elif NK_TARGET_HASWELL
    packed_size = nk_maxsim_packed_size_f16_haswell(vector_count, depth);
#elif NK_TARGET_NEONSDOT
    packed_size = nk_maxsim_packed_size_f16_neonsdot(vector_count, depth);
#elif NK_TARGET_V128RELAXED
    packed_size = nk_maxsim_packed_size_f16_v128relaxed(vector_count, depth);
#else
    packed_size = nk_maxsim_packed_size_f16_serial(vector_count, depth);
#endif
    return packed_size;
}
|
|
438
|
+
|
|
439
|
+
// Compile-time dispatch: hands all five arguments, unchanged, to the first backend whose
// `NK_TARGET_*` flag is enabled (order as tested below); the serial kernel is the fallback.
NK_PUBLIC void nk_maxsim_pack_bf16(nk_bf16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
                                   nk_size_t stride, void *packed) {
#if NK_TARGET_SME
    nk_maxsim_pack_bf16_sme(vectors, vector_count, depth, stride, packed);
#elif NK_TARGET_SAPPHIREAMX
    nk_maxsim_pack_bf16_sapphireamx(vectors, vector_count, depth, stride, packed);
#elif NK_TARGET_GENOA
    nk_maxsim_pack_bf16_genoa(vectors, vector_count, depth, stride, packed);
#elif NK_TARGET_ALDER
    nk_maxsim_pack_bf16_alder(vectors, vector_count, depth, stride, packed);
#elif NK_TARGET_HASWELL
    nk_maxsim_pack_bf16_haswell(vectors, vector_count, depth, stride, packed);
#elif NK_TARGET_NEONSDOT
    nk_maxsim_pack_bf16_neonsdot(vectors, vector_count, depth, stride, packed);
#elif NK_TARGET_V128RELAXED
    nk_maxsim_pack_bf16_v128relaxed(vectors, vector_count, depth, stride, packed);
#else
    nk_maxsim_pack_bf16_serial(vectors, vector_count, depth, stride, packed);
#endif
}
|
|
459
|
+
|
|
460
|
+
// Compile-time dispatch: hands all five arguments, unchanged, to the first backend whose
// `NK_TARGET_*` flag is enabled (order as tested below); the serial kernel is the fallback.
NK_PUBLIC void nk_maxsim_pack_f32(nk_f32_t const *vectors, nk_size_t vector_count, nk_size_t depth,
                                  nk_size_t stride, void *packed) {
#if NK_TARGET_SME
    nk_maxsim_pack_f32_sme(vectors, vector_count, depth, stride, packed);
#elif NK_TARGET_SAPPHIREAMX
    nk_maxsim_pack_f32_sapphireamx(vectors, vector_count, depth, stride, packed);
#elif NK_TARGET_ICELAKE
    nk_maxsim_pack_f32_icelake(vectors, vector_count, depth, stride, packed);
#elif NK_TARGET_ALDER
    nk_maxsim_pack_f32_alder(vectors, vector_count, depth, stride, packed);
#elif NK_TARGET_HASWELL
    nk_maxsim_pack_f32_haswell(vectors, vector_count, depth, stride, packed);
#elif NK_TARGET_NEONSDOT
    nk_maxsim_pack_f32_neonsdot(vectors, vector_count, depth, stride, packed);
#elif NK_TARGET_V128RELAXED
    nk_maxsim_pack_f32_v128relaxed(vectors, vector_count, depth, stride, packed);
#else
    nk_maxsim_pack_f32_serial(vectors, vector_count, depth, stride, packed);
#endif
}
|
|
480
|
+
|
|
481
|
+
// Compile-time dispatch: hands all five arguments, unchanged, to the first backend whose
// `NK_TARGET_*` flag is enabled (order as tested below); the serial kernel is the fallback.
NK_PUBLIC void nk_maxsim_pack_f16(nk_f16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
                                  nk_size_t stride, void *packed) {
#if NK_TARGET_SME
    nk_maxsim_pack_f16_sme(vectors, vector_count, depth, stride, packed);
#elif NK_TARGET_SAPPHIREAMX
    nk_maxsim_pack_f16_sapphireamx(vectors, vector_count, depth, stride, packed);
#elif NK_TARGET_ICELAKE
    nk_maxsim_pack_f16_icelake(vectors, vector_count, depth, stride, packed);
#elif NK_TARGET_ALDER
    nk_maxsim_pack_f16_alder(vectors, vector_count, depth, stride, packed);
#elif NK_TARGET_HASWELL
    nk_maxsim_pack_f16_haswell(vectors, vector_count, depth, stride, packed);
#elif NK_TARGET_NEONSDOT
    nk_maxsim_pack_f16_neonsdot(vectors, vector_count, depth, stride, packed);
#elif NK_TARGET_V128RELAXED
    nk_maxsim_pack_f16_v128relaxed(vectors, vector_count, depth, stride, packed);
#else
    nk_maxsim_pack_f16_serial(vectors, vector_count, depth, stride, packed);
#endif
}
|
|
501
|
+
|
|
502
|
+
// Compile-time dispatch: hands all six arguments, unchanged, to the first backend whose
// `NK_TARGET_*` flag is enabled (order as tested below); the serial kernel is the fallback.
// The backend is handed `result` as an `nk_f32_t` pointer.
NK_PUBLIC void nk_maxsim_packed_bf16(void const *query_packed, void const *document_packed,
                                     nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
                                     nk_f32_t *result) {
#if NK_TARGET_SME
    nk_maxsim_packed_bf16_sme(query_packed, document_packed, query_count, document_count, depth, result);
#elif NK_TARGET_SAPPHIREAMX
    nk_maxsim_packed_bf16_sapphireamx(query_packed, document_packed, query_count, document_count, depth, result);
#elif NK_TARGET_GENOA
    nk_maxsim_packed_bf16_genoa(query_packed, document_packed, query_count, document_count, depth, result);
#elif NK_TARGET_ALDER
    nk_maxsim_packed_bf16_alder(query_packed, document_packed, query_count, document_count, depth, result);
#elif NK_TARGET_HASWELL
    nk_maxsim_packed_bf16_haswell(query_packed, document_packed, query_count, document_count, depth, result);
#elif NK_TARGET_NEONSDOT
    nk_maxsim_packed_bf16_neonsdot(query_packed, document_packed, query_count, document_count, depth, result);
#elif NK_TARGET_V128RELAXED
    nk_maxsim_packed_bf16_v128relaxed(query_packed, document_packed, query_count, document_count, depth, result);
#else
    nk_maxsim_packed_bf16_serial(query_packed, document_packed, query_count, document_count, depth, result);
#endif
}
|
|
522
|
+
|
|
523
|
+
// Compile-time dispatch: hands all six arguments, unchanged, to the first backend whose
// `NK_TARGET_*` flag is enabled (order as tested below); the serial kernel is the fallback.
// Unlike the bf16 and f16 entry points, `result` here is an `nk_f64_t` pointer.
NK_PUBLIC void nk_maxsim_packed_f32(void const *query_packed, void const *document_packed,
                                    nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
                                    nk_f64_t *result) {
#if NK_TARGET_SME
    nk_maxsim_packed_f32_sme(query_packed, document_packed, query_count, document_count, depth, result);
#elif NK_TARGET_SAPPHIREAMX
    nk_maxsim_packed_f32_sapphireamx(query_packed, document_packed, query_count, document_count, depth, result);
#elif NK_TARGET_ICELAKE
    nk_maxsim_packed_f32_icelake(query_packed, document_packed, query_count, document_count, depth, result);
#elif NK_TARGET_ALDER
    nk_maxsim_packed_f32_alder(query_packed, document_packed, query_count, document_count, depth, result);
#elif NK_TARGET_HASWELL
    nk_maxsim_packed_f32_haswell(query_packed, document_packed, query_count, document_count, depth, result);
#elif NK_TARGET_NEONSDOT
    nk_maxsim_packed_f32_neonsdot(query_packed, document_packed, query_count, document_count, depth, result);
#elif NK_TARGET_V128RELAXED
    nk_maxsim_packed_f32_v128relaxed(query_packed, document_packed, query_count, document_count, depth, result);
#else
    nk_maxsim_packed_f32_serial(query_packed, document_packed, query_count, document_count, depth, result);
#endif
}
|
|
543
|
+
|
|
544
|
+
// Compile-time dispatch: hands all six arguments, unchanged, to the first backend whose
// `NK_TARGET_*` flag is enabled (order as tested below); the serial kernel is the fallback.
// The backend is handed `result` as an `nk_f32_t` pointer.
NK_PUBLIC void nk_maxsim_packed_f16(void const *query_packed, void const *document_packed,
                                    nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
                                    nk_f32_t *result) {
#if NK_TARGET_SME
    nk_maxsim_packed_f16_sme(query_packed, document_packed, query_count, document_count, depth, result);
#elif NK_TARGET_SAPPHIREAMX
    nk_maxsim_packed_f16_sapphireamx(query_packed, document_packed, query_count, document_count, depth, result);
#elif NK_TARGET_ICELAKE
    nk_maxsim_packed_f16_icelake(query_packed, document_packed, query_count, document_count, depth, result);
#elif NK_TARGET_ALDER
    nk_maxsim_packed_f16_alder(query_packed, document_packed, query_count, document_count, depth, result);
#elif NK_TARGET_HASWELL
    nk_maxsim_packed_f16_haswell(query_packed, document_packed, query_count, document_count, depth, result);
#elif NK_TARGET_NEONSDOT
    nk_maxsim_packed_f16_neonsdot(query_packed, document_packed, query_count, document_count, depth, result);
#elif NK_TARGET_V128RELAXED
    nk_maxsim_packed_f16_v128relaxed(query_packed, document_packed, query_count, document_count, depth, result);
#else
    nk_maxsim_packed_f16_serial(query_packed, document_packed, query_count, document_count, depth, result);
#endif
}
|
|
564
|
+
|
|
565
|
+
#endif // !NK_DYNAMIC_DISPATCH
|
|
566
|
+
|
|
567
|
+
#if defined(__cplusplus)
|
|
568
|
+
} // extern "C"
|
|
569
|
+
#endif
|
|
570
|
+
|
|
571
|
+
#endif // NK_MAXSIM_H
|