numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,571 @@
1
+ /**
2
+ * @brief SIMD-accelerated MaxSim (ColBERT Late Interaction).
3
+ * @file include/numkong/maxsim.h
4
+ * @author Ash Vardanian
5
+ * @date February 17, 2026
6
+ *
7
+ * Computes angular distance late-interaction: result = Σᵢ minⱼ angular(qᵢ, dⱼ).
8
+ * Angular distance = 1 - dot(q, d) / sqrt(||q||² × ||d||²), clamped >= 0.
9
+ *
10
+ * Strategy: coarse i8-quantized screening with running argmax (dot as proxy for argmin angular),
11
+ * then full-precision refinement of the winning pairs via nk_dot_* primitives,
12
+ * finalized with angular distance and accumulated with `f64`.
13
+ *
14
+ * Precision policy:
15
+ * - `f32` inputs keep packed payloads and metadata narrow for memory bandwidth.
16
+ * - The refined scores and final late-interaction sum widen to `f64`.
17
+ *
18
+ * It implements several operations:
19
+ *
20
+ * - "maxsim_packed" - computing MaxSim where both Q and D are pre-packed into optimal form
21
+ * - "maxsim_packed_size" - estimating the memory requirements for external malloc
22
+ * - "maxsim_pack" - performing the pre-processing (quantization + original copy)
23
+ *
24
+ * @section maxsim_api Two-Phase API
25
+ *
26
+ * @code{.c}
27
+ * // Pack query and document matrices
28
+ * nk_size_t query_bytes = nk_maxsim_packed_size_bf16(query_count, depth);
29
+ * nk_size_t document_bytes = nk_maxsim_packed_size_bf16(document_count, depth);
30
+ * void *query_packed = malloc(query_bytes);
31
+ * void *document_packed = malloc(document_bytes);
32
+ * nk_maxsim_pack_bf16(queries, query_count, depth, depth * sizeof(nk_bf16_t), query_packed);
33
+ * nk_maxsim_pack_bf16(documents, document_count, depth, depth * sizeof(nk_bf16_t), document_packed);
34
+ *
35
+ * // Compute MaxSim score
36
+ * nk_f32_t score;
37
+ * nk_maxsim_packed_bf16(query_packed, document_packed, query_count, document_count, depth, &score);
38
+ * @endcode
39
+ *
40
+ * @section maxsim_packed_layout Packed Buffer Layout
41
+ *
42
+ * [Header 64B] [i8 vectors, 64B-aligned] [metadata, 64B-aligned] [originals row-major, 64B-aligned]
43
+ *
44
+ * The packed format is backend-specific: different ISAs use different i8 depth padding
45
+ * and clamp ranges. Pack with the matching ISA's pack function.
46
+ *
47
+ * @section maxsim_isa_support ISA Support
48
+ *
49
+ * Currently implemented (non-exhaustive; Sapphire AMX, Alder, and V128Relaxed kernels are also declared below):
50
+ * - Serial: scalar reference (all platforms)
51
+ * - Haswell: AVX2 VPMADDUBSW coarse [-79,79] + bias correction (bf16/f32/f16)
52
+ * - Icelake: AVX-512 VNNI VPDPBUSD coarse (f32/f16)
53
+ * - Genoa: AVX-512 VNNI coarse + VDPBF16PS refinement (bf16 only)
54
+ * - NEONSDOT: ARM SDOT (vdotq_s32) coarse, no bias correction (bf16/f32/f16)
55
+ * - SME: ARM fused BFMOPA (existing, unchanged)
56
+ */
57
+ #ifndef NK_MAXSIM_H
58
+ #define NK_MAXSIM_H
59
+
60
+ #include "numkong/types.h"
61
+
62
+ #if defined(__cplusplus)
63
+ extern "C" {
64
+ #endif
65
+
66
+ /**
67
+ * @brief Returns packed buffer size in bytes for a maxsim vector set.
68
+ * @param[in] vector_count The number of vectors to pack.
69
+ * @param[in] depth The number of dimensions per vector.
70
+ * @note The packed layout is backend-specific and must be produced by the matching pack function.
71
+ */
72
+ NK_DYNAMIC nk_size_t nk_maxsim_packed_size_bf16(nk_size_t vector_count, nk_size_t depth);
73
+ /** @copydoc nk_maxsim_packed_size_bf16 */
74
+ NK_DYNAMIC nk_size_t nk_maxsim_packed_size_f32(nk_size_t vector_count, nk_size_t depth);
75
+ /** @copydoc nk_maxsim_packed_size_bf16 */
76
+ NK_DYNAMIC nk_size_t nk_maxsim_packed_size_f16(nk_size_t vector_count, nk_size_t depth);
77
+
78
+ /**
79
+ * @brief Packs vectors into a backend-specific layout for maxsim computation.
80
+ * @param[in] vectors The input vectors in row-major order.
81
+ * @param[in] vector_count The number of vectors.
82
+ * @param[in] depth The number of dimensions per vector.
83
+ * @param[in] stride The row stride in bytes for the input vectors.
84
+ * @param[out] packed The output packed buffer, at least as many bytes as the matching nk_maxsim_packed_size_* call returns.
85
+ */
86
+ NK_DYNAMIC void nk_maxsim_pack_bf16(nk_bf16_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t stride,
87
+ void *packed);
88
+ /** @copydoc nk_maxsim_pack_bf16 */
89
+ NK_DYNAMIC void nk_maxsim_pack_f32(nk_f32_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t stride,
90
+ void *packed);
91
+ /** @copydoc nk_maxsim_pack_bf16 */
92
+ NK_DYNAMIC void nk_maxsim_pack_f16(nk_f16_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t stride,
93
+ void *packed);
94
+
95
+ /**
96
+ * @brief Computes angular distance late-interaction on pre-packed vectors.
97
+ * Stores Σᵢ minⱼ angular(qᵢ, dⱼ) in `result`, where angular = 1 - dot / sqrt(||q||² × ||d||²).
98
+ *
99
+ * @param[in] query_packed Packed query vectors (from nk_maxsim_pack_bf16).
100
+ * @param[in] document_packed Packed document vectors (from nk_maxsim_pack_bf16).
101
+ * @param[in] query_count Number of query vectors.
102
+ * @param[in] document_count Number of document vectors.
103
+ * @param[in] depth Number of dimensions per vector.
104
+ * @param[out] result Pointer to store the sum of per-query minimum angular distances.
105
+ */
106
+ NK_DYNAMIC void nk_maxsim_packed_bf16(void const *query_packed, void const *document_packed, nk_size_t query_count,
107
+ nk_size_t document_count, nk_size_t depth, nk_f32_t *result);
108
+ /** @copydoc nk_maxsim_packed_bf16 */
109
+ NK_DYNAMIC void nk_maxsim_packed_f32(void const *query_packed, void const *document_packed, nk_size_t query_count,
110
+ nk_size_t document_count, nk_size_t depth, nk_f64_t *result);
111
+ /** @copydoc nk_maxsim_packed_bf16 */
112
+ NK_DYNAMIC void nk_maxsim_packed_f16(void const *query_packed, void const *document_packed, nk_size_t query_count,
113
+ nk_size_t document_count, nk_size_t depth, nk_f32_t *result);
114
+
115
+ /** @copydoc nk_maxsim_packed_size_bf16 */
116
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_bf16_serial(nk_size_t vector_count, nk_size_t depth);
117
+ /** @copydoc nk_maxsim_packed_size_bf16 */
118
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_f32_serial(nk_size_t vector_count, nk_size_t depth);
119
+ /** @copydoc nk_maxsim_packed_size_bf16 */
120
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_f16_serial(nk_size_t vector_count, nk_size_t depth);
121
+ /** @copydoc nk_maxsim_pack_bf16 */
122
+ NK_PUBLIC void nk_maxsim_pack_bf16_serial(nk_bf16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
123
+ nk_size_t stride, void *packed);
124
+ /** @copydoc nk_maxsim_pack_bf16 */
125
+ NK_PUBLIC void nk_maxsim_pack_f32_serial(nk_f32_t const *vectors, nk_size_t vector_count, nk_size_t depth,
126
+ nk_size_t stride, void *packed);
127
+ /** @copydoc nk_maxsim_pack_bf16 */
128
+ NK_PUBLIC void nk_maxsim_pack_f16_serial(nk_f16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
129
+ nk_size_t stride, void *packed);
130
+ /** @copydoc nk_maxsim_packed_bf16 */
131
+ NK_PUBLIC void nk_maxsim_packed_bf16_serial(void const *query_packed, void const *document_packed,
132
+ nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
133
+ nk_f32_t *result);
134
+ /** @copydoc nk_maxsim_packed_bf16 */
135
+ NK_PUBLIC void nk_maxsim_packed_f32_serial(void const *query_packed, void const *document_packed, nk_size_t query_count,
136
+ nk_size_t document_count, nk_size_t depth, nk_f64_t *result);
137
+ /** @copydoc nk_maxsim_packed_bf16 */
138
+ NK_PUBLIC void nk_maxsim_packed_f16_serial(void const *query_packed, void const *document_packed, nk_size_t query_count,
139
+ nk_size_t document_count, nk_size_t depth, nk_f32_t *result);
140
+
141
+ #if NK_TARGET_ICELAKE
142
+ /** @copydoc nk_maxsim_packed_size_bf16 */
143
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_f32_icelake(nk_size_t vector_count, nk_size_t depth);
144
+ /** @copydoc nk_maxsim_packed_size_bf16 */
145
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_f16_icelake(nk_size_t vector_count, nk_size_t depth);
146
+ /** @copydoc nk_maxsim_pack_bf16 */
147
+ NK_PUBLIC void nk_maxsim_pack_f32_icelake(nk_f32_t const *vectors, nk_size_t vector_count, nk_size_t depth,
148
+ nk_size_t stride, void *packed);
149
+ /** @copydoc nk_maxsim_pack_bf16 */
150
+ NK_PUBLIC void nk_maxsim_pack_f16_icelake(nk_f16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
151
+ nk_size_t stride, void *packed);
152
+ /** @copydoc nk_maxsim_packed_bf16 */
153
+ NK_PUBLIC void nk_maxsim_packed_f32_icelake(void const *query_packed, void const *document_packed,
154
+ nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
155
+ nk_f64_t *result);
156
+ /** @copydoc nk_maxsim_packed_bf16 */
157
+ NK_PUBLIC void nk_maxsim_packed_f16_icelake(void const *query_packed, void const *document_packed,
158
+ nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
159
+ nk_f32_t *result);
160
+ #endif // NK_TARGET_ICELAKE
161
+
162
+ #if NK_TARGET_GENOA
163
+ /** @copydoc nk_maxsim_packed_size_bf16 */
164
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_bf16_genoa(nk_size_t vector_count, nk_size_t depth);
165
+ /** @copydoc nk_maxsim_pack_bf16 */
166
+ NK_PUBLIC void nk_maxsim_pack_bf16_genoa(nk_bf16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
167
+ nk_size_t stride, void *packed);
168
+ /** @copydoc nk_maxsim_packed_bf16 */
169
+ NK_PUBLIC void nk_maxsim_packed_bf16_genoa(void const *query_packed, void const *document_packed, nk_size_t query_count,
170
+ nk_size_t document_count, nk_size_t depth, nk_f32_t *result);
171
+ #endif // NK_TARGET_GENOA
172
+
173
+ #if NK_TARGET_SAPPHIREAMX
174
+ /** @copydoc nk_maxsim_packed_size_bf16 */
175
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_bf16_sapphireamx(nk_size_t vector_count, nk_size_t depth);
176
+ /** @copydoc nk_maxsim_packed_size_bf16 */
177
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_f32_sapphireamx(nk_size_t vector_count, nk_size_t depth);
178
+ /** @copydoc nk_maxsim_packed_size_bf16 */
179
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_f16_sapphireamx(nk_size_t vector_count, nk_size_t depth);
180
+ /** @copydoc nk_maxsim_pack_bf16 */
181
+ NK_PUBLIC void nk_maxsim_pack_bf16_sapphireamx(nk_bf16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
182
+ nk_size_t stride, void *packed);
183
+ /** @copydoc nk_maxsim_pack_bf16 */
184
+ NK_PUBLIC void nk_maxsim_pack_f32_sapphireamx(nk_f32_t const *vectors, nk_size_t vector_count, nk_size_t depth,
185
+ nk_size_t stride, void *packed);
186
+ /** @copydoc nk_maxsim_pack_bf16 */
187
+ NK_PUBLIC void nk_maxsim_pack_f16_sapphireamx(nk_f16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
188
+ nk_size_t stride, void *packed);
189
+ /** @copydoc nk_maxsim_packed_bf16 */
190
+ NK_PUBLIC void nk_maxsim_packed_bf16_sapphireamx(void const *query_packed, void const *document_packed,
191
+ nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
192
+ nk_f32_t *result);
193
+ /** @copydoc nk_maxsim_packed_bf16 */
194
+ NK_PUBLIC void nk_maxsim_packed_f32_sapphireamx(void const *query_packed, void const *document_packed,
195
+ nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
196
+ nk_f64_t *result);
197
+ /** @copydoc nk_maxsim_packed_bf16 */
198
+ NK_PUBLIC void nk_maxsim_packed_f16_sapphireamx(void const *query_packed, void const *document_packed,
199
+ nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
200
+ nk_f32_t *result);
201
+ #endif // NK_TARGET_SAPPHIREAMX
202
+
203
+ #if NK_TARGET_HASWELL
204
+ /** @copydoc nk_maxsim_packed_size_bf16 */
205
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_bf16_haswell(nk_size_t vector_count, nk_size_t depth);
206
+ /** @copydoc nk_maxsim_packed_size_bf16 */
207
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_f32_haswell(nk_size_t vector_count, nk_size_t depth);
208
+ /** @copydoc nk_maxsim_packed_size_bf16 */
209
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_f16_haswell(nk_size_t vector_count, nk_size_t depth);
210
+ /** @copydoc nk_maxsim_pack_bf16 */
211
+ NK_PUBLIC void nk_maxsim_pack_bf16_haswell(nk_bf16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
212
+ nk_size_t stride, void *packed);
213
+ /** @copydoc nk_maxsim_pack_bf16 */
214
+ NK_PUBLIC void nk_maxsim_pack_f32_haswell(nk_f32_t const *vectors, nk_size_t vector_count, nk_size_t depth,
215
+ nk_size_t stride, void *packed);
216
+ /** @copydoc nk_maxsim_pack_bf16 */
217
+ NK_PUBLIC void nk_maxsim_pack_f16_haswell(nk_f16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
218
+ nk_size_t stride, void *packed);
219
+ /** @copydoc nk_maxsim_packed_bf16 */
220
+ NK_PUBLIC void nk_maxsim_packed_bf16_haswell(void const *query_packed, void const *document_packed,
221
+ nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
222
+ nk_f32_t *result);
223
+ /** @copydoc nk_maxsim_packed_bf16 */
224
+ NK_PUBLIC void nk_maxsim_packed_f32_haswell(void const *query_packed, void const *document_packed,
225
+ nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
226
+ nk_f64_t *result);
227
+ /** @copydoc nk_maxsim_packed_bf16 */
228
+ NK_PUBLIC void nk_maxsim_packed_f16_haswell(void const *query_packed, void const *document_packed,
229
+ nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
230
+ nk_f32_t *result);
231
+ #endif // NK_TARGET_HASWELL
232
+
233
+ #if NK_TARGET_ALDER
234
+ /** @copydoc nk_maxsim_packed_size_bf16 */
235
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_bf16_alder(nk_size_t vector_count, nk_size_t depth);
236
+ /** @copydoc nk_maxsim_packed_size_bf16 */
237
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_f32_alder(nk_size_t vector_count, nk_size_t depth);
238
+ /** @copydoc nk_maxsim_packed_size_bf16 */
239
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_f16_alder(nk_size_t vector_count, nk_size_t depth);
240
+ /** @copydoc nk_maxsim_pack_bf16 */
241
+ NK_PUBLIC void nk_maxsim_pack_bf16_alder(nk_bf16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
242
+ nk_size_t stride, void *packed);
243
+ /** @copydoc nk_maxsim_pack_bf16 */
244
+ NK_PUBLIC void nk_maxsim_pack_f32_alder(nk_f32_t const *vectors, nk_size_t vector_count, nk_size_t depth,
245
+ nk_size_t stride, void *packed);
246
+ /** @copydoc nk_maxsim_pack_bf16 */
247
+ NK_PUBLIC void nk_maxsim_pack_f16_alder(nk_f16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
248
+ nk_size_t stride, void *packed);
249
+ /** @copydoc nk_maxsim_packed_bf16 */
250
+ NK_PUBLIC void nk_maxsim_packed_bf16_alder(void const *query_packed, void const *document_packed, nk_size_t query_count,
251
+ nk_size_t document_count, nk_size_t depth, nk_f32_t *result);
252
+ /** @copydoc nk_maxsim_packed_bf16 */
253
+ NK_PUBLIC void nk_maxsim_packed_f32_alder(void const *query_packed, void const *document_packed, nk_size_t query_count,
254
+ nk_size_t document_count, nk_size_t depth, nk_f64_t *result);
255
+ /** @copydoc nk_maxsim_packed_bf16 */
256
+ NK_PUBLIC void nk_maxsim_packed_f16_alder(void const *query_packed, void const *document_packed, nk_size_t query_count,
257
+ nk_size_t document_count, nk_size_t depth, nk_f32_t *result);
258
+ #endif // NK_TARGET_ALDER
259
+
260
+ #if NK_TARGET_V128RELAXED // The `_v128relaxed` MaxSim prototypes below are declared only when this target is enabled.
261
+ /** @copydoc nk_maxsim_packed_size_bf16 @note bf16 variant of the `v128relaxed` backend. */
262
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_bf16_v128relaxed(nk_size_t vector_count, nk_size_t depth);
263
+ /** @copydoc nk_maxsim_packed_size_bf16 @note f32 variant of the `v128relaxed` backend; same parameters as the bf16 variant. */
264
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_f32_v128relaxed(nk_size_t vector_count, nk_size_t depth);
265
+ /** @copydoc nk_maxsim_packed_size_bf16 @note f16 variant of the `v128relaxed` backend; same parameters as the bf16 variant. */
266
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_f16_v128relaxed(nk_size_t vector_count, nk_size_t depth);
267
+ /** @copydoc nk_maxsim_pack_bf16 @note bf16 variant of the `v128relaxed` backend. */
268
+ NK_PUBLIC void nk_maxsim_pack_bf16_v128relaxed(nk_bf16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
269
+ nk_size_t stride, void *packed);
270
+ /** @copydoc nk_maxsim_pack_bf16 @note f32 variant of the `v128relaxed` backend: `vectors` is `nk_f32_t const *`. */
271
+ NK_PUBLIC void nk_maxsim_pack_f32_v128relaxed(nk_f32_t const *vectors, nk_size_t vector_count, nk_size_t depth,
272
+ nk_size_t stride, void *packed);
273
+ /** @copydoc nk_maxsim_pack_bf16 @note f16 variant of the `v128relaxed` backend: `vectors` is `nk_f16_t const *`. */
274
+ NK_PUBLIC void nk_maxsim_pack_f16_v128relaxed(nk_f16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
275
+ nk_size_t stride, void *packed);
276
+ /** @copydoc nk_maxsim_packed_bf16 @note bf16 variant of the `v128relaxed` backend; `result` is `nk_f32_t *`. */
277
+ NK_PUBLIC void nk_maxsim_packed_bf16_v128relaxed(void const *query_packed, void const *document_packed,
278
+ nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
279
+ nk_f32_t *result);
280
+ /** @copydoc nk_maxsim_packed_bf16 @note f32 variant of the `v128relaxed` backend; unlike the bf16 prototype, `result` is `nk_f64_t *`. */
281
+ NK_PUBLIC void nk_maxsim_packed_f32_v128relaxed(void const *query_packed, void const *document_packed,
282
+ nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
283
+ nk_f64_t *result);
284
+ /** @copydoc nk_maxsim_packed_bf16 @note f16 variant of the `v128relaxed` backend; `result` is `nk_f32_t *`. */
285
+ NK_PUBLIC void nk_maxsim_packed_f16_v128relaxed(void const *query_packed, void const *document_packed,
286
+ nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
287
+ nk_f32_t *result);
288
+ #endif // NK_TARGET_V128RELAXED
289
+
290
+ #if NK_TARGET_NEONSDOT // The `_neonsdot` MaxSim prototypes below are declared only when this target is enabled.
291
+ /** @copydoc nk_maxsim_packed_size_bf16 @note bf16 variant of the `neonsdot` backend. */
292
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_bf16_neonsdot(nk_size_t vector_count, nk_size_t depth);
293
+ /** @copydoc nk_maxsim_packed_size_bf16 @note f32 variant of the `neonsdot` backend; same parameters as the bf16 variant. */
294
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_f32_neonsdot(nk_size_t vector_count, nk_size_t depth);
295
+ /** @copydoc nk_maxsim_packed_size_bf16 @note f16 variant of the `neonsdot` backend; same parameters as the bf16 variant. */
296
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_f16_neonsdot(nk_size_t vector_count, nk_size_t depth);
297
+ /** @copydoc nk_maxsim_pack_bf16 @note bf16 variant of the `neonsdot` backend. */
298
+ NK_PUBLIC void nk_maxsim_pack_bf16_neonsdot(nk_bf16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
299
+ nk_size_t stride, void *packed);
300
+ /** @copydoc nk_maxsim_pack_bf16 @note f32 variant of the `neonsdot` backend: `vectors` is `nk_f32_t const *`. */
301
+ NK_PUBLIC void nk_maxsim_pack_f32_neonsdot(nk_f32_t const *vectors, nk_size_t vector_count, nk_size_t depth,
302
+ nk_size_t stride, void *packed);
303
+ /** @copydoc nk_maxsim_pack_bf16 @note f16 variant of the `neonsdot` backend: `vectors` is `nk_f16_t const *`. */
304
+ NK_PUBLIC void nk_maxsim_pack_f16_neonsdot(nk_f16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
305
+ nk_size_t stride, void *packed);
306
+ /** @copydoc nk_maxsim_packed_bf16 @note bf16 variant of the `neonsdot` backend; `result` is `nk_f32_t *`. */
307
+ NK_PUBLIC void nk_maxsim_packed_bf16_neonsdot(void const *query_packed, void const *document_packed,
308
+ nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
309
+ nk_f32_t *result);
310
+ /** @copydoc nk_maxsim_packed_bf16 @note f32 variant of the `neonsdot` backend; unlike the bf16 prototype, `result` is `nk_f64_t *`. */
311
+ NK_PUBLIC void nk_maxsim_packed_f32_neonsdot(void const *query_packed, void const *document_packed,
312
+ nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
313
+ nk_f64_t *result);
314
+ /** @copydoc nk_maxsim_packed_bf16 @note f16 variant of the `neonsdot` backend; `result` is `nk_f32_t *`. */
315
+ NK_PUBLIC void nk_maxsim_packed_f16_neonsdot(void const *query_packed, void const *document_packed,
316
+ nk_size_t query_count, nk_size_t document_count, nk_size_t depth,
317
+ nk_f32_t *result);
318
+ #endif // NK_TARGET_NEONSDOT
319
+
320
+ #if NK_TARGET_SME // The `_sme` MaxSim prototypes below are declared only when this target is enabled.
321
+ /** @copydoc nk_maxsim_packed_size_bf16 @note bf16 variant of the `sme` backend. */
322
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_bf16_sme(nk_size_t vector_count, nk_size_t depth);
323
+ /** @copydoc nk_maxsim_packed_size_bf16 @note f16 variant of the `sme` backend; same parameters as the bf16 variant. */
324
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_f16_sme(nk_size_t vector_count, nk_size_t depth);
325
+ /** @copydoc nk_maxsim_packed_size_bf16 @note f32 variant of the `sme` backend; same parameters as the bf16 variant. */
326
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_f32_sme(nk_size_t vector_count, nk_size_t depth);
327
+ /** @copydoc nk_maxsim_pack_bf16 @note bf16 variant of the `sme` backend. */
328
+ NK_PUBLIC void nk_maxsim_pack_bf16_sme(nk_bf16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
329
+ nk_size_t stride, void *packed);
330
+ /** @copydoc nk_maxsim_pack_bf16 @note f16 variant of the `sme` backend: `vectors` is `nk_f16_t const *`. */
331
+ NK_PUBLIC void nk_maxsim_pack_f16_sme(nk_f16_t const *vectors, nk_size_t vector_count, nk_size_t depth,
332
+ nk_size_t stride, void *packed);
333
+ /** @copydoc nk_maxsim_pack_bf16 @note f32 variant of the `sme` backend: `vectors` is `nk_f32_t const *`. */
334
+ NK_PUBLIC void nk_maxsim_pack_f32_sme(nk_f32_t const *vectors, nk_size_t vector_count, nk_size_t depth,
335
+ nk_size_t stride, void *packed);
336
+ /** @copydoc nk_maxsim_packed_bf16 @note bf16 variant of the `sme` backend; `result` is `nk_f32_t *`. */
337
+ NK_PUBLIC void nk_maxsim_packed_bf16_sme(void const *query_packed, void const *document_packed, nk_size_t query_count,
338
+ nk_size_t document_count, nk_size_t depth, nk_f32_t *result);
339
+ /** @copydoc nk_maxsim_packed_bf16 @note f16 variant of the `sme` backend; `result` is `nk_f32_t *`. */
340
+ NK_PUBLIC void nk_maxsim_packed_f16_sme(void const *query_packed, void const *document_packed, nk_size_t query_count,
341
+ nk_size_t document_count, nk_size_t depth, nk_f32_t *result);
342
+ /** @copydoc nk_maxsim_packed_bf16 @note f32 variant of the `sme` backend; unlike the bf16 prototype, `result` is `nk_f64_t *`. */
343
+ NK_PUBLIC void nk_maxsim_packed_f32_sme(void const *query_packed, void const *document_packed, nk_size_t query_count,
344
+ nk_size_t document_count, nk_size_t depth, nk_f64_t *result);
345
+ #endif // NK_TARGET_SME
346
+
347
+ /**
348
+ * @brief Returns the output dtype for MaxSim late-interaction, given the input dtype.
349
+ * @return `nk_f64_k` for `nk_f32_k`; `nk_f32_k` for `nk_f16_k` or `nk_bf16_k`; `nk_dtype_unknown_k` for any other value. */
350
+ NK_INTERNAL nk_dtype_t nk_maxsim_output_dtype(nk_dtype_t dtype) {
351
+ switch (dtype) {
352
+ case nk_f32_k: return nk_f64_k; // consistent with the `nk_f64_t *result` parameter of nk_maxsim_packed_f32
353
+ case nk_f16_k: return nk_f32_k; // consistent with the `nk_f32_t *result` parameter of nk_maxsim_packed_f16
354
+ case nk_bf16_k: return nk_f32_k; // consistent with the `nk_f32_t *result` parameter of nk_maxsim_packed_bf16
355
+ default: return nk_dtype_unknown_k; // dtype not handled by this switch
356
+ }
357
+ }
358
+
359
+ #if defined(__cplusplus)
360
+ } // extern "C"
361
+ #endif
362
+
363
+ #include "numkong/maxsim/serial.h"
364
+ #include "numkong/maxsim/haswell.h"
365
+ #include "numkong/maxsim/alder.h"
366
+ #include "numkong/maxsim/icelake.h"
367
+ #include "numkong/maxsim/genoa.h"
368
+ #include "numkong/maxsim/sapphireamx.h"
369
+ #include "numkong/maxsim/neonsdot.h"
370
+ #include "numkong/maxsim/sme.h"
371
+ #include "numkong/maxsim/v128relaxed.h"
372
+
373
+ #if defined(__cplusplus)
374
+ extern "C" {
375
+ #endif
376
+
377
+ #if !NK_DYNAMIC_DISPATCH
378
+ /** @brief Compile-time dispatcher, built only when `!NK_DYNAMIC_DISPATCH`: returns the result of the first enabled `nk_maxsim_packed_size_bf16_*` backend below. */
379
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_bf16(nk_size_t vector_count, nk_size_t depth) {
380
+ #if NK_TARGET_SME // Branch order is the selection priority: the first true `NK_TARGET_*` wins.
381
+ return nk_maxsim_packed_size_bf16_sme(vector_count, depth);
382
+ #elif NK_TARGET_SAPPHIREAMX
383
+ return nk_maxsim_packed_size_bf16_sapphireamx(vector_count, depth);
384
+ #elif NK_TARGET_GENOA
385
+ return nk_maxsim_packed_size_bf16_genoa(vector_count, depth);
386
+ #elif NK_TARGET_ALDER
387
+ return nk_maxsim_packed_size_bf16_alder(vector_count, depth);
388
+ #elif NK_TARGET_HASWELL
389
+ return nk_maxsim_packed_size_bf16_haswell(vector_count, depth);
390
+ #elif NK_TARGET_NEONSDOT
391
+ return nk_maxsim_packed_size_bf16_neonsdot(vector_count, depth);
392
+ #elif NK_TARGET_V128RELAXED
393
+ return nk_maxsim_packed_size_bf16_v128relaxed(vector_count, depth);
394
+ #else // None of the targets above is enabled: use the `_serial` implementation.
395
+ return nk_maxsim_packed_size_bf16_serial(vector_count, depth);
396
+ #endif
397
+ }
398
+ /** @brief Compile-time dispatcher, built only when `!NK_DYNAMIC_DISPATCH`: returns the result of the first enabled `nk_maxsim_packed_size_f32_*` backend below. */
399
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_f32(nk_size_t vector_count, nk_size_t depth) {
400
+ #if NK_TARGET_SME // Branch order is the selection priority: the first true `NK_TARGET_*` wins.
401
+ return nk_maxsim_packed_size_f32_sme(vector_count, depth);
402
+ #elif NK_TARGET_SAPPHIREAMX
403
+ return nk_maxsim_packed_size_f32_sapphireamx(vector_count, depth);
404
+ #elif NK_TARGET_ICELAKE
405
+ return nk_maxsim_packed_size_f32_icelake(vector_count, depth);
406
+ #elif NK_TARGET_ALDER
407
+ return nk_maxsim_packed_size_f32_alder(vector_count, depth);
408
+ #elif NK_TARGET_HASWELL
409
+ return nk_maxsim_packed_size_f32_haswell(vector_count, depth);
410
+ #elif NK_TARGET_NEONSDOT
411
+ return nk_maxsim_packed_size_f32_neonsdot(vector_count, depth);
412
+ #elif NK_TARGET_V128RELAXED
413
+ return nk_maxsim_packed_size_f32_v128relaxed(vector_count, depth);
414
+ #else // None of the targets above is enabled: use the `_serial` implementation.
415
+ return nk_maxsim_packed_size_f32_serial(vector_count, depth);
416
+ #endif
417
+ }
418
+ /** @brief Compile-time dispatcher, built only when `!NK_DYNAMIC_DISPATCH`: returns the result of the first enabled `nk_maxsim_packed_size_f16_*` backend below. */
419
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_f16(nk_size_t vector_count, nk_size_t depth) {
420
+ #if NK_TARGET_SME // Branch order is the selection priority: the first true `NK_TARGET_*` wins.
421
+ return nk_maxsim_packed_size_f16_sme(vector_count, depth);
422
+ #elif NK_TARGET_SAPPHIREAMX
423
+ return nk_maxsim_packed_size_f16_sapphireamx(vector_count, depth);
424
+ #elif NK_TARGET_ICELAKE
425
+ return nk_maxsim_packed_size_f16_icelake(vector_count, depth);
426
+ #elif NK_TARGET_ALDER
427
+ return nk_maxsim_packed_size_f16_alder(vector_count, depth);
428
+ #elif NK_TARGET_HASWELL
429
+ return nk_maxsim_packed_size_f16_haswell(vector_count, depth);
430
+ #elif NK_TARGET_NEONSDOT
431
+ return nk_maxsim_packed_size_f16_neonsdot(vector_count, depth);
432
+ #elif NK_TARGET_V128RELAXED
433
+ return nk_maxsim_packed_size_f16_v128relaxed(vector_count, depth);
434
+ #else // None of the targets above is enabled: use the `_serial` implementation.
435
+ return nk_maxsim_packed_size_f16_serial(vector_count, depth);
436
+ #endif
437
+ }
438
+ /** @brief Compile-time dispatcher, built only when `!NK_DYNAMIC_DISPATCH`: forwards all arguments unchanged to the first enabled `nk_maxsim_pack_bf16_*` backend below. */
439
+ NK_PUBLIC void nk_maxsim_pack_bf16(nk_bf16_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t stride,
440
+ void *packed) {
441
+ #if NK_TARGET_SME // Branch order is the selection priority: the first true `NK_TARGET_*` wins.
442
+ nk_maxsim_pack_bf16_sme(vectors, vector_count, depth, stride, packed);
443
+ #elif NK_TARGET_SAPPHIREAMX
444
+ nk_maxsim_pack_bf16_sapphireamx(vectors, vector_count, depth, stride, packed);
445
+ #elif NK_TARGET_GENOA
446
+ nk_maxsim_pack_bf16_genoa(vectors, vector_count, depth, stride, packed);
447
+ #elif NK_TARGET_ALDER
448
+ nk_maxsim_pack_bf16_alder(vectors, vector_count, depth, stride, packed);
449
+ #elif NK_TARGET_HASWELL
450
+ nk_maxsim_pack_bf16_haswell(vectors, vector_count, depth, stride, packed);
451
+ #elif NK_TARGET_NEONSDOT
452
+ nk_maxsim_pack_bf16_neonsdot(vectors, vector_count, depth, stride, packed);
453
+ #elif NK_TARGET_V128RELAXED
454
+ nk_maxsim_pack_bf16_v128relaxed(vectors, vector_count, depth, stride, packed);
455
+ #else // None of the targets above is enabled: use the `_serial` implementation.
456
+ nk_maxsim_pack_bf16_serial(vectors, vector_count, depth, stride, packed);
457
+ #endif
458
+ }
459
+ /** @brief Compile-time dispatcher, built only when `!NK_DYNAMIC_DISPATCH`: forwards all arguments unchanged to the first enabled `nk_maxsim_pack_f32_*` backend below. */
460
+ NK_PUBLIC void nk_maxsim_pack_f32(nk_f32_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t stride,
461
+ void *packed) {
462
+ #if NK_TARGET_SME // Branch order is the selection priority: the first true `NK_TARGET_*` wins.
463
+ nk_maxsim_pack_f32_sme(vectors, vector_count, depth, stride, packed);
464
+ #elif NK_TARGET_SAPPHIREAMX
465
+ nk_maxsim_pack_f32_sapphireamx(vectors, vector_count, depth, stride, packed);
466
+ #elif NK_TARGET_ICELAKE
467
+ nk_maxsim_pack_f32_icelake(vectors, vector_count, depth, stride, packed);
468
+ #elif NK_TARGET_ALDER
469
+ nk_maxsim_pack_f32_alder(vectors, vector_count, depth, stride, packed);
470
+ #elif NK_TARGET_HASWELL
471
+ nk_maxsim_pack_f32_haswell(vectors, vector_count, depth, stride, packed);
472
+ #elif NK_TARGET_NEONSDOT
473
+ nk_maxsim_pack_f32_neonsdot(vectors, vector_count, depth, stride, packed);
474
+ #elif NK_TARGET_V128RELAXED
475
+ nk_maxsim_pack_f32_v128relaxed(vectors, vector_count, depth, stride, packed);
476
+ #else // None of the targets above is enabled: use the `_serial` implementation.
477
+ nk_maxsim_pack_f32_serial(vectors, vector_count, depth, stride, packed);
478
+ #endif
479
+ }
480
+ /** @brief Compile-time dispatcher, built only when `!NK_DYNAMIC_DISPATCH`: forwards all arguments unchanged to the first enabled `nk_maxsim_pack_f16_*` backend below. */
481
+ NK_PUBLIC void nk_maxsim_pack_f16(nk_f16_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t stride,
482
+ void *packed) {
483
+ #if NK_TARGET_SME // Branch order is the selection priority: the first true `NK_TARGET_*` wins.
484
+ nk_maxsim_pack_f16_sme(vectors, vector_count, depth, stride, packed);
485
+ #elif NK_TARGET_SAPPHIREAMX
486
+ nk_maxsim_pack_f16_sapphireamx(vectors, vector_count, depth, stride, packed);
487
+ #elif NK_TARGET_ICELAKE
488
+ nk_maxsim_pack_f16_icelake(vectors, vector_count, depth, stride, packed);
489
+ #elif NK_TARGET_ALDER
490
+ nk_maxsim_pack_f16_alder(vectors, vector_count, depth, stride, packed);
491
+ #elif NK_TARGET_HASWELL
492
+ nk_maxsim_pack_f16_haswell(vectors, vector_count, depth, stride, packed);
493
+ #elif NK_TARGET_NEONSDOT
494
+ nk_maxsim_pack_f16_neonsdot(vectors, vector_count, depth, stride, packed);
495
+ #elif NK_TARGET_V128RELAXED
496
+ nk_maxsim_pack_f16_v128relaxed(vectors, vector_count, depth, stride, packed);
497
+ #else // None of the targets above is enabled: use the `_serial` implementation.
498
+ nk_maxsim_pack_f16_serial(vectors, vector_count, depth, stride, packed);
499
+ #endif
500
+ }
501
+ /** @brief Compile-time dispatcher, built only when `!NK_DYNAMIC_DISPATCH`: forwards all arguments unchanged to the first enabled `nk_maxsim_packed_bf16_*` backend below. */
502
+ NK_PUBLIC void nk_maxsim_packed_bf16(void const *query_packed, void const *document_packed, nk_size_t query_count,
503
+ nk_size_t document_count, nk_size_t depth, nk_f32_t *result) {
504
+ #if NK_TARGET_SME // Branch order is the selection priority: the first true `NK_TARGET_*` wins.
505
+ nk_maxsim_packed_bf16_sme(query_packed, document_packed, query_count, document_count, depth, result);
506
+ #elif NK_TARGET_SAPPHIREAMX
507
+ nk_maxsim_packed_bf16_sapphireamx(query_packed, document_packed, query_count, document_count, depth, result);
508
+ #elif NK_TARGET_GENOA
509
+ nk_maxsim_packed_bf16_genoa(query_packed, document_packed, query_count, document_count, depth, result);
510
+ #elif NK_TARGET_ALDER
511
+ nk_maxsim_packed_bf16_alder(query_packed, document_packed, query_count, document_count, depth, result);
512
+ #elif NK_TARGET_HASWELL
513
+ nk_maxsim_packed_bf16_haswell(query_packed, document_packed, query_count, document_count, depth, result);
514
+ #elif NK_TARGET_NEONSDOT
515
+ nk_maxsim_packed_bf16_neonsdot(query_packed, document_packed, query_count, document_count, depth, result);
516
+ #elif NK_TARGET_V128RELAXED
517
+ nk_maxsim_packed_bf16_v128relaxed(query_packed, document_packed, query_count, document_count, depth, result);
518
+ #else // None of the targets above is enabled: use the `_serial` implementation.
519
+ nk_maxsim_packed_bf16_serial(query_packed, document_packed, query_count, document_count, depth, result);
520
+ #endif
521
+ }
522
+ /** @brief Compile-time dispatcher, built only when `!NK_DYNAMIC_DISPATCH`: forwards all arguments unchanged to the first enabled `nk_maxsim_packed_f32_*` backend below; `result` is `nk_f64_t *` here. */
523
+ NK_PUBLIC void nk_maxsim_packed_f32(void const *query_packed, void const *document_packed, nk_size_t query_count,
524
+ nk_size_t document_count, nk_size_t depth, nk_f64_t *result) {
525
+ #if NK_TARGET_SME // Branch order is the selection priority: the first true `NK_TARGET_*` wins.
526
+ nk_maxsim_packed_f32_sme(query_packed, document_packed, query_count, document_count, depth, result);
527
+ #elif NK_TARGET_SAPPHIREAMX
528
+ nk_maxsim_packed_f32_sapphireamx(query_packed, document_packed, query_count, document_count, depth, result);
529
+ #elif NK_TARGET_ICELAKE
530
+ nk_maxsim_packed_f32_icelake(query_packed, document_packed, query_count, document_count, depth, result);
531
+ #elif NK_TARGET_ALDER
532
+ nk_maxsim_packed_f32_alder(query_packed, document_packed, query_count, document_count, depth, result);
533
+ #elif NK_TARGET_HASWELL
534
+ nk_maxsim_packed_f32_haswell(query_packed, document_packed, query_count, document_count, depth, result);
535
+ #elif NK_TARGET_NEONSDOT
536
+ nk_maxsim_packed_f32_neonsdot(query_packed, document_packed, query_count, document_count, depth, result);
537
+ #elif NK_TARGET_V128RELAXED
538
+ nk_maxsim_packed_f32_v128relaxed(query_packed, document_packed, query_count, document_count, depth, result);
539
+ #else // None of the targets above is enabled: use the `_serial` implementation.
540
+ nk_maxsim_packed_f32_serial(query_packed, document_packed, query_count, document_count, depth, result);
541
+ #endif
542
+ }
543
+ /** @brief Compile-time dispatcher, built only when `!NK_DYNAMIC_DISPATCH`: forwards all arguments unchanged to the first enabled `nk_maxsim_packed_f16_*` backend below. */
544
+ NK_PUBLIC void nk_maxsim_packed_f16(void const *query_packed, void const *document_packed, nk_size_t query_count,
545
+ nk_size_t document_count, nk_size_t depth, nk_f32_t *result) {
546
+ #if NK_TARGET_SME // Branch order is the selection priority: the first true `NK_TARGET_*` wins.
547
+ nk_maxsim_packed_f16_sme(query_packed, document_packed, query_count, document_count, depth, result);
548
+ #elif NK_TARGET_SAPPHIREAMX
549
+ nk_maxsim_packed_f16_sapphireamx(query_packed, document_packed, query_count, document_count, depth, result);
550
+ #elif NK_TARGET_ICELAKE
551
+ nk_maxsim_packed_f16_icelake(query_packed, document_packed, query_count, document_count, depth, result);
552
+ #elif NK_TARGET_ALDER
553
+ nk_maxsim_packed_f16_alder(query_packed, document_packed, query_count, document_count, depth, result);
554
+ #elif NK_TARGET_HASWELL
555
+ nk_maxsim_packed_f16_haswell(query_packed, document_packed, query_count, document_count, depth, result);
556
+ #elif NK_TARGET_NEONSDOT
557
+ nk_maxsim_packed_f16_neonsdot(query_packed, document_packed, query_count, document_count, depth, result);
558
+ #elif NK_TARGET_V128RELAXED
559
+ nk_maxsim_packed_f16_v128relaxed(query_packed, document_packed, query_count, document_count, depth, result);
560
+ #else // None of the targets above is enabled: use the `_serial` implementation.
561
+ nk_maxsim_packed_f16_serial(query_packed, document_packed, query_count, document_count, depth, result);
562
+ #endif
563
+ }
564
+
565
+ #endif // !NK_DYNAMIC_DISPATCH
566
+
567
+ #if defined(__cplusplus)
568
+ } // extern "C"
569
+ #endif
570
+
571
+ #endif // NK_MAXSIM_H