numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294)
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,394 @@
1
+ /**
2
+ * @brief SIMD-accelerated MaxSim (angular distance late-interaction) for ARM NEONSDOT.
3
+ * @file include/numkong/maxsim/neonsdot.h
4
+ * @author Ash Vardanian
5
+ * @date February 28, 2026
6
+ *
7
+ * @sa include/numkong/maxsim.h
8
+ *
9
+ * Uses ARM SDOT (vdotq_s32) for coarse i8 screening — signed×signed natively, no bias correction.
10
+ * 4x4 register tiling: 4 queries x 4 documents = 16 int32x4_t accumulators per depth loop.
11
+ * Depth steps at 16 bytes (128-bit NEON = 16 i8 lanes).
12
+ */
13
+ #ifndef NK_MAXSIM_NEONSDOT_H
14
+ #define NK_MAXSIM_NEONSDOT_H
15
+
16
+ #if NK_TARGET_ARM_
17
+ #if NK_TARGET_NEONSDOT
18
+
19
+ #include "numkong/types.h"
20
+ #include "numkong/maxsim/serial.h" // `nk_maxsim_packed_header_t`
21
+ #include "numkong/dot.h" // `nk_dot_bf16`, `nk_dot_f32`, `nk_dot_f16`
22
+ #include "numkong/cast/neon.h" // `nk_f16_to_f32_neon`
23
+ #include "numkong/spatial/neon.h" // `nk_f32_sqrt_neon`
24
+
25
+ #if defined(__cplusplus)
26
+ extern "C" {
27
+ #endif
28
+
29
+ #if defined(__clang__)
30
+ #pragma clang attribute push(__attribute__((target("dotprod"))), apply_to = function)
31
+ #elif defined(__GNUC__)
32
+ #pragma GCC push_options
33
+ #pragma GCC target("+dotprod")
34
+ #endif
35
+
36
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_bf16_neonsdot(nk_size_t vector_count, nk_size_t depth) { // Packed-buffer size for `vector_count` bf16 vectors of `depth` elements.
37
+ return nk_maxsim_packed_size_(vector_count, depth, sizeof(nk_bf16_t), 16); // 16 is presumably the i8 depth padding granule (the kernel steps 16 bytes) — TODO confirm
38
+ }
39
+
40
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_f32_neonsdot(nk_size_t vector_count, nk_size_t depth) { // Packed-buffer size for `vector_count` f32 vectors of `depth` elements.
41
+ return nk_maxsim_packed_size_(vector_count, depth, sizeof(nk_f32_t), 16); // 16 is presumably the i8 depth padding granule (the kernel steps 16 bytes) — TODO confirm
42
+ }
43
+
44
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_f16_neonsdot(nk_size_t vector_count, nk_size_t depth) { // Packed-buffer size for `vector_count` f16 vectors of `depth` elements.
45
+ return nk_maxsim_packed_size_(vector_count, depth, sizeof(nk_f16_t), 16); // 16 is presumably the i8 depth padding granule (the kernel steps 16 bytes) — TODO confirm
46
+ }
47
+
48
+ NK_PUBLIC void nk_maxsim_pack_bf16_neonsdot( //
49
+ nk_bf16_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t stride, void *packed) {
50
+ // Packs `vector_count` bf16 rows into `packed`: a header, i8-quantized rows, per-vector metadata, and zero-padded copies of the originals.
51
+ nk_size_t const element_bytes = sizeof(nk_bf16_t);
52
+ nk_size_t depth_i8_padded = nk_maxsim_packed_header_setup_(packed, vector_count, depth, 16, element_bytes); // sets up the header read below; 16 is presumably the i8 padding granule — TODO confirm
53
+ // Locate the three payload regions through the byte offsets stored in the header.
54
+ nk_maxsim_packed_header_t const *header = (nk_maxsim_packed_header_t const *)packed;
55
+ nk_i8_t *quantized_i8 = (nk_i8_t *)((char *)packed + header->offset_i8_data);
56
+ nk_maxsim_vector_metadata_t *metadata = (nk_maxsim_vector_metadata_t *)((char *)packed + header->offset_metadata);
57
+ char *originals = (char *)packed + header->offset_original_data;
58
+ nk_size_t const original_stride = header->original_stride_bytes;
59
+ // Per vector: quantize, store the inverse norm, then copy the original row.
60
+ for (nk_size_t vector_index = 0; vector_index < vector_count; vector_index++) {
61
+ char const *source_row = (char const *)vectors + vector_index * stride; // `stride` is applied in bytes
62
+ nk_f32_t norm_sq;
63
+ nk_maxsim_quantize_vector_(source_row, element_bytes, depth, depth_i8_padded, 127.0f,
64
+ (nk_maxsim_to_f32_t)nk_bf16_to_f32_serial,
65
+ &quantized_i8[vector_index * depth_i8_padded], &metadata[vector_index], &norm_sq);
66
+ metadata[vector_index].inverse_norm_f32 = norm_sq > 0.0f ? (1.0f / nk_f32_sqrt_neon(norm_sq)) : 0.0f; // rows whose squared norm is not positive get 0 instead of a division by zero
67
+ char *destination_original = originals + vector_index * original_stride;
68
+ nk_copy_bytes_(destination_original, source_row, depth * element_bytes);
69
+ for (nk_size_t byte_index = depth * element_bytes; byte_index < original_stride; byte_index++) // zero the padding between the copied row and the row stride
70
+ destination_original[byte_index] = 0;
71
+ }
72
+ }
73
+
74
+ NK_PUBLIC void nk_maxsim_pack_f32_neonsdot( //
75
+ nk_f32_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t stride, void *packed) {
76
+ // Packs `vector_count` f32 rows into `packed`: a header, i8-quantized rows, per-vector metadata, and zero-padded copies of the originals.
77
+ nk_size_t const element_bytes = sizeof(nk_f32_t);
78
+ nk_size_t depth_i8_padded = nk_maxsim_packed_header_setup_(packed, vector_count, depth, 16, element_bytes); // sets up the header read below; 16 is presumably the i8 padding granule — TODO confirm
79
+ // Locate the three payload regions through the byte offsets stored in the header.
80
+ nk_maxsim_packed_header_t const *header = (nk_maxsim_packed_header_t const *)packed;
81
+ nk_i8_t *quantized_i8 = (nk_i8_t *)((char *)packed + header->offset_i8_data);
82
+ nk_maxsim_vector_metadata_t *metadata = (nk_maxsim_vector_metadata_t *)((char *)packed + header->offset_metadata);
83
+ char *originals = (char *)packed + header->offset_original_data;
84
+ nk_size_t const original_stride = header->original_stride_bytes;
85
+ // Per vector: quantize, store the inverse norm, then copy the original row.
86
+ for (nk_size_t vector_index = 0; vector_index < vector_count; vector_index++) {
87
+ char const *source_row = (char const *)vectors + vector_index * stride; // `stride` is applied in bytes
88
+ nk_f32_t norm_sq;
89
+ nk_maxsim_quantize_vector_(source_row, element_bytes, depth, depth_i8_padded, 127.0f, nk_f32_to_f32_,
90
+ &quantized_i8[vector_index * depth_i8_padded], &metadata[vector_index], &norm_sq);
91
+ metadata[vector_index].inverse_norm_f32 = norm_sq > 0.0f ? (1.0f / nk_f32_sqrt_neon(norm_sq)) : 0.0f; // rows whose squared norm is not positive get 0 instead of a division by zero
92
+ char *destination_original = originals + vector_index * original_stride;
93
+ nk_copy_bytes_(destination_original, source_row, depth * element_bytes);
94
+ for (nk_size_t byte_index = depth * element_bytes; byte_index < original_stride; byte_index++) // zero the padding between the copied row and the row stride
95
+ destination_original[byte_index] = 0;
96
+ }
97
+ }
98
+
99
+ NK_PUBLIC void nk_maxsim_pack_f16_neonsdot( //
100
+ nk_f16_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t stride, void *packed) {
101
+ // Packs `vector_count` f16 rows into `packed`: a header, i8-quantized rows, per-vector metadata, and zero-padded copies of the originals.
102
+ nk_size_t const element_bytes = sizeof(nk_f16_t);
103
+ nk_size_t depth_i8_padded = nk_maxsim_packed_header_setup_(packed, vector_count, depth, 16, element_bytes); // sets up the header read below; 16 is presumably the i8 padding granule — TODO confirm
104
+ // Locate the three payload regions through the byte offsets stored in the header.
105
+ nk_maxsim_packed_header_t const *header = (nk_maxsim_packed_header_t const *)packed;
106
+ nk_i8_t *quantized_i8 = (nk_i8_t *)((char *)packed + header->offset_i8_data);
107
+ nk_maxsim_vector_metadata_t *metadata = (nk_maxsim_vector_metadata_t *)((char *)packed + header->offset_metadata);
108
+ char *originals = (char *)packed + header->offset_original_data;
109
+ nk_size_t const original_stride = header->original_stride_bytes;
110
+ // Per vector: quantize, store the inverse norm, then copy the original row.
111
+ for (nk_size_t vector_index = 0; vector_index < vector_count; vector_index++) {
112
+ char const *source_row = (char const *)vectors + vector_index * stride; // `stride` is applied in bytes
113
+ nk_f32_t norm_sq;
114
+ nk_maxsim_quantize_vector_(source_row, element_bytes, depth, depth_i8_padded, 127.0f,
115
+ (nk_maxsim_to_f32_t)nk_f16_to_f32_neon,
116
+ &quantized_i8[vector_index * depth_i8_padded], &metadata[vector_index], &norm_sq);
117
+ metadata[vector_index].inverse_norm_f32 = norm_sq > 0.0f ? (1.0f / nk_f32_sqrt_neon(norm_sq)) : 0.0f; // rows whose squared norm is not positive get 0 instead of a division by zero
118
+ char *destination_original = originals + vector_index * original_stride;
119
+ nk_copy_bytes_(destination_original, source_row, depth * element_bytes);
120
+ for (nk_size_t byte_index = depth * element_bytes; byte_index < original_stride; byte_index++) // zero the padding between the copied row and the row stride
121
+ destination_original[byte_index] = 0;
122
+ }
123
+ }
124
+
125
+ /**
126
+ * @brief Factored coarse i8 argmax kernel for NEONSDOT.
127
+ * Uses vdotq_s32 (signed×signed) — no XOR bias, no metadata parameter.
128
+ * 4Q×4D register tiling with 16 int32x4_t accumulators.
+ *
+ * For every query row, writes the index of the document row with the largest i8 dot product.
+ * Ties keep the lowest document index: updates use a strict `>` and documents are scanned in ascending order.
+ *
+ * @param query_i8 Quantized queries, `query_count` rows of `depth_i8_padded` bytes each.
+ * @param document_i8 Quantized documents, `document_count` rows of `depth_i8_padded` bytes each.
+ * @param query_count Number of query rows; groups of four take the tiled path, the remainder the 1Q×1D tail.
+ * @param document_count Number of document rows; when it is 0 every output index is left at 0.
+ * @param depth_i8_padded Row length in bytes; read 16 bytes at a time, so presumably a multiple of 16 — TODO confirm.
+ * @param best_document_indices Output array receiving one document index per query.
129
+ */
130
+ NK_INTERNAL void nk_maxsim_coarse_argmax_neonsdot_( //
131
+ nk_i8_t const *query_i8, nk_i8_t const *document_i8, nk_size_t query_count, nk_size_t document_count, //
132
+ nk_size_t depth_i8_padded, nk_u32_t *best_document_indices) {
133
+
134
+ // Primary path: 4-query grouping
135
+ nk_size_t query_block_start_index = 0;
136
+ for (; query_block_start_index + 4 <= query_count; query_block_start_index += 4) {
137
+ nk_i32_t running_max_i32[4] = {NK_I32_MIN, NK_I32_MIN, NK_I32_MIN, NK_I32_MIN};
138
+ nk_u32_t running_argmax_u32[4] = {0, 0, 0, 0}; // remains 0 if no document beats NK_I32_MIN (e.g. no documents at all)
139
+
140
+ // 4Q×4D document blocking
141
+ nk_size_t document_block_start_index = 0;
142
+ for (; document_block_start_index + 4 <= document_count; document_block_start_index += 4) {
143
+ // 16 accumulators: [query_idx][doc_idx]
144
+ int32x4_t accumulator_tiles_i32x4[4][4];
145
+ for (nk_size_t query_tile_index = 0; query_tile_index < 4; query_tile_index++)
146
+ for (nk_size_t document_tile_index = 0; document_tile_index < 4; document_tile_index++)
147
+ accumulator_tiles_i32x4[query_tile_index][document_tile_index] = vdupq_n_s32(0);
148
+
149
+ // Depth loop: 16 bytes per step
150
+ for (nk_size_t depth_index = 0; depth_index < depth_i8_padded; depth_index += 16) {
151
+ int8x16_t query_i8x16_0 = vld1q_s8(
152
+ (int8_t const *)(query_i8 + (query_block_start_index + 0) * depth_i8_padded + depth_index));
153
+ int8x16_t query_i8x16_1 = vld1q_s8(
154
+ (int8_t const *)(query_i8 + (query_block_start_index + 1) * depth_i8_padded + depth_index));
155
+ int8x16_t query_i8x16_2 = vld1q_s8(
156
+ (int8_t const *)(query_i8 + (query_block_start_index + 2) * depth_i8_padded + depth_index));
157
+ int8x16_t query_i8x16_3 = vld1q_s8(
158
+ (int8_t const *)(query_i8 + (query_block_start_index + 3) * depth_i8_padded + depth_index));
159
+ // The four query vectors stay loaded; each document vector below is loaded once and used against all four.
160
+ int8x16_t document_i8x16;
161
+
162
+ document_i8x16 = vld1q_s8(
163
+ (int8_t const *)(document_i8 + (document_block_start_index + 0) * depth_i8_padded + depth_index));
164
+ accumulator_tiles_i32x4[0][0] = vdotq_s32(accumulator_tiles_i32x4[0][0], query_i8x16_0, document_i8x16);
165
+ accumulator_tiles_i32x4[1][0] = vdotq_s32(accumulator_tiles_i32x4[1][0], query_i8x16_1, document_i8x16);
166
+ accumulator_tiles_i32x4[2][0] = vdotq_s32(accumulator_tiles_i32x4[2][0], query_i8x16_2, document_i8x16);
167
+ accumulator_tiles_i32x4[3][0] = vdotq_s32(accumulator_tiles_i32x4[3][0], query_i8x16_3, document_i8x16);
168
+
169
+ document_i8x16 = vld1q_s8(
170
+ (int8_t const *)(document_i8 + (document_block_start_index + 1) * depth_i8_padded + depth_index));
171
+ accumulator_tiles_i32x4[0][1] = vdotq_s32(accumulator_tiles_i32x4[0][1], query_i8x16_0, document_i8x16);
172
+ accumulator_tiles_i32x4[1][1] = vdotq_s32(accumulator_tiles_i32x4[1][1], query_i8x16_1, document_i8x16);
173
+ accumulator_tiles_i32x4[2][1] = vdotq_s32(accumulator_tiles_i32x4[2][1], query_i8x16_2, document_i8x16);
174
+ accumulator_tiles_i32x4[3][1] = vdotq_s32(accumulator_tiles_i32x4[3][1], query_i8x16_3, document_i8x16);
175
+
176
+ document_i8x16 = vld1q_s8(
177
+ (int8_t const *)(document_i8 + (document_block_start_index + 2) * depth_i8_padded + depth_index));
178
+ accumulator_tiles_i32x4[0][2] = vdotq_s32(accumulator_tiles_i32x4[0][2], query_i8x16_0, document_i8x16);
179
+ accumulator_tiles_i32x4[1][2] = vdotq_s32(accumulator_tiles_i32x4[1][2], query_i8x16_1, document_i8x16);
180
+ accumulator_tiles_i32x4[2][2] = vdotq_s32(accumulator_tiles_i32x4[2][2], query_i8x16_2, document_i8x16);
181
+ accumulator_tiles_i32x4[3][2] = vdotq_s32(accumulator_tiles_i32x4[3][2], query_i8x16_3, document_i8x16);
182
+
183
+ document_i8x16 = vld1q_s8(
184
+ (int8_t const *)(document_i8 + (document_block_start_index + 3) * depth_i8_padded + depth_index));
185
+ accumulator_tiles_i32x4[0][3] = vdotq_s32(accumulator_tiles_i32x4[0][3], query_i8x16_0, document_i8x16);
186
+ accumulator_tiles_i32x4[1][3] = vdotq_s32(accumulator_tiles_i32x4[1][3], query_i8x16_1, document_i8x16);
187
+ accumulator_tiles_i32x4[2][3] = vdotq_s32(accumulator_tiles_i32x4[2][3], query_i8x16_2, document_i8x16);
188
+ accumulator_tiles_i32x4[3][3] = vdotq_s32(accumulator_tiles_i32x4[3][3], query_i8x16_3, document_i8x16);
189
+ }
190
+
191
+ // Reduce and update argmax for each of 4 queries × 4 documents
192
+ for (nk_size_t query_tile_index = 0; query_tile_index < 4; query_tile_index++) {
193
+ for (nk_size_t document_tile_index = 0; document_tile_index < 4; document_tile_index++) {
194
+ nk_i32_t dot = vaddvq_s32(accumulator_tiles_i32x4[query_tile_index][document_tile_index]); // sum the four i32 lanes into one scalar
195
+ if (dot > running_max_i32[query_tile_index]) { // strict '>': an equal score never displaces an earlier document
196
+ running_max_i32[query_tile_index] = dot;
197
+ running_argmax_u32[query_tile_index] = (nk_u32_t)(document_block_start_index +
198
+ document_tile_index);
199
+ }
200
+ }
201
+ }
202
+ }
203
+
204
+ // Document tail: 4Q×1D
205
+ for (nk_size_t document_index = document_block_start_index; document_index < document_count; document_index++) {
206
+ nk_i8_t const *document_i8_row = document_i8 + document_index * depth_i8_padded;
207
+
208
+ int32x4_t accumulator_i32x4_0 = vdupq_n_s32(0);
209
+ int32x4_t accumulator_i32x4_1 = vdupq_n_s32(0);
210
+ int32x4_t accumulator_i32x4_2 = vdupq_n_s32(0);
211
+ int32x4_t accumulator_i32x4_3 = vdupq_n_s32(0);
212
+
213
+ for (nk_size_t depth_index = 0; depth_index < depth_i8_padded; depth_index += 16) {
214
+ int8x16_t document_i8x16 = vld1q_s8((int8_t const *)(document_i8_row + depth_index));
215
+
216
+ accumulator_i32x4_0 = vdotq_s32(
217
+ accumulator_i32x4_0,
218
+ vld1q_s8(
219
+ (int8_t const *)(query_i8 + (query_block_start_index + 0) * depth_i8_padded + depth_index)),
220
+ document_i8x16);
221
+ accumulator_i32x4_1 = vdotq_s32(
222
+ accumulator_i32x4_1,
223
+ vld1q_s8(
224
+ (int8_t const *)(query_i8 + (query_block_start_index + 1) * depth_i8_padded + depth_index)),
225
+ document_i8x16);
226
+ accumulator_i32x4_2 = vdotq_s32(
227
+ accumulator_i32x4_2,
228
+ vld1q_s8(
229
+ (int8_t const *)(query_i8 + (query_block_start_index + 2) * depth_i8_padded + depth_index)),
230
+ document_i8x16);
231
+ accumulator_i32x4_3 = vdotq_s32(
232
+ accumulator_i32x4_3,
233
+ vld1q_s8(
234
+ (int8_t const *)(query_i8 + (query_block_start_index + 3) * depth_i8_padded + depth_index)),
235
+ document_i8x16);
236
+ }
237
+
238
+ nk_i32_t dots[4] = {vaddvq_s32(accumulator_i32x4_0), vaddvq_s32(accumulator_i32x4_1),
239
+ vaddvq_s32(accumulator_i32x4_2), vaddvq_s32(accumulator_i32x4_3)};
240
+ for (nk_size_t query_tile_index = 0; query_tile_index < 4; query_tile_index++) {
241
+ if (dots[query_tile_index] > running_max_i32[query_tile_index]) {
242
+ running_max_i32[query_tile_index] = dots[query_tile_index];
243
+ running_argmax_u32[query_tile_index] = (nk_u32_t)document_index;
244
+ }
245
+ }
246
+ }
247
+ // Store the winning document index for each of the four queries in this group.
248
+ for (nk_size_t query_tile_index = 0; query_tile_index < 4; query_tile_index++)
249
+ best_document_indices[query_block_start_index + query_tile_index] = running_argmax_u32[query_tile_index];
250
+ }
251
+
252
+ // Query tail: 1Q×1D
253
+ for (nk_size_t query_index = query_block_start_index; query_index < query_count; query_index++) {
254
+ nk_i8_t const *query_i8_row = query_i8 + query_index * depth_i8_padded;
255
+ nk_i32_t running_max_i32 = NK_I32_MIN;
256
+ nk_u32_t running_argmax_u32 = 0;
257
+
258
+ for (nk_size_t document_index = 0; document_index < document_count; document_index++) {
259
+ nk_i8_t const *document_i8_row = document_i8 + document_index * depth_i8_padded;
260
+ int32x4_t accumulator_i32x4 = vdupq_n_s32(0);
261
+
262
+ for (nk_size_t depth_index = 0; depth_index < depth_i8_padded; depth_index += 16) {
263
+ int8x16_t query_i8x16 = vld1q_s8((int8_t const *)(query_i8_row + depth_index));
264
+ int8x16_t document_i8x16 = vld1q_s8((int8_t const *)(document_i8_row + depth_index));
265
+ accumulator_i32x4 = vdotq_s32(accumulator_i32x4, query_i8x16, document_i8x16);
266
+ }
267
+
268
+ nk_i32_t coarse_dot_i32 = vaddvq_s32(accumulator_i32x4);
269
+ if (coarse_dot_i32 > running_max_i32) {
270
+ running_max_i32 = coarse_dot_i32;
271
+ running_argmax_u32 = (nk_u32_t)document_index;
272
+ }
273
+ }
274
+
275
+ best_document_indices[query_index] = running_argmax_u32;
276
+ }
277
+ }
278
+
279
+ NK_PUBLIC void nk_maxsim_packed_bf16_neonsdot( //
280
+ void const *query_packed, void const *document_packed, nk_size_t query_count, nk_size_t document_count,
281
+ nk_size_t depth, nk_f32_t *result) {
282
+ // Writes to `*result` the sum over all queries of the clamped angular distance (1 - cosine) to the document chosen by the coarse i8 argmax.
283
+ nk_maxsim_packed_regions_t regions = nk_maxsim_extract_packed_regions_(query_packed, document_packed);
284
+ nk_f64_t total_angular_distance = 0.0;
285
+
286
+ for (nk_size_t chunk_start = 0; chunk_start < query_count; chunk_start += 256) {
287
+ nk_size_t chunk_size = query_count - chunk_start < 256 ? query_count - chunk_start : 256;
288
+ nk_u32_t best_document_indices[256]; // fixed-size scratch; this is why queries are processed in chunks of at most 256
289
+ // Coarse pass: i8 argmax over all documents for this chunk of queries.
290
+ nk_maxsim_coarse_argmax_neonsdot_(regions.query_quantized + chunk_start * regions.depth_i8_padded,
291
+ regions.document_quantized, chunk_size, document_count,
292
+ regions.depth_i8_padded, best_document_indices);
293
+ // Refinement: recompute the dot product from the original bf16 rows, only for the chosen document.
294
+ for (nk_size_t query_index = 0; query_index < chunk_size; query_index++) {
295
+ nk_u32_t best_document_index = best_document_indices[query_index]; // NOTE(review): with document_count == 0 this is 0 and document row 0 is read below — confirm callers never pass zero documents
296
+ nk_f32_t dot_result;
297
+ nk_dot_bf16((nk_bf16_t const *)(regions.query_originals +
298
+ (chunk_start + query_index) * regions.query_original_stride),
299
+ (nk_bf16_t const *)(regions.document_originals +
300
+ best_document_index * regions.document_original_stride),
301
+ depth, &dot_result);
302
+ nk_f32_t cosine = dot_result * regions.query_metadata[chunk_start + query_index].inverse_norm_f32 *
303
+ regions.document_metadata[best_document_index].inverse_norm_f32;
304
+ nk_f32_t angular = 1.0f - cosine;
305
+ if (angular < 0.0f) angular = 0.0f; // a cosine above 1 would give a negative distance; clamp it to 0
306
+ total_angular_distance += (nk_f64_t)angular;
307
+ }
308
+ }
309
+
310
+ *result = (nk_f32_t)total_angular_distance; // accumulated in f64, narrowed to f32 on output
311
+ }
312
+
313
+ NK_PUBLIC void nk_maxsim_packed_f32_neonsdot( //
314
+ void const *query_packed, void const *document_packed, nk_size_t query_count, nk_size_t document_count,
315
+ nk_size_t depth, nk_f64_t *result) {
316
+
317
+ nk_maxsim_packed_regions_t regions = nk_maxsim_extract_packed_regions_(query_packed, document_packed);
318
+ nk_f64_t total_angular_distance = 0.0;
319
+
320
+ for (nk_size_t chunk_start = 0; chunk_start < query_count; chunk_start += 256) {
321
+ nk_size_t chunk_size = query_count - chunk_start < 256 ? query_count - chunk_start : 256;
322
+ nk_u32_t best_document_indices[256];
323
+
324
+ nk_maxsim_coarse_argmax_neonsdot_(regions.query_quantized + chunk_start * regions.depth_i8_padded,
325
+ regions.document_quantized, chunk_size, document_count,
326
+ regions.depth_i8_padded, best_document_indices);
327
+
328
+ for (nk_size_t query_index = 0; query_index < chunk_size; query_index++) {
329
+ nk_u32_t best_document_index = best_document_indices[query_index];
330
+ nk_f64_t dot_result;
331
+ nk_dot_f32(
332
+ (nk_f32_t const *)(regions.query_originals +
333
+ (chunk_start + query_index) * regions.query_original_stride),
334
+ (nk_f32_t const *)(regions.document_originals + best_document_index * regions.document_original_stride),
335
+ depth, &dot_result);
336
+ nk_f64_t cosine = dot_result *
337
+ (nk_f64_t)regions.query_metadata[chunk_start + query_index].inverse_norm_f32 *
338
+ (nk_f64_t)regions.document_metadata[best_document_index].inverse_norm_f32;
339
+ nk_f64_t angular = 1.0 - cosine;
340
+ if (angular < 0.0) angular = 0.0;
341
+ total_angular_distance += angular;
342
+ }
343
+ }
344
+
345
+ *result = total_angular_distance;
346
+ }
347
+
348
+ NK_PUBLIC void nk_maxsim_packed_f16_neonsdot( //
349
+ void const *query_packed, void const *document_packed, nk_size_t query_count, nk_size_t document_count,
350
+ nk_size_t depth, nk_f32_t *result) {
351
+
352
+ nk_maxsim_packed_regions_t regions = nk_maxsim_extract_packed_regions_(query_packed, document_packed);
353
+ nk_f64_t total_angular_distance = 0.0;
354
+
355
+ for (nk_size_t chunk_start = 0; chunk_start < query_count; chunk_start += 256) {
356
+ nk_size_t chunk_size = query_count - chunk_start < 256 ? query_count - chunk_start : 256;
357
+ nk_u32_t best_document_indices[256];
358
+
359
+ nk_maxsim_coarse_argmax_neonsdot_(regions.query_quantized + chunk_start * regions.depth_i8_padded,
360
+ regions.document_quantized, chunk_size, document_count,
361
+ regions.depth_i8_padded, best_document_indices);
362
+
363
+ for (nk_size_t query_index = 0; query_index < chunk_size; query_index++) {
364
+ nk_u32_t best_document_index = best_document_indices[query_index];
365
+ nk_f32_t dot_result;
366
+ nk_dot_f16(
367
+ (nk_f16_t const *)(regions.query_originals +
368
+ (chunk_start + query_index) * regions.query_original_stride),
369
+ (nk_f16_t const *)(regions.document_originals + best_document_index * regions.document_original_stride),
370
+ depth, &dot_result);
371
+ nk_f32_t cosine = dot_result * regions.query_metadata[chunk_start + query_index].inverse_norm_f32 *
372
+ regions.document_metadata[best_document_index].inverse_norm_f32;
373
+ nk_f32_t angular = 1.0f - cosine;
374
+ if (angular < 0.0f) angular = 0.0f;
375
+ total_angular_distance += (nk_f64_t)angular;
376
+ }
377
+ }
378
+
379
+ *result = (nk_f32_t)total_angular_distance;
380
+ }
381
+
382
+ #if defined(__clang__)
383
+ #pragma clang attribute pop
384
+ #elif defined(__GNUC__)
385
+ #pragma GCC pop_options
386
+ #endif
387
+
388
+ #if defined(__cplusplus)
389
+ } // extern "C"
390
+ #endif
391
+
392
+ #endif // NK_TARGET_NEONSDOT
393
+ #endif // NK_TARGET_ARM_
394
+ #endif // NK_MAXSIM_NEONSDOT_H