numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,280 @@
1
+ /**
2
+ * @brief SIMD-accelerated MaxSim (angular distance late-interaction) for WASM Relaxed SIMD.
3
+ * @file include/numkong/maxsim/v128relaxed.h
4
+ * @author Ash Vardanian
5
+ * @date March 5, 2026
6
+ *
7
+ * @sa include/numkong/maxsim.h
8
+ *
9
+ * Uses wasm_i32x4_relaxed_dot_i8x16_i7x16_add for coarse i8 screening.
10
+ * Both operands stay within i7 range [-63, 63] for native signed×signed arithmetic.
11
+ * No bias correction needed (unlike Haswell/Alder XOR-0x80 approach).
12
+ *
13
+ * 1Q×1D tiling (simpler than x86 4x4) with scalar running argmax.
14
+ * Depth steps at 16 bytes (v128 width in bytes).
15
+ */
16
+ #ifndef NK_MAXSIM_V128RELAXED_H
17
+ #define NK_MAXSIM_V128RELAXED_H
18
+
19
+ #if NK_TARGET_V128RELAXED
20
+
21
+ #include "numkong/types.h"
22
+ #include "numkong/maxsim/serial.h" // `nk_maxsim_packed_header_t`
23
+ #include "numkong/dot.h" // `nk_dot_bf16`, `nk_dot_f32`, `nk_dot_f16`
24
+ #include "numkong/cast/serial.h" // `nk_bf16_to_f32_serial`
25
+ #include "numkong/scalar/v128relaxed.h" // `nk_f32_sqrt_v128relaxed`
26
+ #include "numkong/reduce/v128relaxed.h" // `nk_reduce_add_i32x4_v128relaxed_`
27
+
28
+ #if defined(__cplusplus)
29
+ extern "C" {
30
+ #endif
31
+
32
+ #if defined(__clang__)
33
+ #pragma clang attribute push(__attribute__((target("relaxed-simd"))), apply_to = function)
34
+ #endif
35
+
36
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_bf16_v128relaxed(nk_size_t vector_count, nk_size_t depth) {
37
+ return nk_maxsim_packed_size_(vector_count, depth, sizeof(nk_bf16_t), 16);
38
+ }
39
+
40
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_f32_v128relaxed(nk_size_t vector_count, nk_size_t depth) {
41
+ return nk_maxsim_packed_size_(vector_count, depth, sizeof(nk_f32_t), 16);
42
+ }
43
+
44
+ NK_PUBLIC nk_size_t nk_maxsim_packed_size_f16_v128relaxed(nk_size_t vector_count, nk_size_t depth) {
45
+ return nk_maxsim_packed_size_(vector_count, depth, sizeof(nk_f16_t), 16);
46
+ }
47
+
48
+ NK_PUBLIC void nk_maxsim_pack_bf16_v128relaxed( //
49
+ nk_bf16_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t stride, void *packed) {
50
+
51
+ nk_size_t const element_bytes = sizeof(nk_bf16_t);
52
+ nk_size_t depth_i8_padded = nk_maxsim_packed_header_setup_(packed, vector_count, depth, 16, element_bytes);
53
+
54
+ nk_maxsim_packed_header_t const *header = (nk_maxsim_packed_header_t const *)packed;
55
+ nk_i8_t *quantized_i8 = (nk_i8_t *)((char *)packed + header->offset_i8_data);
56
+ nk_maxsim_vector_metadata_t *metadata = (nk_maxsim_vector_metadata_t *)((char *)packed + header->offset_metadata);
57
+ char *originals = (char *)packed + header->offset_original_data;
58
+ nk_size_t const original_stride = header->original_stride_bytes;
59
+
60
+ for (nk_size_t vector_index = 0; vector_index < vector_count; vector_index++) {
61
+ char const *source_row = (char const *)vectors + vector_index * stride;
62
+ nk_f32_t norm_sq;
63
+ nk_maxsim_quantize_vector_(source_row, element_bytes, depth, depth_i8_padded, 63.0f,
64
+ (nk_maxsim_to_f32_t)nk_bf16_to_f32_serial,
65
+ &quantized_i8[vector_index * depth_i8_padded], &metadata[vector_index], &norm_sq);
66
+ metadata[vector_index].inverse_norm_f32 = norm_sq > 0.0f ? (1.0f / nk_f32_sqrt_v128relaxed(norm_sq)) : 0.0f;
67
+ char *destination_original = originals + vector_index * original_stride;
68
+ nk_copy_bytes_(destination_original, source_row, depth * element_bytes);
69
+ for (nk_size_t byte_index = depth * element_bytes; byte_index < original_stride; byte_index++)
70
+ destination_original[byte_index] = 0;
71
+ }
72
+ }
73
+
74
+ NK_PUBLIC void nk_maxsim_pack_f32_v128relaxed( //
75
+ nk_f32_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t stride, void *packed) {
76
+
77
+ nk_size_t const element_bytes = sizeof(nk_f32_t);
78
+ nk_size_t depth_i8_padded = nk_maxsim_packed_header_setup_(packed, vector_count, depth, 16, element_bytes);
79
+
80
+ nk_maxsim_packed_header_t const *header = (nk_maxsim_packed_header_t const *)packed;
81
+ nk_i8_t *quantized_i8 = (nk_i8_t *)((char *)packed + header->offset_i8_data);
82
+ nk_maxsim_vector_metadata_t *metadata = (nk_maxsim_vector_metadata_t *)((char *)packed + header->offset_metadata);
83
+ char *originals = (char *)packed + header->offset_original_data;
84
+ nk_size_t const original_stride = header->original_stride_bytes;
85
+
86
+ for (nk_size_t vector_index = 0; vector_index < vector_count; vector_index++) {
87
+ char const *source_row = (char const *)vectors + vector_index * stride;
88
+ nk_f32_t norm_sq;
89
+ nk_maxsim_quantize_vector_(source_row, element_bytes, depth, depth_i8_padded, 63.0f, nk_f32_to_f32_,
90
+ &quantized_i8[vector_index * depth_i8_padded], &metadata[vector_index], &norm_sq);
91
+ metadata[vector_index].inverse_norm_f32 = norm_sq > 0.0f ? (1.0f / nk_f32_sqrt_v128relaxed(norm_sq)) : 0.0f;
92
+ char *destination_original = originals + vector_index * original_stride;
93
+ nk_copy_bytes_(destination_original, source_row, depth * element_bytes);
94
+ for (nk_size_t byte_index = depth * element_bytes; byte_index < original_stride; byte_index++)
95
+ destination_original[byte_index] = 0;
96
+ }
97
+ }
98
+
99
+ NK_PUBLIC void nk_maxsim_pack_f16_v128relaxed( //
100
+ nk_f16_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t stride, void *packed) {
101
+
102
+ nk_size_t const element_bytes = sizeof(nk_f16_t);
103
+ nk_size_t depth_i8_padded = nk_maxsim_packed_header_setup_(packed, vector_count, depth, 16, element_bytes);
104
+
105
+ nk_maxsim_packed_header_t const *header = (nk_maxsim_packed_header_t const *)packed;
106
+ nk_i8_t *quantized_i8 = (nk_i8_t *)((char *)packed + header->offset_i8_data);
107
+ nk_maxsim_vector_metadata_t *metadata = (nk_maxsim_vector_metadata_t *)((char *)packed + header->offset_metadata);
108
+ char *originals = (char *)packed + header->offset_original_data;
109
+ nk_size_t const original_stride = header->original_stride_bytes;
110
+
111
+ for (nk_size_t vector_index = 0; vector_index < vector_count; vector_index++) {
112
+ char const *source_row = (char const *)vectors + vector_index * stride;
113
+ nk_f32_t norm_sq;
114
+ nk_maxsim_quantize_vector_(source_row, element_bytes, depth, depth_i8_padded, 63.0f,
115
+ (nk_maxsim_to_f32_t)nk_f16_to_f32_serial,
116
+ &quantized_i8[vector_index * depth_i8_padded], &metadata[vector_index], &norm_sq);
117
+ metadata[vector_index].inverse_norm_f32 = norm_sq > 0.0f ? (1.0f / nk_f32_sqrt_v128relaxed(norm_sq)) : 0.0f;
118
+ char *destination_original = originals + vector_index * original_stride;
119
+ nk_copy_bytes_(destination_original, source_row, depth * element_bytes);
120
+ for (nk_size_t byte_index = depth * element_bytes; byte_index < original_stride; byte_index++)
121
+ destination_original[byte_index] = 0;
122
+ }
123
+ }
124
+
125
+ /**
126
+ * @brief Coarse i8 argmax kernel for WASM Relaxed SIMD.
127
+ * Uses relaxed_dot_i8x16_i7x16_add with both operands in [-63, 63].
128
+ * No bias correction needed (native signed×signed arithmetic).
129
+ * Simple 1Q×1D tiling with scalar running argmax.
130
+ */
131
+ NK_INTERNAL void nk_maxsim_coarse_argmax_v128relaxed_( //
132
+ nk_i8_t const *query_i8, nk_i8_t const *document_i8, //
133
+ nk_maxsim_vector_metadata_t const *document_metadata, //
134
+ nk_size_t query_count, nk_size_t document_count, //
135
+ nk_size_t depth_i8_padded, nk_u32_t *best_document_indices) {
136
+
137
+ nk_unused_(document_metadata);
138
+
139
+ for (nk_size_t query_index = 0; query_index < query_count; query_index++) {
140
+ nk_i8_t const *query_i8_row = query_i8 + query_index * depth_i8_padded;
141
+ nk_i32_t running_max_i32 = NK_I32_MIN;
142
+ nk_u32_t running_argmax_u32 = 0;
143
+
144
+ for (nk_size_t document_index = 0; document_index < document_count; document_index++) {
145
+ nk_i8_t const *document_i8_row = document_i8 + document_index * depth_i8_padded;
146
+ v128_t accumulator_i32x4 = wasm_i32x4_splat(0);
147
+
148
+ for (nk_size_t depth_index = 0; depth_index < depth_i8_padded; depth_index += 16) {
149
+ v128_t query_i8x16 = wasm_v128_load(query_i8_row + depth_index);
150
+ v128_t document_i8x16 = wasm_v128_load(document_i8_row + depth_index);
151
+ accumulator_i32x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(query_i8x16, document_i8x16,
152
+ accumulator_i32x4);
153
+ }
154
+
155
+ // Horizontal i32x4 reduce → scalar
156
+ nk_i32_t coarse_dot_i32 = nk_reduce_add_i32x4_v128relaxed_(accumulator_i32x4);
157
+
158
+ if (coarse_dot_i32 > running_max_i32) {
159
+ running_max_i32 = coarse_dot_i32;
160
+ running_argmax_u32 = (nk_u32_t)document_index;
161
+ }
162
+ }
163
+
164
+ best_document_indices[query_index] = running_argmax_u32;
165
+ }
166
+ }
167
+
168
+ NK_PUBLIC void nk_maxsim_packed_bf16_v128relaxed( //
169
+ void const *query_packed, void const *document_packed, nk_size_t query_count, nk_size_t document_count,
170
+ nk_size_t depth, nk_f32_t *result) {
171
+
172
+ nk_maxsim_packed_regions_t regions = nk_maxsim_extract_packed_regions_(query_packed, document_packed);
173
+ nk_f64_t total_angular_distance = 0.0;
174
+
175
+ for (nk_size_t chunk_start = 0; chunk_start < query_count; chunk_start += 256) {
176
+ nk_size_t chunk_size = query_count - chunk_start < 256 ? query_count - chunk_start : 256;
177
+ nk_u32_t best_document_indices[256];
178
+
179
+ nk_maxsim_coarse_argmax_v128relaxed_(regions.query_quantized + chunk_start * regions.depth_i8_padded,
180
+ regions.document_quantized, regions.document_metadata, chunk_size,
181
+ document_count, regions.depth_i8_padded, best_document_indices);
182
+
183
+ for (nk_size_t query_index = 0; query_index < chunk_size; query_index++) {
184
+ nk_u32_t best_document_index = best_document_indices[query_index];
185
+ nk_f32_t dot_result;
186
+ nk_dot_bf16((nk_bf16_t const *)(regions.query_originals +
187
+ (chunk_start + query_index) * regions.query_original_stride),
188
+ (nk_bf16_t const *)(regions.document_originals +
189
+ best_document_index * regions.document_original_stride),
190
+ depth, &dot_result);
191
+ nk_f32_t cosine = dot_result * regions.query_metadata[chunk_start + query_index].inverse_norm_f32 *
192
+ regions.document_metadata[best_document_index].inverse_norm_f32;
193
+ nk_f32_t angular = 1.0f - cosine;
194
+ if (angular < 0.0f) angular = 0.0f;
195
+ total_angular_distance += (nk_f64_t)angular;
196
+ }
197
+ }
198
+
199
+ *result = (nk_f32_t)total_angular_distance;
200
+ }
201
+
202
+ NK_PUBLIC void nk_maxsim_packed_f32_v128relaxed( //
203
+ void const *query_packed, void const *document_packed, nk_size_t query_count, nk_size_t document_count,
204
+ nk_size_t depth, nk_f64_t *result) {
205
+
206
+ nk_maxsim_packed_regions_t regions = nk_maxsim_extract_packed_regions_(query_packed, document_packed);
207
+ nk_f64_t total_angular_distance = 0.0;
208
+
209
+ for (nk_size_t chunk_start = 0; chunk_start < query_count; chunk_start += 256) {
210
+ nk_size_t chunk_size = query_count - chunk_start < 256 ? query_count - chunk_start : 256;
211
+ nk_u32_t best_document_indices[256];
212
+
213
+ nk_maxsim_coarse_argmax_v128relaxed_(regions.query_quantized + chunk_start * regions.depth_i8_padded,
214
+ regions.document_quantized, regions.document_metadata, chunk_size,
215
+ document_count, regions.depth_i8_padded, best_document_indices);
216
+
217
+ for (nk_size_t query_index = 0; query_index < chunk_size; query_index++) {
218
+ nk_u32_t best_document_index = best_document_indices[query_index];
219
+ nk_f64_t dot_result;
220
+ nk_dot_f32(
221
+ (nk_f32_t const *)(regions.query_originals +
222
+ (chunk_start + query_index) * regions.query_original_stride),
223
+ (nk_f32_t const *)(regions.document_originals + best_document_index * regions.document_original_stride),
224
+ depth, &dot_result);
225
+ nk_f64_t cosine = dot_result *
226
+ (nk_f64_t)regions.query_metadata[chunk_start + query_index].inverse_norm_f32 *
227
+ (nk_f64_t)regions.document_metadata[best_document_index].inverse_norm_f32;
228
+ nk_f64_t angular = 1.0 - cosine;
229
+ if (angular < 0.0) angular = 0.0;
230
+ total_angular_distance += angular;
231
+ }
232
+ }
233
+
234
+ *result = total_angular_distance;
235
+ }
236
+
237
+ NK_PUBLIC void nk_maxsim_packed_f16_v128relaxed( //
238
+ void const *query_packed, void const *document_packed, nk_size_t query_count, nk_size_t document_count,
239
+ nk_size_t depth, nk_f32_t *result) {
240
+
241
+ nk_maxsim_packed_regions_t regions = nk_maxsim_extract_packed_regions_(query_packed, document_packed);
242
+ nk_f64_t total_angular_distance = 0.0;
243
+
244
+ for (nk_size_t chunk_start = 0; chunk_start < query_count; chunk_start += 256) {
245
+ nk_size_t chunk_size = query_count - chunk_start < 256 ? query_count - chunk_start : 256;
246
+ nk_u32_t best_document_indices[256];
247
+
248
+ nk_maxsim_coarse_argmax_v128relaxed_(regions.query_quantized + chunk_start * regions.depth_i8_padded,
249
+ regions.document_quantized, regions.document_metadata, chunk_size,
250
+ document_count, regions.depth_i8_padded, best_document_indices);
251
+
252
+ for (nk_size_t query_index = 0; query_index < chunk_size; query_index++) {
253
+ nk_u32_t best_document_index = best_document_indices[query_index];
254
+ nk_f32_t dot_result;
255
+ nk_dot_f16(
256
+ (nk_f16_t const *)(regions.query_originals +
257
+ (chunk_start + query_index) * regions.query_original_stride),
258
+ (nk_f16_t const *)(regions.document_originals + best_document_index * regions.document_original_stride),
259
+ depth, &dot_result);
260
+ nk_f32_t cosine = dot_result * regions.query_metadata[chunk_start + query_index].inverse_norm_f32 *
261
+ regions.document_metadata[best_document_index].inverse_norm_f32;
262
+ nk_f32_t angular = 1.0f - cosine;
263
+ if (angular < 0.0f) angular = 0.0f;
264
+ total_angular_distance += (nk_f64_t)angular;
265
+ }
266
+ }
267
+
268
+ *result = (nk_f32_t)total_angular_distance;
269
+ }
270
+
271
+ #if defined(__clang__)
272
+ #pragma clang attribute pop
273
+ #endif
274
+
275
+ #if defined(__cplusplus)
276
+ } // extern "C"
277
+ #endif
278
+
279
+ #endif // NK_TARGET_V128RELAXED
280
+ #endif // NK_MAXSIM_V128RELAXED_H