numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,240 @@
1
+ /**
2
+ * @brief SIMD-accelerated Set Similarity Measures for WASM.
3
+ * @file include/numkong/set/v128relaxed.h
4
+ * @author Ash Vardanian
5
+ * @date February 1, 2026
6
+ *
7
+ * This file contains windowed implementations of Hamming and Jaccard distance
8
+ * for bit-level operations (u1 packed bits). The windowing optimization reduces
9
+ * widening overhead by 96.7%, providing 5-10× speedup over naive implementations.
10
+ *
11
+ * Algorithm: Accumulate popcount results in u8 for 31 iterations, then widen
12
+ * to u16 → u32 once. Since max(popcount(u8)) = 8, we can safely accumulate
13
+ * 31 × 8 = 248 < 255 (u8 max) without overflow.
14
+ */
15
+
16
+ #ifndef NK_SET_V128RELAXED_H
17
+ #define NK_SET_V128RELAXED_H
18
+
19
+ #if NK_TARGET_V128RELAXED
20
+
21
+ #include "numkong/types.h"
22
+ #include "numkong/reduce/v128relaxed.h"
23
+ #include "numkong/set/serial.h"
24
+
25
+ #if defined(__cplusplus)
26
+ extern "C" {
27
+ #endif
28
+
29
+ #if defined(__clang__)
30
+ #pragma clang attribute push(__attribute__((target("relaxed-simd"))), apply_to = function)
31
+ #endif
32
+
33
+ #pragma region - Binary Sets
34
+
35
+ NK_PUBLIC void nk_hamming_u1_v128relaxed(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result) {
36
+ nk_u8_t const *a_bytes = (nk_u8_t const *)a;
37
+ nk_u8_t const *b_bytes = (nk_u8_t const *)b;
38
+ nk_size_t n_bytes = nk_size_divide_round_up_(n, NK_BITS_PER_BYTE);
39
+
40
+ nk_u32_t differences = 0;
41
+ nk_size_t i = 0;
42
+
43
+ // Windowed accumulation loop
44
+ while (i + 16 <= n_bytes) {
45
+ v128_t popcount_u8x16 = wasm_i8x16_splat(0);
46
+
47
+ // Inner loop: accumulate 31 iterations in u8 before widening
48
+ nk_size_t cycle = 0;
49
+ for (; cycle < 31 && i + 16 <= n_bytes; ++cycle, i += 16) {
50
+ v128_t a_u8x16 = wasm_v128_load(a_bytes + i);
51
+ v128_t b_u8x16 = wasm_v128_load(b_bytes + i);
52
+
53
+ // XOR to find differing bits
54
+ v128_t xor_u8x16 = wasm_v128_xor(a_u8x16, b_u8x16);
55
+
56
+ // Popcount each byte
57
+ v128_t popcnt_u8x16 = wasm_i8x16_popcnt(xor_u8x16);
58
+
59
+ // Accumulate in u8 (safe: 31 × 8 = 248 < 255)
60
+ popcount_u8x16 = wasm_i8x16_add(popcount_u8x16, popcnt_u8x16);
61
+ }
62
+
63
+ // Widen once per window: u8 → u16 → u32
64
+ differences += nk_reduce_add_u8x16_v128relaxed_(popcount_u8x16);
65
+ }
66
+
67
+ // Handle tail bytes
68
+ for (; i < n_bytes; i++) {
69
+ nk_u8_t xor_byte = a_bytes[i] ^ b_bytes[i];
70
+ differences += nk_u1x8_popcount_(xor_byte);
71
+ }
72
+
73
+ *result = differences;
74
+ }
75
+
76
+ NK_PUBLIC void nk_jaccard_u1_v128relaxed(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_f32_t *result) {
77
+ nk_u8_t const *a_bytes = (nk_u8_t const *)a;
78
+ nk_u8_t const *b_bytes = (nk_u8_t const *)b;
79
+ nk_size_t n_bytes = nk_size_divide_round_up_(n, NK_BITS_PER_BYTE);
80
+
81
+ nk_u32_t intersection = 0;
82
+ nk_u32_t union_count = 0;
83
+ nk_size_t i = 0;
84
+
85
+ // Windowed accumulation loop
86
+ while (i + 16 <= n_bytes) {
87
+ v128_t popcount_and_u8x16 = wasm_i8x16_splat(0);
88
+ v128_t popcount_or_u8x16 = wasm_i8x16_splat(0);
89
+
90
+ // Inner loop: accumulate 31 iterations in u8 before widening
91
+ nk_size_t cycle = 0;
92
+ for (; cycle < 31 && i + 16 <= n_bytes; ++cycle, i += 16) {
93
+ v128_t a_u8x16 = wasm_v128_load(a_bytes + i);
94
+ v128_t b_u8x16 = wasm_v128_load(b_bytes + i);
95
+
96
+ // Intersection: a AND b
97
+ v128_t and_u8x16 = wasm_v128_and(a_u8x16, b_u8x16);
98
+ v128_t popcnt_and_u8x16 = wasm_i8x16_popcnt(and_u8x16);
99
+ popcount_and_u8x16 = wasm_i8x16_add(popcount_and_u8x16, popcnt_and_u8x16);
100
+
101
+ // Union: a OR b
102
+ v128_t or_u8x16 = wasm_v128_or(a_u8x16, b_u8x16);
103
+ v128_t popcnt_or_u8x16 = wasm_i8x16_popcnt(or_u8x16);
104
+ popcount_or_u8x16 = wasm_i8x16_add(popcount_or_u8x16, popcnt_or_u8x16);
105
+ }
106
+
107
+ // Widen once per window
108
+ intersection += nk_reduce_add_u8x16_v128relaxed_(popcount_and_u8x16);
109
+ union_count += nk_reduce_add_u8x16_v128relaxed_(popcount_or_u8x16);
110
+ }
111
+
112
+ // Handle tail bytes
113
+ for (; i < n_bytes; i++) {
114
+ nk_u8_t a_byte = a_bytes[i];
115
+ nk_u8_t b_byte = b_bytes[i];
116
+ intersection += nk_u1x8_popcount_(a_byte & b_byte);
117
+ union_count += nk_u1x8_popcount_(a_byte | b_byte);
118
+ }
119
+
120
+ // Jaccard distance = 1 - (intersection / union)
121
+ *result = union_count > 0 ? 1.0f - ((nk_f32_t)intersection / (nk_f32_t)union_count) : 0.0f;
122
+ }
123
+
124
+ #pragma endregion - Binary Sets
125
+
126
+ #pragma region - Integer Sets
127
+
128
+ NK_PUBLIC void nk_hamming_u8_v128relaxed(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result) {
129
+ nk_u32_t sum_total = 0;
130
+ nk_size_t i = 0;
131
+
132
+ // Windowed accumulation: outer loop for windows, inner loop for iterations within window
133
+ while (i + 16 <= n) {
134
+ v128_t sum_u8x16 = wasm_i8x16_splat(0);
135
+
136
+ // Inner loop: accumulate up to 31 iterations in u8 (safe: 31 × 1 = 31 < 255)
137
+ nk_size_t cycle = 0;
138
+ for (; cycle < 31 && i + 16 <= n; ++cycle, i += 16) {
139
+ v128_t a_u8x16 = wasm_v128_load(a + i);
140
+ v128_t b_u8x16 = wasm_v128_load(b + i);
141
+
142
+ // Compare for inequality: 0xFF where different, 0x00 where same
143
+ v128_t neq_mask_u8x16 = wasm_i8x16_ne(a_u8x16, b_u8x16);
144
+
145
+ // Convert mask to count: 0xFF → 1, 0x00 → 0
146
+ v128_t neq_count_u8x16 = wasm_v128_and(neq_mask_u8x16, wasm_i8x16_splat(1));
147
+
148
+ // Accumulate counts
149
+ sum_u8x16 = wasm_i8x16_add(sum_u8x16, neq_count_u8x16);
150
+ }
151
+
152
+ // Widen and reduce once per window
153
+ sum_total += nk_reduce_add_u8x16_v128relaxed_(sum_u8x16);
154
+ }
155
+
156
+ // Traditional tail loop: handle remaining bytes (0-15) scalar-style
157
+ for (; i < n; i++) { sum_total += (a[i] != b[i]); }
158
+
159
+ *result = sum_total;
160
+ }
161
+
162
+ NK_PUBLIC void nk_jaccard_u32_v128relaxed(nk_u32_t const *a, nk_u32_t const *b, nk_size_t n, nk_f32_t *result) {
163
+ nk_u32_t matches = 0;
164
+ nk_size_t i = 0;
165
+ v128_t matches_u32x4 = wasm_i32x4_splat(0);
166
+
167
+ for (; i + 4 <= n; i += 4) {
168
+ v128_t a_u32x4 = wasm_v128_load(a + i);
169
+ v128_t b_u32x4 = wasm_v128_load(b + i);
170
+ v128_t eq_mask_u32x4 = wasm_i32x4_eq(a_u32x4, b_u32x4);
171
+ v128_t match_bits_u32x4 = wasm_u32x4_shr(eq_mask_u32x4, 31);
172
+ matches_u32x4 = wasm_i32x4_add(matches_u32x4, match_bits_u32x4);
173
+ }
174
+
175
+ matches += nk_reduce_add_u32x4_v128relaxed_(matches_u32x4);
176
+ for (; i < n; ++i) matches += (a[i] == b[i]);
177
+
178
+ *result = (n != 0) ? 1.0f - (nk_f32_t)matches / (nk_f32_t)n : 0.0f;
179
+ }
180
+
181
+ NK_PUBLIC void nk_jaccard_u16_v128relaxed(nk_u16_t const *a, nk_u16_t const *b, nk_size_t n, nk_f32_t *result) {
182
+ nk_u32_t matches = 0;
183
+ nk_size_t i = 0;
184
+ v128_t matches_u32x4 = wasm_i32x4_splat(0);
185
+
186
+ for (; i + 8 <= n; i += 8) {
187
+ v128_t a_u16x8 = wasm_v128_load(a + i);
188
+ v128_t b_u16x8 = wasm_v128_load(b + i);
189
+ v128_t eq_mask_u16x8 = wasm_i16x8_eq(a_u16x8, b_u16x8);
190
+ v128_t match_bits_u16x8 = wasm_u16x8_shr(eq_mask_u16x8, 15);
191
+ matches_u32x4 = wasm_i32x4_add(matches_u32x4, wasm_u32x4_extadd_pairwise_u16x8(match_bits_u16x8));
192
+ }
193
+
194
+ matches += nk_reduce_add_u32x4_v128relaxed_(matches_u32x4);
195
+ for (; i < n; ++i) matches += (a[i] == b[i]);
196
+
197
+ *result = (n != 0) ? 1.0f - (nk_f32_t)matches / (nk_f32_t)n : 0.0f;
198
+ }
199
+
200
+ #pragma endregion - Integer Sets
201
+
202
+ #pragma region - Binary Sets from Dot
203
+
204
+ NK_INTERNAL void nk_hamming_u32x4_from_dot_v128relaxed_( //
205
+ nk_b128_vec_t dots, nk_u32_t query_pop, nk_b128_vec_t target_pops, nk_b128_vec_t *results) {
206
+ v128_t dots_u32x4 = dots.v128;
207
+ v128_t query_u32x4 = wasm_u32x4_splat(query_pop);
208
+ v128_t target_u32x4 = target_pops.v128;
209
+ results->v128 = wasm_i32x4_sub(wasm_i32x4_add(query_u32x4, target_u32x4), wasm_i32x4_shl(dots_u32x4, 1));
210
+ }
211
+
212
+ NK_INTERNAL void nk_jaccard_f32x4_from_dot_v128relaxed_( //
213
+ nk_b128_vec_t dots, nk_u32_t query_pop, nk_b128_vec_t target_pops, nk_b128_vec_t *results) {
214
+ v128_t dot_f32x4 = wasm_f32x4_convert_u32x4(dots.v128);
215
+ v128_t query_f32x4 = wasm_f32x4_splat((nk_f32_t)query_pop);
216
+ v128_t target_f32x4 = wasm_f32x4_convert_u32x4(target_pops.v128);
217
+ v128_t union_f32x4 = wasm_f32x4_sub(wasm_f32x4_add(query_f32x4, target_f32x4), dot_f32x4);
218
+
219
+ v128_t zero_f32x4 = wasm_f32x4_splat(0.0f);
220
+ v128_t one_f32x4 = wasm_f32x4_splat(1.0f);
221
+ v128_t zero_mask_u32x4 = wasm_f32x4_eq(union_f32x4, zero_f32x4);
222
+ v128_t safe_union_f32x4 = wasm_i32x4_relaxed_laneselect(one_f32x4, union_f32x4, zero_mask_u32x4);
223
+
224
+ v128_t ratio_f32x4 = wasm_f32x4_div(dot_f32x4, safe_union_f32x4);
225
+ v128_t jaccard_f32x4 = wasm_f32x4_sub(one_f32x4, ratio_f32x4);
226
+ results->v128 = wasm_i32x4_relaxed_laneselect(zero_f32x4, jaccard_f32x4, zero_mask_u32x4);
227
+ }
228
+
229
+ #pragma endregion - Binary Sets from Dot
230
+
231
+ #if defined(__clang__)
232
+ #pragma clang attribute pop
233
+ #endif
234
+
235
+ #if defined(__cplusplus)
236
+ } // extern "C"
237
+ #endif
238
+
239
+ #endif // NK_TARGET_V128RELAXED
240
+ #endif // NK_SET_V128RELAXED_H
@@ -0,0 +1,457 @@
1
+ /**
2
+ * @brief SIMD-accelerated Set Similarity Measures.
3
+ * @file include/numkong/set.h
4
+ * @author Ash Vardanian
5
+ * @date July 1, 2023
6
+ *
7
+ * Contains the following similarity measures:
8
+ *
9
+ * - Bit-level Hamming distance → `u32` counter
10
+ * - Byte-level Hamming distance → `u32` counter
11
+ * - Bit-level Jaccard distance (Tanimoto coefficient) → `f32` ratio
12
+ * - Word-level Jaccard distance for `u16` and `u32` MinHash vectors from StringZilla → `f32` ratio
13
+ *
14
+ * For hardware architectures:
15
+ *
16
+ * - Arm: NEON, SVE
17
+ * - x86: Haswell, Ice Lake
18
+ * - RISC-V: RVV, RVV+BB
19
+ * - WASM: V128Relaxed
20
+ *
21
+ * @section numerical_stability Numerical Stability
22
+ *
23
+ * Hamming u1: u32 popcount accumulator. Overflows at n_bits > 2^32 (~4.3 billion).
24
+ * The streaming u1x512 variant uses u64, safe for any practical dimension.
25
+ * Jaccard u1: u32 intersection/union counts, f32 division at finalization.
26
+ * Popcount values above 2^24 lose precision in f32 cast (24-bit mantissa).
27
+ * Byte-level Hamming/Jaccard u8: u32 mismatch counter. Overflows at n > 2^32.
28
+ *
29
+ * @section popcount_strategies Population Count Strategies
30
+ *
31
+ * Jaccard distances are extremely common and also fairly cheap to compute on binary vectors.
32
+ * The hardest part of optimizing binary similarity measures is the population count operation.
33
+ * It's natively supported by almost every instruction set, but the throughput and latency can
34
+ * be suboptimal. There are several ways to optimize this operation:
35
+ *
36
+ * - Lookup tables, mostly using nibbles (4-bit lookups)
37
+ * - Harley-Seal population counts using Carry-Save Adders (CSA)
38
+ *
39
+ * @section x86_instructions Relevant x86 Instructions
40
+ *
41
+ * On binary vectors, when computing Jaccard distance, the CPU often struggles to compute the
42
+ * large number of required population counts. There are several instructions we should keep in mind:
43
+ *
44
+ * Intrinsic Instruction Ice Genoa
45
+ * _mm512_popcnt_epi64 VPOPCNTQ (ZMM, K, ZMM) 3cy @ p5 2cy @ p01
46
+ * _mm512_shuffle_epi8 VPSHUFB (ZMM, ZMM, ZMM) 1cy @ p5 2cy @ p12
47
+ * _mm512_sad_epu8 VPSADBW (ZMM, ZMM, ZMM) 3cy @ p5 3cy @ p01
48
+ * _mm512_ternarylogic_epi64 VPTERNLOGQ (ZMM, ZMM, ZMM, I8) 1cy @ p05 1cy @ p0123
49
+ * _mm512_gf2p8mul_epi8 VGF2P8MULB (ZMM, ZMM, ZMM) 5cy @ p0 3cy @ p01
50
+ *
51
+ * On Ice Lake, VPOPCNTQ bottlenecks on port 5. On AMD Genoa/Turin, it dual-issues
52
+ * on ports 0-1, making native popcount significantly faster without CSA tricks.
53
+ *
54
+ * @section harley_seal Harley-Seal Carry-Save Adders
55
+ *
56
+ * The Harley-Seal algorithm uses Carry-Save Adders (CSA) to accumulate population counts
57
+ * with fewer VPOPCNTQ instructions. A CSA computes (a + b + c) as (sum, carry) using only
58
+ * bitwise operations, deferring expensive popcounts to the final reduction.
59
+ *
60
+ * Performance varies significantly by architecture and buffer size (cycles/byte):
61
+ *
62
+ * Method Buffer Ice Lake Sapphire Genoa
63
+ * Native VPOPCNTQ any ~0.12 ~0.10 ~0.06
64
+ * Harley-Seal CSA 1 KB 0.107 0.095 0.08
65
+ * Harley-Seal CSA 4 KB 0.056 0.052 0.05
66
+ * VPSHUFB lookup 4 KB 0.063 0.058 0.07
67
+ *
68
+ * For small buffers (<1KB), loop overhead dominates and unrolled native VPOPCNTQ wins.
69
+ * Harley-Seal shines on large buffers where CSA chains amortize the setup cost.
70
+ * On AMD Genoa, native VPOPCNTQ is competitive even for large buffers.
71
+ *
72
+ * @section jaccard_norms Jaccard Optimization via Norms
73
+ *
74
+ * There is a trivial optimization to halve the number of population counts needed for
75
+ * binary Jaccard distance, if one knows the set magnitudes ahead of time:
76
+ *
77
+ * J = |A ∩ B| / |A ∪ B| = |A ∩ B| / (|A| + |B| - |A ∩ B|)
78
+ *
79
+ * At that point the problem reduces to optimizing memory accesses and register usage.
80
+ * For such cases, we provide additional function variants designed exclusively for compile-time
81
+ * dispatch in heavily inlined code, operating on wider vectors with known sizes:
82
+ *
83
+ * - nk_jaccard_u1x512_state_<isa>_t - Smallest optimal running state
84
+ * - nk_jaccard_u1x512_init_<isa> - Initializes the running state
85
+ * - nk_jaccard_u1x512_update_<isa> - Updates the running state with 2 new 512-bit vectors
86
+ * - nk_jaccard_u1x512_finalize_<isa> - Finalizes the running state and produces the distance
87
+ *
88
+ * @section streaming_api Streaming API
89
+ *
90
+ * The streaming variants aren't always strictly equivalent to their counterparts above
91
+ * and their usage also differs quite drastically. For large-scale batch processing where
92
+ * vectors won't be reused, consider non-temporal loads (`_mm512_stream_load_si512`) to
93
+ * bypass the cache and avoid pollution. This is especially beneficial when computing
94
+ * distances across millions of vectors in a single pass.
95
+ *
96
+ * @code{.c}
97
+ * // 1024-dimensional binary vectors, one query and four targets
98
+ * nk_u1x8_t query[128], target_first[128], target_second[128], target_third[128], target_fourth[128];
99
+ * // Precomputed popcounts of the query and of each target, as f32
100
+ * nk_f32_t query_popcount = ...;
101
+ * nk_f32_t target_popcount_first = ..., target_popcount_second = ..., target_popcount_third = ..., target_popcount_fourth = ...;
102
+ *
103
+ * nk_jaccard_u1x512_state_icelake_t state_first, state_second, state_third, state_fourth;
104
+ * nk_jaccard_u1x512_init_icelake(&state_first);
105
+ * nk_jaccard_u1x512_init_icelake(&state_second);
106
+ * nk_jaccard_u1x512_init_icelake(&state_third);
107
+ * nk_jaccard_u1x512_init_icelake(&state_fourth);
108
+ * nk_jaccard_u1x512_update_icelake(&state_first, &query[0], &target_first[0], 0, 512); // First 512 bits
109
+ * nk_jaccard_u1x512_update_icelake(&state_first, &query[64], &target_first[64], 512, 512); // Second 512 bits
110
+ * // ... update state_second, state_third, state_fourth similarly ...
111
+ *
112
+ * nk_f32_t results[4];
113
+ * nk_jaccard_u1x512_finalize_icelake(&state_first, &state_second, &state_third, &state_fourth,
114
+ * query_popcount, target_popcount_first, target_popcount_second,
115
+ * target_popcount_third, target_popcount_fourth, total_dimensions, results);
116
+ * @endcode
117
+ *
118
+ * @section tail_handling Tail Handling
119
+ *
120
+ * The trickiest part is handling the tails of the vectors when their size isn't divisible
121
+ * by our step size. In such cases, it's recommended to use masked loads when supported by
122
+ * the ISA, or fall back to scalar code and a local on-stack buffer.
123
+ *
124
+ * @section references References
125
+ *
126
+ * - Intel Intrinsics Guide: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/
127
+ * - Arm Intrinsics Reference: https://developer.arm.com/architectures/instruction-sets/intrinsics/
128
+ * - Muła et al. "Faster Population Counts": https://arxiv.org/pdf/1611.07612
129
+ * - Muła SSE POPCOUNT experiments: https://github.com/WojciechMula/sse-popcount
130
+ * - NumKong binary R&D tracker: https://github.com/ashvardanian/NumKong/pull/138
131
+ *
132
+ * @section finalize_output_types Finalize Output Types
133
+ *
134
+ * Jaccard similarity finalize outputs to f32:
135
+ * - Jaccard = intersection / union, always ∈ [0.0, 1.0]
136
+ * - f32 provides ~7 decimal digits, far exceeding practical needs
137
+ * - Matches spatial.h convention for non-f64 distance outputs
138
+ * - Reduces memory footprint in large-scale binary similarity search
139
+ *
140
+ * The intersection and union counts are u64 internally for correctness,
141
+ * but the final ratio fits comfortably in f32.
142
+ *
143
+ */
144
+ #ifndef NK_SET_H
145
+ #define NK_SET_H
146
+
147
+ #include "numkong/types.h"
148
+
149
+ #if defined(__cplusplus)
150
+ extern "C" {
151
+ #endif
152
+
153
+ /**
154
+ * @brief Binary Hamming distance computing the number of differing bits between two binary vectors.
155
+ *
156
+ * @param[in] a The first binary vector.
157
+ * @param[in] b The second binary vector.
158
+ * @param[in] n The number of bits in the vectors.
159
+ * @param[out] result The output distance value.
160
+ *
161
+ * @note The output distance value is non-negative.
162
+ * @note The output distance value is zero if and only if the two vectors are identical.
163
+ */
164
+ NK_DYNAMIC void nk_hamming_u1(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result);
165
+
166
+ /**
167
+ * @brief Binary Jaccard distance computing the ratio of differing bits to the union of bits.
168
+ *
169
+ * @param[in] a The first binary vector.
170
+ * @param[in] b The second binary vector.
171
+ * @param[in] n The number of bits in the vectors.
172
+ * @param[out] result The output distance value.
173
+ *
174
+ * @note The output distance value is non-negative.
175
+ * @note The output distance value is zero if and only if the two vectors are identical.
176
+ */
177
+ NK_DYNAMIC void nk_jaccard_u1(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_f32_t *result);
178
+
179
+ /**
180
+ * @brief Integral Jaccard distance computing the ratio of differing bits to the union of bits.
181
+ *
182
+ * @param[in] a The first vector of 32-bit unsigned integers.
183
+ * @param[in] b The second vector of 32-bit unsigned integers.
184
+ * @param[in] n The number of 32-bit scalars in the vectors.
185
+ * @param[out] result The output distance value.
186
+ *
187
+ * @note The output distance value is non-negative.
188
+ * @note The output distance value is zero if and only if the two vectors are identical.
189
+ */
190
+ NK_DYNAMIC void nk_jaccard_u32(nk_u32_t const *a, nk_u32_t const *b, nk_size_t n, nk_f32_t *result);
191
+
192
+ /**
193
+ * @brief Byte-level Hamming distance computing the number of differing bytes between two vectors.
194
+ *
195
+ * @param[in] a The first byte vector.
196
+ * @param[in] b The second byte vector.
197
+ * @param[in] n The number of bytes in the vectors.
198
+ * @param[out] result The output distance value.
199
+ *
200
+ * @note The output distance value is non-negative.
201
+ * @note The output distance value is zero if and only if the two vectors are identical.
202
+ */
203
+ NK_DYNAMIC void nk_hamming_u8(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result);
204
+
205
+ /**
206
+ * @brief Integral Jaccard distance for 16-bit unsigned integer vectors.
207
+ *
208
+ * @param[in] a The first vector.
209
+ * @param[in] b The second vector.
210
+ * @param[in] n The number of 16-bit scalars in the vectors.
211
+ * @param[out] result The output distance value.
212
+ *
213
+ * @note The output distance value is non-negative.
214
+ * @note The output distance value is zero if and only if the two vectors are identical.
215
+ */
216
+ NK_DYNAMIC void nk_jaccard_u16(nk_u16_t const *a, nk_u16_t const *b, nk_size_t n, nk_f32_t *result);
217
+
218
+ /** @copydoc nk_hamming_u1 */
219
+ NK_PUBLIC void nk_hamming_u1_serial(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result);
220
+ /** @copydoc nk_hamming_u8 */
221
+ NK_PUBLIC void nk_hamming_u8_serial(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result);
222
+ /** @copydoc nk_jaccard_u1 */
223
+ NK_PUBLIC void nk_jaccard_u1_serial(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_f32_t *result);
224
+ /** @copydoc nk_jaccard_u32 */
225
+ NK_PUBLIC void nk_jaccard_u32_serial(nk_u32_t const *a, nk_u32_t const *b, nk_size_t n, nk_f32_t *result);
226
+ /** @copydoc nk_jaccard_u16 */
227
+ NK_PUBLIC void nk_jaccard_u16_serial(nk_u16_t const *a, nk_u16_t const *b, nk_size_t n, nk_f32_t *result);
228
+
229
+ #if NK_TARGET_NEON
230
+ /** @copydoc nk_hamming_u1 */
231
+ NK_PUBLIC void nk_hamming_u1_neon(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result);
232
+ /** @copydoc nk_hamming_u8 */
233
+ NK_PUBLIC void nk_hamming_u8_neon(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result);
234
+ /** @copydoc nk_jaccard_u1 */
235
+ NK_PUBLIC void nk_jaccard_u1_neon(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_f32_t *result);
236
+ /** @copydoc nk_jaccard_u32 */
237
+ NK_PUBLIC void nk_jaccard_u32_neon(nk_u32_t const *a, nk_u32_t const *b, nk_size_t n, nk_f32_t *result);
238
+ /** @copydoc nk_jaccard_u16 */
239
+ NK_PUBLIC void nk_jaccard_u16_neon(nk_u16_t const *a, nk_u16_t const *b, nk_size_t n, nk_f32_t *result);
240
+
241
+ #endif // NK_TARGET_NEON
242
+
243
+ #if NK_TARGET_SVE
244
+ /** @copydoc nk_hamming_u1 */
245
+ NK_PUBLIC void nk_hamming_u1_sve(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result);
246
+ /** @copydoc nk_hamming_u8 */
247
+ NK_PUBLIC void nk_hamming_u8_sve(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result);
248
+ /** @copydoc nk_jaccard_u1 */
249
+ NK_PUBLIC void nk_jaccard_u1_sve(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_f32_t *result);
250
+ /** @copydoc nk_jaccard_u32 */
251
+ NK_PUBLIC void nk_jaccard_u32_sve(nk_u32_t const *a, nk_u32_t const *b, nk_size_t n, nk_f32_t *result);
252
+ /** @copydoc nk_jaccard_u16 */
253
+ NK_PUBLIC void nk_jaccard_u16_sve(nk_u16_t const *a, nk_u16_t const *b, nk_size_t n, nk_f32_t *result);
254
+ #endif // NK_TARGET_SVE
255
+
256
+ #if NK_TARGET_HASWELL
257
+ /** @copydoc nk_hamming_u1 */
258
+ NK_PUBLIC void nk_hamming_u1_haswell(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result);
259
+ /** @copydoc nk_hamming_u8 */
260
+ NK_PUBLIC void nk_hamming_u8_haswell(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result);
261
+ /** @copydoc nk_jaccard_u1 */
262
+ NK_PUBLIC void nk_jaccard_u1_haswell(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_f32_t *result);
263
+ /** @copydoc nk_jaccard_u16 */
264
+ NK_PUBLIC void nk_jaccard_u16_haswell(nk_u16_t const *a, nk_u16_t const *b, nk_size_t n, nk_f32_t *result);
265
+ /** @copydoc nk_jaccard_u32 */
266
+ NK_PUBLIC void nk_jaccard_u32_haswell(nk_u32_t const *a, nk_u32_t const *b, nk_size_t n, nk_f32_t *result);
267
+ #endif // NK_TARGET_HASWELL
268
+
269
+ #if NK_TARGET_ICELAKE
270
+ /** @copydoc nk_hamming_u1 */
271
+ NK_PUBLIC void nk_hamming_u1_icelake(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result);
272
+ /** @copydoc nk_hamming_u8 */
273
+ NK_PUBLIC void nk_hamming_u8_icelake(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result);
274
+ /** @copydoc nk_jaccard_u1 */
275
+ NK_PUBLIC void nk_jaccard_u1_icelake(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_f32_t *result);
276
+ /** @copydoc nk_jaccard_u32 */
277
+ NK_PUBLIC void nk_jaccard_u32_icelake(nk_u32_t const *a, nk_u32_t const *b, nk_size_t n, nk_f32_t *result);
278
+ /** @copydoc nk_jaccard_u16 */
279
+ NK_PUBLIC void nk_jaccard_u16_icelake(nk_u16_t const *a, nk_u16_t const *b, nk_size_t n, nk_f32_t *result);
280
+ #endif // NK_TARGET_ICELAKE
281
+
282
+ #if NK_TARGET_RVVBB
283
+ /** @copydoc nk_hamming_u1 */
284
+ NK_PUBLIC void nk_hamming_u1_rvvbb(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result);
285
+ /** @copydoc nk_jaccard_u1 */
286
+ NK_PUBLIC void nk_jaccard_u1_rvvbb(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_f32_t *result);
287
+ #endif // NK_TARGET_RVVBB
288
+
289
+ #if NK_TARGET_RVV
290
+ /** @copydoc nk_hamming_u1 */
291
+ NK_PUBLIC void nk_hamming_u1_rvv(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result);
292
+ /** @copydoc nk_hamming_u8 */
293
+ NK_PUBLIC void nk_hamming_u8_rvv(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result);
294
+ /** @copydoc nk_jaccard_u1 */
295
+ NK_PUBLIC void nk_jaccard_u1_rvv(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_f32_t *result);
296
+ /** @copydoc nk_jaccard_u16 */
297
+ NK_PUBLIC void nk_jaccard_u16_rvv(nk_u16_t const *a, nk_u16_t const *b, nk_size_t n, nk_f32_t *result);
298
+ /** @copydoc nk_jaccard_u32 */
299
+ NK_PUBLIC void nk_jaccard_u32_rvv(nk_u32_t const *a, nk_u32_t const *b, nk_size_t n, nk_f32_t *result);
300
+ #endif // NK_TARGET_RVV
301
+
302
+ #if NK_TARGET_V128RELAXED
303
+ /** @copydoc nk_hamming_u1 */
304
+ NK_PUBLIC void nk_hamming_u1_v128relaxed(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result);
305
+ /** @copydoc nk_hamming_u8 */
306
+ NK_PUBLIC void nk_hamming_u8_v128relaxed(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result);
307
+ /** @copydoc nk_jaccard_u1 */
308
+ NK_PUBLIC void nk_jaccard_u1_v128relaxed(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_f32_t *result);
309
+ /** @copydoc nk_jaccard_u16 */
310
+ NK_PUBLIC void nk_jaccard_u16_v128relaxed(nk_u16_t const *a, nk_u16_t const *b, nk_size_t n, nk_f32_t *result);
311
+ /** @copydoc nk_jaccard_u32 */
312
+ NK_PUBLIC void nk_jaccard_u32_v128relaxed(nk_u32_t const *a, nk_u32_t const *b, nk_size_t n, nk_f32_t *result);
313
+ #endif // NK_TARGET_V128RELAXED
314
+
315
+ /**
316
+ * @brief Returns the output dtype for Hamming distance.
317
+ */
318
+ NK_INTERNAL nk_dtype_t nk_hamming_output_dtype(nk_dtype_t dtype) {
319
+ switch (dtype) {
320
+ case nk_u1_k: return nk_u32_k;
321
+ case nk_u8_k: return nk_u32_k;
322
+ default: return nk_dtype_unknown_k;
323
+ }
324
+ }
325
+
326
+ /**
327
+ * @brief Returns the output dtype for Jaccard distance.
328
+ */
329
+ NK_INTERNAL nk_dtype_t nk_jaccard_output_dtype(nk_dtype_t dtype) {
330
+ switch (dtype) {
331
+ case nk_u1_k: return nk_f32_k;
332
+ case nk_u16_k: return nk_f32_k;
333
+ case nk_u32_k: return nk_f32_k;
334
+ default: return nk_dtype_unknown_k;
335
+ }
336
+ }
337
+
338
+ #if defined(__cplusplus)
339
+ } // extern "C"
340
+ #endif
341
+
342
+ #include "numkong/set/serial.h"
343
+ #include "numkong/set/neon.h"
344
+ #include "numkong/set/sve.h"
345
+ #include "numkong/set/icelake.h"
346
+ #include "numkong/set/haswell.h"
347
+ #include "numkong/set/v128relaxed.h"
348
+ #include "numkong/set/rvv.h"
349
+ #include "numkong/set/rvvbb.h"
350
+
351
+ #if defined(__cplusplus)
352
+ extern "C" {
353
+ #endif
354
+
355
+ #if !NK_DYNAMIC_DISPATCH
356
+
357
+ NK_PUBLIC void nk_hamming_u1(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result) {
358
+ #if NK_TARGET_V128RELAXED
359
+ nk_hamming_u1_v128relaxed(a, b, n, result);
360
+ #elif NK_TARGET_SVE
361
+ nk_hamming_u1_sve(a, b, n, result);
362
+ #elif NK_TARGET_NEON
363
+ nk_hamming_u1_neon(a, b, n, result);
364
+ #elif NK_TARGET_ICELAKE
365
+ nk_hamming_u1_icelake(a, b, n, result);
366
+ #elif NK_TARGET_HASWELL
367
+ nk_hamming_u1_haswell(a, b, n, result);
368
+ #elif NK_TARGET_RVVBB
369
+ nk_hamming_u1_rvvbb(a, b, n, result);
370
+ #elif NK_TARGET_RVV
371
+ nk_hamming_u1_rvv(a, b, n, result);
372
+ #else
373
+ nk_hamming_u1_serial(a, b, n, result);
374
+ #endif
375
+ }
376
+
377
+ NK_PUBLIC void nk_jaccard_u1(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_f32_t *result) {
378
+ #if NK_TARGET_V128RELAXED
379
+ nk_jaccard_u1_v128relaxed(a, b, n, result);
380
+ #elif NK_TARGET_SVE
381
+ nk_jaccard_u1_sve(a, b, n, result);
382
+ #elif NK_TARGET_NEON
383
+ nk_jaccard_u1_neon(a, b, n, result);
384
+ #elif NK_TARGET_ICELAKE
385
+ nk_jaccard_u1_icelake(a, b, n, result);
386
+ #elif NK_TARGET_HASWELL
387
+ nk_jaccard_u1_haswell(a, b, n, result);
388
+ #elif NK_TARGET_RVVBB
389
+ nk_jaccard_u1_rvvbb(a, b, n, result);
390
+ #elif NK_TARGET_RVV
391
+ nk_jaccard_u1_rvv(a, b, n, result);
392
+ #else
393
+ nk_jaccard_u1_serial(a, b, n, result);
394
+ #endif
395
+ }
396
+
397
+ NK_PUBLIC void nk_jaccard_u32(nk_u32_t const *a, nk_u32_t const *b, nk_size_t n, nk_f32_t *result) {
398
+ #if NK_TARGET_V128RELAXED
399
+ nk_jaccard_u32_v128relaxed(a, b, n, result);
400
+ #elif NK_TARGET_SVE
401
+ nk_jaccard_u32_sve(a, b, n, result);
402
+ #elif NK_TARGET_NEON
403
+ nk_jaccard_u32_neon(a, b, n, result);
404
+ #elif NK_TARGET_ICELAKE
405
+ nk_jaccard_u32_icelake(a, b, n, result);
406
+ #elif NK_TARGET_HASWELL
407
+ nk_jaccard_u32_haswell(a, b, n, result);
408
+ #elif NK_TARGET_RVV
409
+ nk_jaccard_u32_rvv(a, b, n, result);
410
+ #else
411
+ nk_jaccard_u32_serial(a, b, n, result);
412
+ #endif
413
+ }
414
+
415
+ NK_PUBLIC void nk_hamming_u8(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result) {
416
+ #if NK_TARGET_V128RELAXED
417
+ nk_hamming_u8_v128relaxed(a, b, n, result);
418
+ #elif NK_TARGET_SVE
419
+ nk_hamming_u8_sve(a, b, n, result);
420
+ #elif NK_TARGET_NEON
421
+ nk_hamming_u8_neon(a, b, n, result);
422
+ #elif NK_TARGET_ICELAKE
423
+ nk_hamming_u8_icelake(a, b, n, result);
424
+ #elif NK_TARGET_HASWELL
425
+ nk_hamming_u8_haswell(a, b, n, result);
426
+ #elif NK_TARGET_RVV
427
+ nk_hamming_u8_rvv(a, b, n, result);
428
+ #else
429
+ nk_hamming_u8_serial(a, b, n, result);
430
+ #endif
431
+ }
432
+
433
+ NK_PUBLIC void nk_jaccard_u16(nk_u16_t const *a, nk_u16_t const *b, nk_size_t n, nk_f32_t *result) {
434
+ #if NK_TARGET_V128RELAXED
435
+ nk_jaccard_u16_v128relaxed(a, b, n, result);
436
+ #elif NK_TARGET_SVE
437
+ nk_jaccard_u16_sve(a, b, n, result);
438
+ #elif NK_TARGET_NEON
439
+ nk_jaccard_u16_neon(a, b, n, result);
440
+ #elif NK_TARGET_ICELAKE
441
+ nk_jaccard_u16_icelake(a, b, n, result);
442
+ #elif NK_TARGET_HASWELL
443
+ nk_jaccard_u16_haswell(a, b, n, result);
444
+ #elif NK_TARGET_RVV
445
+ nk_jaccard_u16_rvv(a, b, n, result);
446
+ #else
447
+ nk_jaccard_u16_serial(a, b, n, result);
448
+ #endif
449
+ }
450
+
451
+ #endif // !NK_DYNAMIC_DISPATCH
452
+
453
+ #if defined(__cplusplus)
454
+ } // extern "C"
455
+ #endif
456
+
457
+ #endif