numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,114 @@
1
+ /**
2
+ * @brief C++ bindings for set-distance kernels (Hamming and Jaccard).
3
+ * @file include/numkong/set.hpp
4
+ * @author Ash Vardanian
5
+ * @date February 5, 2026
6
+ */
7
+ #ifndef NK_SET_HPP
8
+ #define NK_SET_HPP
9
+
10
+ #include <cstdint>
11
+ #include <type_traits>
12
+
13
+ #include "numkong/set.h"
14
+ #include "numkong/sets.h"
15
+
16
+ #include "numkong/types.hpp"
17
+
18
+ namespace ashvardanian::numkong {
19
+
20
+ /**
21
+ * @brief Hamming distance: Σ(aᵢ ⊕ bᵢ)
22
+ * @param[in] a,b Input vectors; the fallback loop reads `divide_round_up(d, dimensions_per_value<in_type_>())` values from each
23
+ * @param[in] d Number of dimensions, which may differ from the number of stored values for packed types
24
+ * @param[out] r Pointer to output count
25
+ * @note Dispatches to `nk_hamming_u1` (u1x8_t) or `nk_hamming_u8` (u8_t) only when SIMD is allowed and `result_type_` is the default; every other combination sums `count_differences` over the values.
26
+ * @tparam in_type_ Input vector element type (u1x8_t or u8_t)
27
+ * @tparam result_type_ Accumulator type, defaults to `in_type_::hamming_result_t`
28
+ * @tparam allow_simd_ Enable SIMD kernel dispatch when `prefer_simd_k`
29
+ */
30
+ template <numeric_dtype in_type_, numeric_dtype result_type_ = typename in_type_::hamming_result_t,
31
+ allow_simd_t allow_simd_ = prefer_simd_k>
32
+ void hamming(in_type_ const *a, in_type_ const *b, std::size_t d, result_type_ *r) noexcept {
33
+ constexpr bool simd = allow_simd_ == prefer_simd_k &&
34
+ std::is_same_v<result_type_, typename in_type_::hamming_result_t>;
35
+ // The branch is selected at compile time; the final `else` is the generic per-value loop.
36
+ if constexpr (std::is_same_v<in_type_, u1x8_t> && simd) nk_hamming_u1(&a->raw_, &b->raw_, d, &r->raw_);
37
+ else if constexpr (std::is_same_v<in_type_, u8_t> && simd) nk_hamming_u8(&a->raw_, &b->raw_, d, &r->raw_);
38
+ else {
39
+ constexpr std::size_t dims_per_value = dimensions_per_value<in_type_>();
40
+ std::size_t n = divide_round_up(d, dims_per_value); // values to visit, counting a partially filled last value
41
+ typename result_type_::raw_t count = 0;
42
+ for (std::size_t i = 0; i < n; i++) count += count_differences(a[i], b[i]);
43
+ *r = result_type_::from_raw(count);
44
+ }
45
+ }
46
+
47
+ /**
48
+ * @brief Jaccard distance: 1 − |A ∩ B| / |A ∪ B|
49
+ * @param[in] a,b Input vectors; the fallback loop reads `divide_round_up(d, dimensions_per_value<in_type_>())` values from each
50
+ * @param[in] d Number of dimensions, which may differ from the number of stored values for packed types
51
+ * @param[out] r Pointer to output distance
52
+ *
53
+ * For u1x8_t (bit vectors): uses popcount(AND) / popcount(OR)
54
+ * For u16_t/u32_t (element vectors): uses count of matching elements / total
55
+ * The fallback loop writes a value-initialized `result_type_` when the union count is zero, so it never divides by zero.
56
+ * @tparam in_type_ Input vector element type (u1x8_t, u16_t, or u32_t)
57
+ * @tparam result_type_ Accumulator type, defaults to `in_type_::jaccard_result_t`
58
+ * @tparam allow_simd_ Enable SIMD kernel dispatch when `prefer_simd_k`
59
+ */
60
+ template <numeric_dtype in_type_, numeric_dtype result_type_ = typename in_type_::jaccard_result_t,
61
+ allow_simd_t allow_simd_ = prefer_simd_k>
62
+ void jaccard(in_type_ const *a, in_type_ const *b, std::size_t d, result_type_ *r) noexcept {
63
+ constexpr bool simd = allow_simd_ == prefer_simd_k &&
64
+ std::is_same_v<result_type_, typename in_type_::jaccard_result_t>;
65
+ // The C kernels are used only with the default result type; the final `else` is the generic per-value loop.
66
+ if constexpr (std::is_same_v<in_type_, u1x8_t> && simd) nk_jaccard_u1(&a->raw_, &b->raw_, d, &r->raw_);
67
+ else if constexpr (std::is_same_v<in_type_, u16_t> && simd) nk_jaccard_u16(&a->raw_, &b->raw_, d, &r->raw_);
68
+ else if constexpr (std::is_same_v<in_type_, u32_t> && simd) nk_jaccard_u32(&a->raw_, &b->raw_, d, &r->raw_);
69
+ else {
70
+ constexpr std::size_t dims_per_value = dimensions_per_value<in_type_>();
71
+ std::size_t n = divide_round_up(d, dims_per_value); // values to visit, counting a partially filled last value
72
+ std::uint32_t intersection_count = 0, union_count = 0;
73
+ for (std::size_t i = 0; i < n; i++)
74
+ intersection_count += count_intersection(a[i], b[i]), union_count += count_union(a[i], b[i]);
75
+ if (union_count == 0) *r = result_type_(); // empty union: skip the division
76
+ else *r = result_type_(1) - result_type_(intersection_count) / result_type_(union_count);
77
+ }
78
+ }
79
+
80
+ } // namespace ashvardanian::numkong
81
+
82
+ #include "numkong/tensor.hpp"
83
+
84
+ namespace ashvardanian::numkong {
85
+ /** @brief Tensor-view overload of `hamming`: forwards `a.data()` and `b.data()` to the pointer overload; `d` is still passed explicitly. */
86
+ template <numeric_dtype in_type_, numeric_dtype result_type_ = typename in_type_::hamming_result_t,
87
+ allow_simd_t allow_simd_ = prefer_simd_k, std::size_t max_rank_a_, std::size_t max_rank_b_>
88
+ void hamming(tensor_view<in_type_, max_rank_a_> a, tensor_view<in_type_, max_rank_b_> b, std::size_t d,
89
+ result_type_ *r) noexcept {
90
+ hamming<in_type_, result_type_, allow_simd_>(a.data(), b.data(), d, r);
91
+ }
92
+ /** @brief Vector-view overload of `hamming`: forwards `a.data()` and `b.data()` to the pointer overload; `d` is still passed explicitly. */
93
+ template <numeric_dtype in_type_, numeric_dtype result_type_ = typename in_type_::hamming_result_t,
94
+ allow_simd_t allow_simd_ = prefer_simd_k>
95
+ void hamming(vector_view<in_type_> a, vector_view<in_type_> b, std::size_t d, result_type_ *r) noexcept {
96
+ hamming<in_type_, result_type_, allow_simd_>(a.data(), b.data(), d, r);
97
+ }
98
+ /** @brief Tensor-view overload of `jaccard`: forwards `a.data()` and `b.data()` to the pointer overload; `d` is still passed explicitly. */
99
+ template <numeric_dtype in_type_, numeric_dtype result_type_ = typename in_type_::jaccard_result_t,
100
+ allow_simd_t allow_simd_ = prefer_simd_k, std::size_t max_rank_a_, std::size_t max_rank_b_>
101
+ void jaccard(tensor_view<in_type_, max_rank_a_> a, tensor_view<in_type_, max_rank_b_> b, std::size_t d,
102
+ result_type_ *r) noexcept {
103
+ jaccard<in_type_, result_type_, allow_simd_>(a.data(), b.data(), d, r);
104
+ }
105
+ /** @brief Vector-view overload of `jaccard`: forwards `a.data()` and `b.data()` to the pointer overload; `d` is still passed explicitly. */
106
+ template <numeric_dtype in_type_, numeric_dtype result_type_ = typename in_type_::jaccard_result_t,
107
+ allow_simd_t allow_simd_ = prefer_simd_k>
108
+ void jaccard(vector_view<in_type_> a, vector_view<in_type_> b, std::size_t d, result_type_ *r) noexcept {
109
+ jaccard<in_type_, result_type_, allow_simd_>(a.data(), b.data(), d, r);
110
+ }
111
+
112
+ } // namespace ashvardanian::numkong
113
+
114
+ #endif // NK_SET_HPP
@@ -0,0 +1,149 @@
1
+ # Batched Set Distances in NumKong
2
+
3
+ NumKong implements batched M×N Hamming and Jaccard distance matrices for binary vectors. The module reuses the dots u1 packing and GEMM infrastructure, converting popcount-of-AND dot products to set distances via precomputed norms.
4
+
5
+ Hamming distance from batched dot products:
6
+
7
+ ```math
8
+ D_{ij} = \|A_i\|_1 + \|B_j\|_1 - 2 \cdot \text{dot}(A_i, B_j)
9
+ ```
10
+
11
+ Where dot = popcount(AND), measuring intersection size.
12
+
13
+ Jaccard distance from batched dot products:
14
+
15
+ ```math
16
+ D_{ij} = 1 - \frac{\text{dot}(A_i, B_j)}{\|A_i\|_1 + \|B_j\|_1 - \text{dot}(A_i, B_j)}
17
+ ```
18
+
19
+ Reformulating as Python pseudocode:
20
+
21
+ ```python
22
+ import numpy as np
23
+
24
+ def hammings_packed(a: np.ndarray, b: np.ndarray) -> np.ndarray:
25
+ dots = np.array([[np.unpackbits(np.bitwise_and(ai, bj)).sum()
26
+ for bj in b] for ai in a])
27
+ a_pop = np.array([np.unpackbits(ai).sum() for ai in a])[:, None]
28
+ b_pop = np.array([np.unpackbits(bj).sum() for bj in b])[None, :]
29
+ return a_pop + b_pop - 2 * dots
30
+
31
+ def jaccards_packed(a: np.ndarray, b: np.ndarray) -> np.ndarray:
32
+ dots = np.array([[np.unpackbits(np.bitwise_and(ai, bj)).sum()
33
+ for bj in b] for ai in a])
34
+ a_pop = np.array([np.unpackbits(ai).sum() for ai in a])[:, None]
35
+ b_pop = np.array([np.unpackbits(bj).sum() for bj in b])[None, :]
36
+ union = a_pop + b_pop - dots
37
+ return np.where(union > 0, 1.0 - dots / union, 0.0)
38
+ ```
39
+
40
+ ## Input & Output Types
41
+
42
+ | Input Type | Output Type | Description |
43
+ | ---------- | ----------- | -------------------------------------- |
44
+ | `u1` | `u32` | Binary Hamming distance, packed octets |
45
+ | `u1` | `f32` | Binary Jaccard distance, packed octets |
46
+
47
+ ## Optimizations
48
+
49
+ ### Hamming and Jaccard from Intersection Counts
50
+
51
+ `nk_hammings_packed_u1_serial`, `nk_hammings_packed_u1_haswell`, `nk_jaccards_packed_u1_serial`, `nk_jaccards_packed_u1_haswell` reuse the dots u1 GEMM output where each dot product $\text{dot}(a, b) = \text{popcount}(a \mathbin{\&} b) = |A \cap B|$ counts intersection bits.
52
+ The L1 norm of a binary vector is its popcount: $|A| = \text{popcount}(a) = \|a\|_1$.
53
+ By inclusion-exclusion, $|A \cup B| = |A| + |B| - |A \cap B|$.
54
+ Hamming distance counts positions where exactly one of the two vectors has the bit set: $D_H = |A| + |B| - 2|A \cap B| = \text{popcount}(a \oplus b)$.
55
+ Finalizer `nk_hamming_u32x4_from_dot_serial_` computes `pop_a + pop_b - 2 * dot` in pure UInt32 arithmetic — no division, no float conversion, no sqrt.
56
+ Jaccard distance: $D_J = 1 - \frac{|A \cap B|}{|A \cup B|} = 1 - \frac{\text{dot}}{\text{pop}_a + \text{pop}_b - \text{dot}}$.
57
+ Finalizer `nk_jaccard_f32x4_from_dot_serial_` requires UInt32 → Float32 cast plus Float32 division (~11cy latency on Haswell), making it ~3× more expensive per element than Hamming's integer subtraction chain.
58
+ Per-column popcount norms ($\|a\|_1$, $\|b\|_1$) are precomputed during packing and stored in packed buffer metadata, avoiding per-pair recomputation.
59
+
60
+ ### SME Binary Outer-Product Accumulation
61
+
62
+ `nk_hammings_packed_u1_smebi32`, `nk_jaccards_packed_u1_smebi32` use the `BMOPA` instruction which computes $\text{popcount}(\text{XNOR}(a, b))$ — counting _matching_ bits in a single outer-product operation over 16×16 output tiles with 512-bit depth chunks.
63
+ This is fundamentally different from the AND+POPCNT used by scalar/NEON/x86 kernels, which count _intersection_ bits.
64
+ Hamming from `BMOPA`: $D_H = \text{depth\_bits} - \text{popcount}(\text{XNOR})$, since XOR popcount (differing bits) is the Hamming distance directly — no per-vector norm correction needed.
65
+ Jaccard from `BMOPA`: must convert matching-bit counts to intersection via $|A \cap B| = (\text{popcount}(\text{XNOR}) - (\text{depth\_bits} - |A| - |B|)) / 2$, then apply the Jaccard formula — more arithmetic than the AND-based path.
66
+ Streaming mode overhead (~50–100 cycles for `SMSTART`/`SMSTOP`) is amortized across the full M×N output.
67
+
68
+ ## Performance
69
+
70
+ The following performance tables are produced by manually re-running the bundled internal tools `nk_test` and `nk_bench` to measure both accuracy and throughput at different input shapes.
71
+ The input size is controlled by `NK_MATRIX_HEIGHT`, `NK_MATRIX_WIDTH`, and `NK_MATRIX_DEPTH` environment variables, all set to the same value for batched set operations over square matrices.
72
+ Columns show throughput for 256³, 1024³, and 4096³ configurations.
73
+ The throughput is measured in GSO/s as Giga Scalar Operations per Second.
74
+ Accuracy is reported where applicable as the exact distance in the result representation; floating-point Jaccard rows are shown as mean ULP (units in the last place).
75
+ Each kernel runs for at least 20 seconds per configuration.
76
+ Benchmark threads are pinned to specific cores; on machines with heterogeneous core types (e.g., Apple P/E cores), only the fastest cores are used.
77
+ Workloads that significantly degrade CPU frequencies (Intel AMX, Apple SME) run in separate passes to avoid affecting throughput measurements of other kernels.
78
+
79
+ ### Intel Sapphire Rapids
80
+
81
+ #### Native
82
+
83
+ | Kernel | 256³ | 1024³ | 4096³ |
84
+ | :--------------------------------- | -----------------------: | -----------------------: | -----------------------: |
85
+ | __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
86
+ | `nk_hammings_packed_u1_serial` | 109 gso/s | 162 gso/s | 284 gso/s |
87
+ | `nk_hammings_symmetric_u1_serial` | 39.7 gso/s | 133 gso/s | 325 gso/s |
88
+ | `nk_jaccards_packed_u1_serial` | 54.8 gso/s, 0 ulp | 128 gso/s, 0 ulp | 259 gso/s, 0 ulp |
89
+ | `nk_jaccards_symmetric_u1_serial` | 29.8 gso/s, 0 ulp | 110 gso/s, 0 ulp | 292 gso/s, 0 ulp |
90
+ | `nk_hammings_packed_u1_haswell` | 100 gso/s | 126 gso/s | 168 gso/s |
91
+ | `nk_hammings_symmetric_u1_haswell` | 58.5 gso/s | 132 gso/s | 328 gso/s |
92
+ | `nk_jaccards_packed_u1_haswell` | 84.2 gso/s, 0.3 ulp | 124 gso/s, 0.3 ulp | 165 gso/s, 0.3 ulp |
93
+ | `nk_jaccards_symmetric_u1_haswell` | 57.6 gso/s, 0.3 ulp | 131 gso/s, 0.3 ulp | 324 gso/s, 0.3 ulp |
94
+ | `nk_hammings_packed_u1_icelake` | 110 gso/s | 340 gso/s | 604 gso/s |
95
+ | `nk_hammings_symmetric_u1_icelake` | 76.2 gso/s | 258 gso/s | 1,040 gso/s |
96
+ | `nk_jaccards_packed_u1_icelake` | 89.2 gso/s, 0.3 ulp | 312 gso/s, 0.3 ulp | 601 gso/s, 0.3 ulp |
97
+ | `nk_jaccards_symmetric_u1_icelake` | 66.9 gso/s, 0.3 ulp | 260 gso/s, 0.3 ulp | 965 gso/s, 0.3 ulp |
98
+
99
+ #### WASM
100
+
101
+ Measured with Wasmtime v42 (Cranelift backend).
102
+
103
+ | Kernel | 256³ | 1024³ | 4096³ |
104
+ | :------------------------------------- | -----------------------: | -----------------------: | -----------------------: |
105
+ | __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
106
+ | `nk_hammings_packed_u1_serial` | 43.7 gso/s | 68.0 gso/s | 74.7 gso/s |
107
+ | `nk_hammings_packed_u1_v128relaxed` | 75.3 gso/s | 134 gso/s | 144 gso/s |
108
+ | `nk_hammings_symmetric_u1_serial` | 3.72 gso/s | 13.5 gso/s | 41.0 gso/s |
109
+ | `nk_hammings_symmetric_u1_v128relaxed` | 3.64 gso/s | 13.9 gso/s | 42.2 gso/s |
110
+ | `nk_jaccards_packed_u1_serial` | 33.7 gso/s, 0 ulp | 61.3 gso/s, 0 ulp | 73.2 gso/s, 0 ulp |
111
+ | `nk_jaccards_packed_u1_v128relaxed` | 66.4 gso/s, 0 ulp | 129 gso/s, 0 ulp | 143 gso/s, 0 ulp |
112
+ | `nk_jaccards_symmetric_u1_serial` | 3.57 gso/s, 0 ulp | 13.3 gso/s, 0 ulp | 40.6 gso/s, 0 ulp |
113
+ | `nk_jaccards_symmetric_u1_v128relaxed` | 3.65 gso/s, 0 ulp | 13.9 gso/s, 0 ulp | 42.2 gso/s, 0 ulp |
114
+
115
+ ### Apple M4
116
+
117
+ #### Native
118
+
119
+ | Kernel | 256³ | 1024³ | 4096³ |
120
+ | :--------------------------------- | -----------------------: | -----------------------: | -----------------------: |
121
+ | __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
122
+ | `nk_hammings_packed_u1_serial` | 154 gso/s | 204 gso/s | 221 gso/s |
123
+ | `nk_hammings_symmetric_u1_serial` | 101 gso/s | 159 gso/s | 172 gso/s |
124
+ | `nk_jaccards_packed_u1_serial` | 116 gso/s, 0 ulp | 203 gso/s, 0 ulp | 232 gso/s, 0 ulp |
125
+ | `nk_jaccards_symmetric_u1_serial` | 86.3 gso/s, 0 ulp | 157 gso/s, 0 ulp | 176 gso/s, 0 ulp |
126
+ | `nk_hammings_packed_u1_neon` | 315 gso/s | 428 gso/s | 481 gso/s |
127
+ | `nk_hammings_symmetric_u1_neon` | 132 gso/s | 240 gso/s | 294 gso/s |
128
+ | `nk_jaccards_packed_u1_neon` | 266 gso/s, 8.6 ulp | 416 gso/s, 8.6 ulp | 488 gso/s, 8.6 ulp |
129
+ | `nk_jaccards_symmetric_u1_neon` | 129 gso/s, 8.5 ulp | 242 gso/s, 8.5 ulp | 294 gso/s, 8.5 ulp |
130
+ | `nk_hammings_packed_u1_smebi32` | 1,420 gso/s | 2,928 gso/s | 4,027 gso/s |
131
+ | `nk_hammings_symmetric_u1_smebi32` | 629 gso/s | 1,438 gso/s | 1,111 gso/s |
132
+ | `nk_jaccards_packed_u1_smebi32` | 273 gso/s, 0 ulp | 1,381 gso/s, 0 ulp | 3,280 gso/s, 0 ulp |
133
+ | `nk_jaccards_symmetric_u1_smebi32` | 45.1 gso/s, 0 ulp | 267 gso/s, 0 ulp | 618 gso/s, 0 ulp |
134
+
135
+ #### WASM
136
+
137
+ Measured with Wasmtime v42 (Cranelift backend).
138
+
139
+ | Kernel | 256³ | 1024³ | 4096³ |
140
+ | :------------------------------------- | -----------------------: | -----------------------: | -----------------------: |
141
+ | __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
142
+ | `nk_hammings_packed_u1_serial` | 35.2 gso/s | 47.6 gso/s | 52.8 gso/s |
143
+ | `nk_hammings_symmetric_u1_serial` | 25.4 gso/s | 51.5 gso/s | 129 gso/s |
144
+ | `nk_jaccards_packed_u1_serial` | 30.9 gso/s, 0 ulp | 46.0 gso/s, 0 ulp | 52.7 gso/s, 0 ulp |
145
+ | `nk_jaccards_symmetric_u1_serial` | 22.8 gso/s, 0 ulp | 48.9 gso/s, 0 ulp | 123 gso/s, 0 ulp |
146
+ | `nk_hammings_packed_u1_v128relaxed` | 102 gso/s | 144 gso/s | 160 gso/s |
147
+ | `nk_hammings_symmetric_u1_v128relaxed` | 28.2 gso/s | 61.7 gso/s | 175 gso/s |
148
+ | `nk_jaccards_packed_u1_v128relaxed` | 91.2 gso/s, 0 ulp | 140 gso/s, 0 ulp | 172 gso/s, 0 ulp |
149
+ | `nk_jaccards_symmetric_u1_v128relaxed` | 26.9 gso/s, 0 ulp | 60.3 gso/s, 0 ulp | 177 gso/s, 0 ulp |
@@ -0,0 +1,63 @@
1
+ /**
2
+ * @brief Batched Set Operations for Haswell (AVX2).
3
+ * @file include/numkong/sets/haswell.h
4
+ * @author Ash Vardanian
5
+ * @date February 23, 2026
6
+ *
7
+ * @sa include/numkong/sets.h
8
+ */
9
+ #ifndef NK_SETS_HASWELL_H
10
+ #define NK_SETS_HASWELL_H
11
+
12
+ #if NK_TARGET_X86_
13
+ #if NK_TARGET_HASWELL
14
+
15
+ #include "numkong/set/haswell.h"
16
+ #include "numkong/dots/haswell.h"
17
+
18
+ #if defined(__cplusplus)
19
+ extern "C" {
20
+ #endif
21
+
22
+ #if defined(__clang__)
23
+ #pragma clang attribute push(__attribute__((target("avx2,f16c,fma,bmi,bmi2,popcnt"))), apply_to = function)
24
+ #elif defined(__GNUC__)
25
+ #pragma GCC push_options
26
+ #pragma GCC target("avx2", "f16c", "fma", "bmi", "bmi2", "popcnt")
27
+ #endif
28
+
29
+ nk_define_cross_normalized_packed_(hamming, u1, haswell, u1x8, u1x8, u32, /*norm_value_type=*/u32, u32, nk_b128_vec_t, // packed Hamming instantiation for u1 on Haswell; the generator macro is not defined in this header
30
+ nk_dots_packed_u1_haswell, nk_hamming_u32x4_from_dot_haswell_, // presumably the packed dot kernel and the dot -> Hamming step - TODO confirm against the macro
31
+ nk_dots_reduce_sum_u1_, nk_load_b128_haswell_, nk_partial_load_b32x4_haswell_, // presumably the norm reduction, then full and partial loads - TODO confirm
32
+ nk_store_b128_haswell_, nk_partial_store_b32x4_haswell_, /*dimensions_per_value=*/8) // full and partial stores (by name); 8 dimensions per stored value per the inline label
33
+
34
+ nk_define_cross_normalized_packed_(jaccard, u1, haswell, u1x8, u1x8, u32, /*norm_value_type=*/u32, f32, nk_b128_vec_t, // packed Jaccard instantiation for u1 on Haswell; the f32 argument presumably is the result type (matches the f32x4 finalizer) - TODO confirm
35
+ nk_dots_packed_u1_haswell, nk_jaccard_f32x4_from_dot_haswell_, // same dot kernel as the Hamming instantiation; Jaccard-specific finalizer
36
+ nk_dots_reduce_sum_u1_, nk_load_b128_haswell_, nk_partial_load_b32x4_haswell_, // presumably the norm reduction, then full and partial loads - TODO confirm
37
+ nk_store_b128_haswell_, nk_partial_store_b32x4_haswell_, /*dimensions_per_value=*/8) // full and partial stores (by name); 8 dimensions per stored value per the inline label
38
+
39
+ nk_define_cross_normalized_symmetric_(hamming, u1, haswell, u1x8, u32, /*norm_value_type=*/u32, u32, nk_b128_vec_t, // symmetric Hamming instantiation for u1 on Haswell; takes one u1x8 argument where the packed form takes two
40
+ nk_dots_symmetric_u1_haswell, nk_hamming_u32x4_from_dot_haswell_, // presumably the symmetric dot kernel and the dot -> Hamming step - TODO confirm against the macro
41
+ nk_dots_reduce_sum_u1_, nk_load_b128_haswell_, nk_partial_load_b32x4_haswell_, // presumably the norm reduction, then full and partial loads - TODO confirm
42
+ nk_store_b128_haswell_, nk_partial_store_b32x4_haswell_, // full and partial stores (by name)
43
+ /*dimensions_per_value=*/8) // 8 dimensions per stored value per the inline label
44
+
45
+ nk_define_cross_normalized_symmetric_(jaccard, u1, haswell, u1x8, u32, /*norm_value_type=*/u32, f32, nk_b128_vec_t, // symmetric Jaccard instantiation for u1 on Haswell; the f32 argument presumably is the result type - TODO confirm
46
+ nk_dots_symmetric_u1_haswell, nk_jaccard_f32x4_from_dot_haswell_, // same symmetric dot kernel as the Hamming instantiation; Jaccard-specific finalizer
47
+ nk_dots_reduce_sum_u1_, nk_load_b128_haswell_, nk_partial_load_b32x4_haswell_, // presumably the norm reduction, then full and partial loads - TODO confirm
48
+ nk_store_b128_haswell_, nk_partial_store_b32x4_haswell_, // full and partial stores (by name)
49
+ /*dimensions_per_value=*/8) // 8 dimensions per stored value per the inline label
50
+
51
+ #if defined(__clang__)
52
+ #pragma clang attribute pop
53
+ #elif defined(__GNUC__)
54
+ #pragma GCC pop_options
55
+ #endif
56
+
57
+ #if defined(__cplusplus)
58
+ } // extern "C"
59
+ #endif
60
+
61
+ #endif // NK_TARGET_HASWELL
62
+ #endif // NK_TARGET_X86_
63
+ #endif // NK_SETS_HASWELL_H
@@ -0,0 +1,66 @@
1
+ /**
2
+ * @brief Batched Set Operations for Ice Lake (AVX-512 VNNI/VBMI).
3
+ * @file include/numkong/sets/icelake.h
4
+ * @author Ash Vardanian
5
+ * @date February 23, 2026
6
+ *
7
+ * @sa include/numkong/sets.h
8
+ */
9
+ #ifndef NK_SETS_ICELAKE_H
10
+ #define NK_SETS_ICELAKE_H
11
+
12
+ #if NK_TARGET_X86_
13
+ #if NK_TARGET_ICELAKE
14
+
15
+ #include "numkong/set/icelake.h"
16
+ #include "numkong/dots/icelake.h"
17
+
18
+ #if defined(__cplusplus)
19
+ extern "C" {
20
+ #endif
21
+
22
+ #if defined(__clang__)
23
+ #pragma clang attribute push( \
24
+ __attribute__((target("avx2,avx512f,avx512vl,avx512bw,avx512dq,avx512vnni,avx512vpopcntdq,f16c,fma,bmi,bmi2"))), \
25
+ apply_to = function)
26
+ #elif defined(__GNUC__)
27
+ #pragma GCC push_options
28
+ #pragma GCC target("avx2", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vnni", "avx512vpopcntdq", "f16c", \
29
+ "fma", "bmi", "bmi2")
30
+ #endif
31
+
32
+ nk_define_cross_normalized_packed_(hamming, u1, icelake, u1x8, u1x8, u32, /*norm_value_type=*/u32, u32, nk_b128_vec_t, // packed Hamming instantiation for u1 on Ice Lake; the generator macro is not defined in this header
33
+ nk_dots_packed_u1_icelake, nk_hamming_u32x4_from_dot_icelake_, // presumably the packed dot kernel and the dot -> Hamming step - TODO confirm against the macro
34
+ nk_dots_reduce_sum_u1_, nk_load_b128_haswell_, nk_partial_load_b32x4_skylake_, // NOTE(review): full load is the Haswell helper, partial load is the Skylake one - presumably intentional, confirm
35
+ nk_store_b128_haswell_, nk_partial_store_b32x4_skylake_, /*dimensions_per_value=*/8) // same Haswell/Skylake split for the stores; 8 dimensions per stored value per the inline label
36
+
37
+ nk_define_cross_normalized_packed_(jaccard, u1, icelake, u1x8, u1x8, u32, /*norm_value_type=*/u32, f32, nk_b128_vec_t, // packed Jaccard instantiation for u1 on Ice Lake; the f32 argument presumably is the result type - TODO confirm
38
+ nk_dots_packed_u1_icelake, nk_jaccard_f32x4_from_dot_icelake_, // same dot kernel as the Hamming instantiation; Jaccard-specific finalizer
39
+ nk_dots_reduce_sum_u1_, nk_load_b128_haswell_, nk_partial_load_b32x4_skylake_, // NOTE(review): full load is the Haswell helper, partial load is the Skylake one - presumably intentional, confirm
40
+ nk_store_b128_haswell_, nk_partial_store_b32x4_skylake_, /*dimensions_per_value=*/8) // same Haswell/Skylake split for the stores; 8 dimensions per stored value per the inline label
41
+
42
+ nk_define_cross_normalized_symmetric_(hamming, u1, icelake, u1x8, u32, /*norm_value_type=*/u32, u32, nk_b128_vec_t, // symmetric Hamming instantiation for u1 on Ice Lake; takes one u1x8 argument where the packed form takes two
43
+ nk_dots_symmetric_u1_icelake, nk_hamming_u32x4_from_dot_icelake_, // presumably the symmetric dot kernel and the dot -> Hamming step - TODO confirm against the macro
44
+ nk_dots_reduce_sum_u1_, nk_load_b128_haswell_, nk_partial_load_b32x4_skylake_, // NOTE(review): full load is the Haswell helper, partial load is the Skylake one - presumably intentional, confirm
45
+ nk_store_b128_haswell_, nk_partial_store_b32x4_skylake_, // same Haswell/Skylake split for the stores
46
+ /*dimensions_per_value=*/8) // 8 dimensions per stored value per the inline label
47
+
48
+ nk_define_cross_normalized_symmetric_(jaccard, u1, icelake, u1x8, u32, /*norm_value_type=*/u32, f32, nk_b128_vec_t, // symmetric Jaccard instantiation for u1 on Ice Lake; the f32 argument presumably is the result type - TODO confirm
49
+ nk_dots_symmetric_u1_icelake, nk_jaccard_f32x4_from_dot_icelake_, // same symmetric dot kernel as the Hamming instantiation; Jaccard-specific finalizer
50
+ nk_dots_reduce_sum_u1_, nk_load_b128_haswell_, nk_partial_load_b32x4_skylake_, // NOTE(review): full load is the Haswell helper, partial load is the Skylake one - presumably intentional, confirm
51
+ nk_store_b128_haswell_, nk_partial_store_b32x4_skylake_, // same Haswell/Skylake split for the stores
52
+ /*dimensions_per_value=*/8) // 8 dimensions per stored value per the inline label
53
+
54
+ #if defined(__clang__)
55
+ #pragma clang attribute pop
56
+ #elif defined(__GNUC__)
57
+ #pragma GCC pop_options
58
+ #endif
59
+
60
+ #if defined(__cplusplus)
61
+ } // extern "C"
62
+ #endif
63
+
64
+ #endif // NK_TARGET_ICELAKE
65
+ #endif // NK_TARGET_X86_
66
+ #endif // NK_SETS_ICELAKE_H
@@ -0,0 +1,61 @@
1
+ /**
2
+ * @brief Batched Set Operations for NEON.
3
+ * @file include/numkong/sets/neon.h
4
+ * @author Ash Vardanian
5
+ * @date February 23, 2026
6
+ *
7
+ * @sa include/numkong/sets.h
8
+ */
9
+ #ifndef NK_SETS_NEON_H
10
+ #define NK_SETS_NEON_H
11
+
12
+ #if NK_TARGET_ARM_
13
+ #if NK_TARGET_NEON
14
+
15
+ #include "numkong/set/neon.h"
16
+ #include "numkong/dots/neon.h"
17
+
18
+ #if defined(__cplusplus)
19
+ extern "C" {
20
+ #endif
21
+
22
+ #if defined(__clang__)
23
+ #pragma clang attribute push(__attribute__((target("arch=armv8-a+simd"))), apply_to = function)
24
+ #elif defined(__GNUC__)
25
+ #pragma GCC push_options
26
+ #pragma GCC target("arch=armv8-a+simd")
27
+ #endif
28
+
29
+ nk_define_cross_normalized_packed_(hamming, u1, neon, u1x8, u1x8, u32, /*norm_value_type=*/u32, u32, nk_b128_vec_t, // packed Hamming instantiation for u1 on NEON; the generator macro is not defined in this header
30
+ nk_dots_packed_u1_neon, nk_hamming_u32x4_from_dot_neon_, nk_dots_reduce_sum_u1_, // presumably dot kernel, dot -> Hamming step, and norm reduction - TODO confirm against the macro
31
+ nk_load_b128_serial_, nk_partial_load_b32x4_serial_, nk_store_b128_serial_, // load/store helpers are the `_serial_` ones, not NEON-specific variants
32
+ nk_partial_store_b32x4_serial_, /*dimensions_per_value=*/8) // 8 dimensions per stored value per the inline label
33
+
34
+ nk_define_cross_normalized_packed_(jaccard, u1, neon, u1x8, u1x8, u32, /*norm_value_type=*/u32, f32, nk_b128_vec_t, // packed Jaccard instantiation for u1 on NEON; the f32 argument presumably is the result type - TODO confirm
35
+ nk_dots_packed_u1_neon, nk_jaccard_f32x4_from_dot_neon_, nk_dots_reduce_sum_u1_, // same dot kernel as the Hamming instantiation; Jaccard-specific finalizer
36
+ nk_load_b128_serial_, nk_partial_load_b32x4_serial_, nk_store_b128_serial_, // load/store helpers are the `_serial_` ones, not NEON-specific variants
37
+ nk_partial_store_b32x4_serial_, /*dimensions_per_value=*/8) // 8 dimensions per stored value per the inline label
38
+
39
+ nk_define_cross_normalized_symmetric_(hamming, u1, neon, u1x8, u32, /*norm_value_type=*/u32, u32, nk_b128_vec_t, // symmetric Hamming instantiation for u1 on NEON; takes one u1x8 argument where the packed form takes two
40
+ nk_dots_symmetric_u1_neon, nk_hamming_u32x4_from_dot_neon_, // presumably the symmetric dot kernel and the dot -> Hamming step - TODO confirm against the macro
41
+ nk_dots_reduce_sum_u1_, nk_load_b128_serial_, nk_partial_load_b32x4_serial_, // load helpers are the `_serial_` ones, not NEON-specific variants
42
+ nk_store_b128_serial_, nk_partial_store_b32x4_serial_, /*dimensions_per_value=*/8) // `_serial_` stores as well; 8 dimensions per stored value per the inline label
43
+
44
+ nk_define_cross_normalized_symmetric_(jaccard, u1, neon, u1x8, u32, /*norm_value_type=*/u32, f32, nk_b128_vec_t, // symmetric Jaccard instantiation for u1 on NEON; the f32 argument presumably is the result type - TODO confirm
45
+ nk_dots_symmetric_u1_neon, nk_jaccard_f32x4_from_dot_neon_, // same symmetric dot kernel as the Hamming instantiation; Jaccard-specific finalizer
46
+ nk_dots_reduce_sum_u1_, nk_load_b128_serial_, nk_partial_load_b32x4_serial_, // load helpers are the `_serial_` ones, not NEON-specific variants
47
+ nk_store_b128_serial_, nk_partial_store_b32x4_serial_, /*dimensions_per_value=*/8) // `_serial_` stores as well; 8 dimensions per stored value per the inline label
48
+
49
+ #if defined(__clang__)
50
+ #pragma clang attribute pop
51
+ #elif defined(__GNUC__)
52
+ #pragma GCC pop_options
53
+ #endif
54
+
55
+ #if defined(__cplusplus)
56
+ } // extern "C"
57
+ #endif
58
+
59
+ #endif // NK_TARGET_NEON
60
+ #endif // NK_TARGET_ARM_
61
+ #endif // NK_SETS_NEON_H
@@ -0,0 +1,43 @@
1
+ /**
2
+ * @brief Batched Set Operations for Serial (non-SIMD) Backends.
3
+ * @file include/numkong/sets/serial.h
4
+ * @author Ash Vardanian
5
+ * @date February 23, 2026
6
+ *
7
+ * @sa include/numkong/sets.h
8
+ */
9
+ #ifndef NK_SETS_SERIAL_H
10
+ #define NK_SETS_SERIAL_H
11
+
12
+ #include "numkong/set/serial.h"
13
+ #include "numkong/dots/serial.h"
14
+
15
+ #if defined(__cplusplus)
16
+ extern "C" {
17
+ #endif
18
+
19
+ nk_define_cross_normalized_packed_(hamming, u1, serial, u1x8, u1x8, u32, /*norm_value_type=*/u32, u32, nk_b128_vec_t, // packed Hamming instantiation for u1, serial backend; the generator macro is not defined in this header
20
+ nk_dots_packed_u1_serial, nk_hamming_u32x4_from_dot_serial_, nk_dots_reduce_sum_u1_, // presumably dot kernel, dot -> Hamming step, and norm reduction - TODO confirm against the macro
21
+ nk_load_b128_serial_, nk_partial_load_b32x4_serial_, nk_store_b128_serial_, // full load, partial load, full store (by name)
22
+ nk_partial_store_b32x4_serial_, /*dimensions_per_value=*/8) // partial store (by name); 8 dimensions per stored value per the inline label
23
+
24
+ nk_define_cross_normalized_packed_(jaccard, u1, serial, u1x8, u1x8, u32, /*norm_value_type=*/u32, f32, nk_b128_vec_t, // packed Jaccard instantiation for u1, serial backend; the f32 argument presumably is the result type - TODO confirm
25
+ nk_dots_packed_u1_serial, nk_jaccard_f32x4_from_dot_serial_, nk_dots_reduce_sum_u1_, // same dot kernel as the Hamming instantiation; Jaccard-specific finalizer
26
+ nk_load_b128_serial_, nk_partial_load_b32x4_serial_, nk_store_b128_serial_, // full load, partial load, full store (by name)
27
+ nk_partial_store_b32x4_serial_, /*dimensions_per_value=*/8) // partial store (by name); 8 dimensions per stored value per the inline label
28
+
29
+ nk_define_cross_normalized_symmetric_(hamming, u1, serial, u1x8, u32, /*norm_value_type=*/u32, u32, nk_b128_vec_t, // symmetric Hamming instantiation for u1, serial backend; takes one u1x8 argument where the packed form takes two
30
+ nk_dots_symmetric_u1_serial, nk_hamming_u32x4_from_dot_serial_, // presumably the symmetric dot kernel and the dot -> Hamming step - TODO confirm against the macro
31
+ nk_dots_reduce_sum_u1_, nk_load_b128_serial_, nk_partial_load_b32x4_serial_, // presumably the norm reduction, then full and partial loads - TODO confirm
32
+ nk_store_b128_serial_, nk_partial_store_b32x4_serial_, /*dimensions_per_value=*/8) // full and partial stores (by name); 8 dimensions per stored value per the inline label
33
+
34
+ nk_define_cross_normalized_symmetric_(jaccard, u1, serial, u1x8, u32, /*norm_value_type=*/u32, f32, nk_b128_vec_t, // symmetric Jaccard instantiation for u1, serial backend; the f32 argument presumably is the result type - TODO confirm
35
+ nk_dots_symmetric_u1_serial, nk_jaccard_f32x4_from_dot_serial_, // same symmetric dot kernel as the Hamming instantiation; Jaccard-specific finalizer
36
+ nk_dots_reduce_sum_u1_, nk_load_b128_serial_, nk_partial_load_b32x4_serial_, // presumably the norm reduction, then full and partial loads - TODO confirm
37
+ nk_store_b128_serial_, nk_partial_store_b32x4_serial_, /*dimensions_per_value=*/8) // full and partial stores (by name); 8 dimensions per stored value per the inline label
38
+
39
+ #if defined(__cplusplus)
40
+ } // extern "C"
41
+ #endif
42
+
43
+ #endif // NK_SETS_SERIAL_H