numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,179 @@
1
+ # Set Similarity Measures in NumKong
2
+
3
+ NumKong implements set similarity functions for binary and integer vectors: Hamming distance measures the number of differing elements, while Jaccard distance measures the complement of the intersection-over-union ratio.
4
+ These are fundamental to locality-sensitive hashing, MinHash sketches, and binary feature matching.
5
+
6
+ Hamming distance counts the number of positions where elements differ.
7
+ For binary vectors packed as octets, this is the popcount of the XOR.
8
+ For byte-level vectors, it counts the number of mismatched bytes:
9
+
10
+ ```math
11
+ \text{hamming}(a, b) = \sum_{i=0}^{n-1} [a_i \neq b_i]
12
+ ```
13
+
14
+ Jaccard distance measures the dissimilarity of two sets.
15
+ For binary vectors, the intersection and union are computed via bitwise AND and OR with popcount:
16
+
17
+ ```math
18
+ \text{jaccard}(a, b) = 1 - \frac{|A \cap B|}{|A \cup B|} = 1 - \frac{\text{popcount}(a \mathbin{\&} b)}{\text{popcount}(a \mathbin{|} b)}
19
+ ```
20
+
21
+ For word-level vectors (MinHash signatures), Jaccard distance is one minus the fraction of matching elements:
22
+
23
+ ```math
24
+ \text{jaccard}(a, b) = 1 - \frac{\sum_{i=0}^{n-1} [a_i = b_i]}{n}
25
+ ```
26
+
27
+ Reformulating as Python pseudocode:
28
+
29
+ ```python
30
+ import numpy as np
31
+
32
+ def hamming_bits(a: np.ndarray, b: np.ndarray) -> int:
33
+ return np.unpackbits(np.bitwise_xor(a, b)).sum()
34
+
35
+ def jaccard_bits(a: np.ndarray, b: np.ndarray) -> float:
36
+ intersection = np.unpackbits(np.bitwise_and(a, b)).sum()
37
+ union = np.unpackbits(np.bitwise_or(a, b)).sum()
38
+ return 1 - intersection / union if union else 0
39
+
40
+ def jaccard_words(a: np.ndarray, b: np.ndarray) -> float:
41
+ return 1 - np.mean(a == b)
42
+ ```
43
+
44
+ ## Input & Output Types
45
+
46
+ | Input Type | Output Type | Description |
47
+ | ---------- | ----------- | ------------------------------------------- |
48
+ | `u1` | `u32` | Binary Hamming distance, packed octets |
49
+ | `u1` | `f32` | Binary Jaccard distance, packed octets |
50
+ | `u8` | `u32` | Byte-level Hamming distance |
51
+ | `u16` | `f32` | Word-level Jaccard distance, 16-bit MinHash |
52
+ | `u32` | `f32` | Word-level Jaccard distance, 32-bit MinHash |
53
+
54
+ ## Optimizations
55
+
56
+ ### Harley-Seal Carry-Save Adders for U1
57
+
58
+ `nk_hamming_u1_haswell`, `nk_jaccard_u1_haswell` amortize the cost of popcount by using Harley-Seal carry-save adder trees.
59
+ Instead of computing popcount on every XOR/AND/OR result independently, three intermediate values are combined through a full-adder circuit:
60
+
61
+ ```
62
+ ones = a ^ b ^ c
63
+ twos = (a & b) | (c & (a ^ b))
64
+ ```
65
+
66
+ This circuit takes three input words and produces `ones` and `twos` accumulators, where each bit in `twos` carries double the weight of a bit in `ones`.
67
+ By chaining two levels, a fours accumulator is also produced, so the actual `VPSHUFB`-based popcount is called only on the final accumulated ones, twos, and fours values.
68
+ The total number of popcount operations is reduced by roughly a factor of three compared to computing popcount on every vector independently.
69
+
70
+ ### Native VPOPCNTQ on Ice Lake
71
+
72
+ `nk_hamming_u1_icelake`, `nk_jaccard_u1_icelake` use `VPOPCNTQ` on 512-bit vectors, which directly produces per-quadword population counts for 8 quadwords at once.
73
+ This single instruction replaces the entire nibble-LUT + Harley-Seal pipeline used on Haswell.
74
+ The kernels batch 16 vectors before horizontal reduction to minimize `VPSADBW` overhead, accumulating the per-quadword counts into a running total via `VPADDQ`.
75
+
76
+ ### Jaccard via Precomputed Norms
77
+
78
+ `nk_jaccard_u1_haswell`, `nk_jaccard_u1_icelake` exploit the identity $|A \cup B| = |A| + |B| - |A \cap B|$ to avoid computing both AND-popcount and OR-popcount in the inner loop.
79
+ When vector norms (popcount of each vector) are precomputed and passed via the streaming API, only the intersection popcount is needed per pair, halving the work in the critical path.
80
+
81
+ ### Byte Hamming via VPSADBW
82
+
83
+ `nk_hamming_u8_haswell`, `nk_hamming_u8_icelake` compute byte-level Hamming distance using XOR to produce per-byte difference indicators, then `VPSADBW` against zero to horizontally sum the nonzero bytes.
84
+ XOR produces 0 for equal bytes and nonzero for different ones, and `VPSADBW` sums the absolute values of byte differences within each 64-bit lane.
85
+ Since XOR results are either 0 or nonzero (not necessarily 1), the kernel masks XOR output through `VPMIN` with a vector of ones to clamp each byte to 0 or 1 before feeding `VPSADBW`.
86
+
87
+ ## Performance
88
+
89
+ The following performance tables are produced by manually re-running the bundled internal tools `nk_test` and `nk_bench` to measure both accuracy and throughput at different input shapes.
90
+ The input size is controlled by the `NK_DENSE_DIMENSIONS` environment variable and set to 256, 1024, and 4096 elements.
91
+ The throughput is measured in GB/s as the number of input bytes per second.
92
+ Accuracy is reported where applicable as exact distance in the result representation; floating-point Jaccard rows are shown as mean ULP (units in last place).
93
+ Each kernel runs for at least 20 seconds per configuration.
94
+ Benchmark threads are pinned to specific cores; on machines with heterogeneous core types (e.g., Apple P/E cores), only the fastest cores are used.
95
+ Workloads that significantly degrade CPU frequencies (Intel AMX, Apple SME) run in separate passes to avoid affecting throughput measurements of other kernels.
96
+
97
+ ### Intel Sapphire Rapids
98
+
99
+ #### Native
100
+
101
+ | Kernel | 256 | 1024 | 4096 |
102
+ | :----------------------- | -----------------------: | -----------------------: | -----------------------: |
103
+ | __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
104
+ | `nk_hamming_u1_serial` | 2.30 gb/s | 2.62 gb/s | 2.54 gb/s |
105
+ | `nk_jaccard_u1_serial` | 1.35 gb/s, 0 ulp | 1.46 gb/s, 0 ulp | 1.50 gb/s, 0 ulp |
106
+ | `nk_hamming_u1_haswell` | 9.63 gb/s | 25.2 gb/s | 56.2 gb/s |
107
+ | `nk_jaccard_u1_haswell` | 5.24 gb/s, 0 ulp | 15.5 gb/s, 0 ulp | 27.0 gb/s, 0 ulp |
108
+ | `nk_hamming_u1_icelake` | 11.2 gb/s | 38.2 gb/s | 56.1 gb/s |
109
+ | `nk_jaccard_u1_icelake` | 6.46 gb/s, 0 ulp | 22.4 gb/s, 0 ulp | 33.3 gb/s, 0 ulp |
110
+ | __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
111
+ | `nk_hamming_u8_serial` | 15.0 gb/s | 14.9 gb/s | 14.8 gb/s |
112
+ | `nk_hamming_u8_haswell` | 22.4 gb/s | 21.6 gb/s | 17.9 gb/s |
113
+ | `nk_hamming_u8_icelake` | 55.2 gb/s | 37.7 gb/s | 24.3 gb/s |
114
+ | __u16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
115
+ | `nk_jaccard_u16_serial` | 27.8 gb/s, 0 ulp | 23.0 gb/s, 0 ulp | 19.2 gb/s, 0 ulp |
116
+ | `nk_jaccard_u16_haswell` | 22.2 gb/s, 0 ulp | 18.4 gb/s, 0 ulp | 13.7 gb/s, 0 ulp |
117
+ | `nk_jaccard_u16_icelake` | 54.2 gb/s, 0 ulp | 24.3 gb/s, 0 ulp | 20.9 gb/s, 0 ulp |
118
+ | __u32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
119
+ | `nk_jaccard_u32_serial` | 33.1 gb/s, 0 ulp | 23.5 gb/s, 0 ulp | 18.3 gb/s, 0 ulp |
120
+ | `nk_jaccard_u32_haswell` | 19.0 gb/s, 0 ulp | 16.9 gb/s, 0 ulp | 11.0 gb/s, 0 ulp |
121
+ | `nk_jaccard_u32_icelake` | 33.0 gb/s, 0 ulp | 24.6 gb/s, 0 ulp | 16.3 gb/s, 0 ulp |
122
+
123
+ #### WASM
124
+
125
+ Measured with Wasmtime v42 (Cranelift backend).
126
+
127
+ | Kernel | 256 | 1024 | 4096 |
128
+ | :--------------------------- | -----------------------: | -----------------------: | -----------------------: |
129
+ | __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
130
+ | `nk_hamming_u1_v128relaxed` | 0.138 gb/s | 0.149 gb/s | 0.979 gb/s |
131
+ | `nk_jaccard_u1_v128relaxed` | 0.153 gb/s, 0 ulp | 0.352 gb/s, 0 ulp | 2.50 gb/s, 0 ulp |
132
+ | __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
133
+ | `nk_hamming_u8_v128relaxed` | 0.370 gb/s | 0.400 gb/s | 2.19 gb/s |
134
+ | __u16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
135
+ | `nk_jaccard_u16_v128relaxed` | 2.30 gb/s, 0 ulp | 2.34 gb/s, 0 ulp | 0.381 gb/s, 0 ulp |
136
+ | __u32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
137
+ | `nk_jaccard_u32_v128relaxed` | 0.430 gb/s, 0 ulp | 2.46 gb/s, 0 ulp | 1.08 gb/s, 0 ulp |
138
+
139
+ ### Apple M4
140
+
141
+ #### Native
142
+
143
+ | Kernel | 256 | 1024 | 4096 |
144
+ | :---------------------- | -----------------------: | -----------------------: | -----------------------: |
145
+ | __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
146
+ | `nk_hamming_u1_serial` | 4.66 gb/s | 5.30 gb/s | 5.04 gb/s |
147
+ | `nk_jaccard_u1_serial` | 3.03 gb/s, 0 ulp | 3.72 gb/s, 0 ulp | 3.65 gb/s, 0 ulp |
148
+ | `nk_hamming_u1_neon` | 20.7 gb/s | 41.9 gb/s | 52.2 gb/s |
149
+ | `nk_jaccard_u1_neon` | 15.8 gb/s, 0 ulp | 29.5 gb/s, 0 ulp | 34.8 gb/s, 0 ulp |
150
+ | __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
151
+ | `nk_hamming_u8_serial` | 20.7 gb/s | 21.9 gb/s | 18.1 gb/s |
152
+ | `nk_hamming_u8_neon` | 49.1 gb/s | 43.9 gb/s | 32.5 gb/s |
153
+ | __u16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
154
+ | `nk_jaccard_u16_serial` | 42.5 gb/s, 0 ulp | 39.7 gb/s, 0 ulp | 36.1 gb/s, 0 ulp |
155
+ | `nk_jaccard_u16_neon` | 43.3 gb/s, 0 ulp | 33.0 gb/s, 0 ulp | 29.2 gb/s, 0 ulp |
156
+ | __u32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
157
+ | `nk_jaccard_u32_serial` | 60.6 gb/s, 0 ulp | 49.0 gb/s, 0 ulp | 51.2 gb/s, 0 ulp |
158
+ | `nk_jaccard_u32_neon` | 51.0 gb/s, 0 ulp | 39.9 gb/s, 0 ulp | 38.9 gb/s, 0 ulp |
159
+
160
+ #### WASM
161
+
162
+ Measured with Wasmtime v42 (Cranelift backend).
163
+
164
+ | Kernel | 256 | 1024 | 4096 |
165
+ | :--------------------------- | -----------------------: | -----------------------: | -----------------------: |
166
+ | __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
167
+ | `nk_hamming_u1_serial` | 0.501 gb/s | 0.00424 gb/s | 0.0443 gb/s |
168
+ | `nk_jaccard_u1_serial` | 0.315 gb/s, 0 ulp | 0.362 gb/s, 0 ulp | 0.382 gb/s, 0 ulp |
169
+ | `nk_hamming_u1_v128relaxed` | 0.414 gb/s | 0.0294 gb/s | 0.233 gb/s |
170
+ | `nk_jaccard_u1_v128relaxed` | 0.0141 gb/s, 0 ulp | 0.317 gb/s, 0 ulp | 0.249 gb/s, 0 ulp |
171
+ | __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
172
+ | `nk_hamming_u8_serial` | 0.551 gb/s | 0.352 gb/s | 0.154 gb/s |
173
+ | `nk_hamming_u8_v128relaxed` | 0.702 gb/s | 0.409 gb/s | 0.464 gb/s |
174
+ | __u16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
175
+ | `nk_jaccard_u16_serial` | 0.647 gb/s, 0 ulp | 0.362 gb/s, 0 ulp | 0.174 gb/s, 0 ulp |
176
+ | `nk_jaccard_u16_v128relaxed` | 0.409 gb/s, 0 ulp | 0.00109 gb/s, 0 ulp | 0.275 gb/s, 0 ulp |
177
+ | __u32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
178
+ | `nk_jaccard_u32_serial` | 0.320 gb/s, 0 ulp | 0.161 gb/s, 0 ulp | 0.607 gb/s, 0 ulp |
179
+ | `nk_jaccard_u32_v128relaxed` | 0.397 gb/s, 0 ulp | 0.364 gb/s, 0 ulp | 0.0807 gb/s, 0 ulp |
@@ -0,0 +1,334 @@
1
+ /**
2
+ * @brief SIMD-accelerated Set Similarity Measures for Haswell.
3
+ * @file include/numkong/set/haswell.h
4
+ * @author Ash Vardanian
5
+ * @date December 27, 2025
6
+ *
7
+ * @sa include/numkong/set.h
8
+ *
9
+ * @section set_haswell_instructions Key POPCNT/AVX2 Set Instructions
10
+ *
11
+ * Intrinsic Instruction Latency Throughput Ports
12
+ * _mm_popcnt_u64 POPCNT (R64, R64) 3cy 1/cy p1
13
+ * _mm256_and_si256 VPAND (YMM, YMM, YMM) 1cy 0.33/cy p015
14
+ * _mm256_or_si256 VPOR (YMM, YMM, YMM) 1cy 0.33/cy p015
15
+ * _mm256_xor_si256 VPXOR (YMM, YMM, YMM) 1cy 0.33/cy p015
16
+ * _mm256_extracti128_si256 VEXTRACTI128 (XMM, YMM, I8) 3cy 1/cy p5
17
+ *
18
+ * Haswell lacks SIMD popcount; we extract 64-bit words and use scalar POPCNT. The p1 port
19
+ * bottleneck limits throughput to 1 popcount/cycle. For Hamming distance, XOR + POPCNT;
20
+ * for Jaccard, compute AND/OR + POPCNT separately to get intersection and union counts.
21
+ *
22
+ * @section set_haswell_stateful Stateful Streaming Logic
23
+ *
24
+ * To build memory-optimal tiled algorithms, this file defines:
25
+ *
26
+ * - nk_hamming_u1x64_state_haswell_t for streaming Hamming distance
27
+ * - nk_jaccard_u1x64_state_haswell_t for streaming Jaccard similarity
28
+ *
29
+ * @code{c}
30
+ * nk_jaccard_u1x64_state_haswell_t state_first, state_second, state_third, state_fourth;
31
+ * nk_jaccard_u1x64_init_haswell(&state_first);
32
+ * // ... stream through packed binary vectors ...
33
+ * nk_jaccard_u1x64_finalize_haswell(&state_first, &state_second, &state_third, &state_fourth,
34
+ * query_popcount, target_popcount_a, target_popcount_b, target_popcount_c, target_popcount_d,
35
+ * total_dimensions, &results);
36
+ * @endcode
37
+ */
38
+ #ifndef NK_SET_HASWELL_H
39
+ #define NK_SET_HASWELL_H
40
+
41
+ #if NK_TARGET_X86_
42
+ #if NK_TARGET_HASWELL
43
+
44
+ #include "numkong/types.h"
45
+ #include "numkong/set/serial.h" // `nk_u1x8_popcount_`
46
+
47
+ #if defined(__cplusplus)
48
+ extern "C" {
49
+ #endif
50
+
51
+ #if defined(__clang__)
52
+ #pragma clang attribute push(__attribute__((target("avx2,sse4.1,popcnt"))), apply_to = function)
53
+ #elif defined(__GNUC__)
54
+ #pragma GCC push_options
55
+ #pragma GCC target("avx2", "sse4.1", "popcnt")
56
+ #endif
57
+
58
+ #pragma region - Binary Sets
59
+
60
+ NK_PUBLIC void nk_hamming_u1_haswell(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result) {
61
+ nk_size_t n_bytes = nk_size_divide_round_up_(n, NK_BITS_PER_BYTE);
62
+ // x86 supports unaligned loads and works just fine with the scalar version for small vectors.
63
+ nk_u32_t differences = 0;
64
+ for (; n_bytes >= 8; n_bytes -= 8, a += 8, b += 8)
65
+ differences += _mm_popcnt_u64(*(nk_u64_t const *)a ^ *(nk_u64_t const *)b);
66
+ for (; n_bytes; --n_bytes, ++a, ++b) differences += _mm_popcnt_u32(*a ^ *b);
67
+ *result = differences;
68
+ }
69
+
70
+ NK_PUBLIC void nk_jaccard_u1_haswell(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_f32_t *result) {
71
+ nk_size_t n_bytes = nk_size_divide_round_up_(n, NK_BITS_PER_BYTE);
72
+ // x86 supports unaligned loads and works just fine with the scalar version for small vectors.
73
+ nk_u32_t intersection_count = 0, union_count = 0;
74
+ for (; n_bytes >= 8; n_bytes -= 8, a += 8, b += 8)
75
+ intersection_count += (nk_u32_t)_mm_popcnt_u64(*(nk_u64_t const *)a & *(nk_u64_t const *)b),
76
+ union_count += (nk_u32_t)_mm_popcnt_u64(*(nk_u64_t const *)a | *(nk_u64_t const *)b);
77
+ for (; n_bytes; --n_bytes, ++a, ++b)
78
+ intersection_count += nk_u1x8_popcount_(*a & *b), union_count += nk_u1x8_popcount_(*a | *b);
79
+ *result = (union_count != 0) ? 1.0f - (nk_f32_t)intersection_count / (nk_f32_t)union_count : 0.0f;
80
+ }
81
+
82
+ #pragma endregion - Binary Sets
83
+
84
+ #pragma region - Integer Sets
85
+
86
+ NK_PUBLIC void nk_jaccard_u32_haswell(nk_u32_t const *a, nk_u32_t const *b, nk_size_t n, nk_f32_t *result) {
87
+ nk_u32_t intersection_count = 0;
88
+ nk_size_t n_remaining = n;
89
+ for (; n_remaining >= 4; n_remaining -= 4, a += 4, b += 4) {
90
+ __m128i a_u32x4 = _mm_loadu_si128((__m128i const *)a);
91
+ __m128i b_u32x4 = _mm_loadu_si128((__m128i const *)b);
92
+ __m128i equality_u32x4 = _mm_cmpeq_epi32(a_u32x4, b_u32x4);
93
+ int equality_mask = _mm_movemask_ps(_mm_castsi128_ps(equality_u32x4));
94
+ intersection_count += (nk_u32_t)_mm_popcnt_u32((unsigned int)equality_mask);
95
+ }
96
+ for (; n_remaining; --n_remaining, ++a, ++b) intersection_count += (*a == *b);
97
+ *result = (n != 0) ? 1.0f - (nk_f32_t)intersection_count / (nk_f32_t)n : 0.0f;
98
+ }
99
+
100
+ NK_PUBLIC void nk_hamming_u8_haswell(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result) {
101
+ // Process 32 bytes at a time using AVX2 (256-bit registers).
102
+ // Compare bytes for equality, invert to get not-equal mask, then count mismatches.
103
+ //
104
+ // Haswell port analysis:
105
+ // - `_mm256_loadu_si256`: p23, 1cy latency (load)
106
+ // - `_mm256_cmpeq_epi8`: p015, 1cy latency, 0.33cy throughput
107
+ // - `_mm256_extracti128`: p5, 3cy latency, 1cy throughput
108
+ // - `_mm_popcnt_u64`: p1 ONLY, 3cy latency, 1cy throughput (BOTTLENECK)
109
+ //
110
+ // For counting mismatches, we XOR and popcount the resulting bits set to 1.
111
+ // Alternative: compare -> movemask -> popcount, but movemask only works per-byte MSBs.
112
+ // XOR approach: each differing byte produces 0xFF (8 bits set), need to count bytes not bits.
113
+
114
+ nk_u32_t differences = 0;
115
+ nk_size_t n_remaining = n;
116
+
117
+ // Main loop: process 32 bytes at a time
118
+ for (; n_remaining >= 32; n_remaining -= 32, a += 32, b += 32) {
119
+ __m256i a_u8x32 = _mm256_loadu_si256((__m256i const *)a);
120
+ __m256i b_u8x32 = _mm256_loadu_si256((__m256i const *)b);
121
+
122
+ // Compare for equality: 0xFF where equal, 0x00 where different
123
+ __m256i equality_u8x32 = _mm256_cmpeq_epi8(a_u8x32, b_u8x32);
124
+
125
+ // Extract to two 128-bit halves for movemask
126
+ // movemask extracts the MSB of each byte, giving us 16 bits per 128-bit half
127
+ __m128i equality_low_u8x16 = _mm256_castsi256_si128(equality_u8x32);
128
+ __m128i equality_high_u8x16 = _mm256_extracti128_si256(equality_u8x32, 1);
129
+
130
+ // Get masks: bit set = equal (0xFF MSB = 1), bit clear = different
131
+ int mask_low = _mm_movemask_epi8(equality_low_u8x16); // 16 bits
132
+ int mask_high = _mm_movemask_epi8(equality_high_u8x16); // 16 bits
133
+
134
+ // Invert to count differences (bit set = different)
135
+ // Then popcount to count mismatches
136
+ differences += (nk_u32_t)_mm_popcnt_u32((unsigned int)(~mask_low & 0xFFFF));
137
+ differences += (nk_u32_t)_mm_popcnt_u32((unsigned int)(~mask_high & 0xFFFF));
138
+ }
139
+
140
+ // Handle remaining bytes (0-31) with scalar code
141
+ for (; n_remaining; --n_remaining, ++a, ++b) differences += (*a != *b);
142
+
143
+ *result = differences;
144
+ }
145
+
146
+ NK_PUBLIC void nk_jaccard_u16_haswell(nk_u16_t const *a, nk_u16_t const *b, nk_size_t n, nk_f32_t *result) {
147
+ // Process 16 u16 values at a time using AVX2 (256-bit registers).
148
+ // Compare 16-bit integers for equality and count matches.
149
+ //
150
+ // Haswell port analysis:
151
+ // - `_mm256_loadu_si256`: p23, 1cy latency (load)
152
+ // - `_mm256_cmpeq_epi16`: p015, 1cy latency, 0.33cy throughput
153
+ // - `_mm256_packs_epi16`: p5, 1cy latency, 1cy throughput (pack 16->8 bit)
154
+ // - `_mm_movemask_epi8`: p0, 3cy latency (extracts MSB of each byte)
155
+ // - `_mm_popcnt_u32`: p1 ONLY, 3cy latency, 1cy throughput
156
+
157
+ nk_u32_t matches = 0;
158
+ nk_size_t n_remaining = n;
159
+
160
+ // Main loop: process 16 u16 values at a time
161
+ for (; n_remaining >= 16; n_remaining -= 16, a += 16, b += 16) {
162
+ __m256i a_u16x16 = _mm256_loadu_si256((__m256i const *)a);
163
+ __m256i b_u16x16 = _mm256_loadu_si256((__m256i const *)b);
164
+
165
+ // Compare for equality: 0xFFFF where equal, 0x0000 where different
166
+ __m256i equality_u16x16 = _mm256_cmpeq_epi16(a_u16x16, b_u16x16);
167
+
168
+ // Pack 16-bit results to 8-bit to use movemask efficiently.
169
+ // _mm256_packs_epi16 saturates signed 16-bit to signed 8-bit:
170
+ // 0xFFFF (-1) -> 0x80 (-128), 0x0000 (0) -> 0x00 (0)
171
+ // Note: packs interleaves lanes, so we need to handle the permutation.
172
+ // For counting, we just need the total popcount, so lane order doesn't matter.
173
+ __m256i packed_i8x32 = _mm256_packs_epi16(equality_u16x16, equality_u16x16);
174
+
175
+ // Extract to 128-bit halves
176
+ __m128i packed_low_i8x16 = _mm256_castsi256_si128(packed_i8x32);
177
+ __m128i packed_high_i8x16 = _mm256_extracti128_si256(packed_i8x32, 1);
178
+
179
+ // movemask extracts MSB of each byte
180
+ // After packs: 0x80 (MSB=1) for equal, 0x00 (MSB=0) for different
181
+ // Each 128-bit half has 8 relevant bytes (lower 8 from each original lane)
182
+ int mask_low = _mm_movemask_epi8(packed_low_i8x16) & 0xFF; // Lower 8 bytes
183
+ int mask_high = _mm_movemask_epi8(packed_high_i8x16) & 0xFF; // Lower 8 bytes from high lane
184
+
185
+ matches += (nk_u32_t)_mm_popcnt_u32((unsigned int)mask_low);
186
+ matches += (nk_u32_t)_mm_popcnt_u32((unsigned int)mask_high);
187
+ }
188
+
189
+ // Handle remaining elements (0-15) with scalar code
190
+ for (; n_remaining; --n_remaining, ++a, ++b) matches += (*a == *b);
191
+
192
+ *result = (n != 0) ? 1.0f - (nk_f32_t)matches / (nk_f32_t)n : 0.0f;
193
+ }
194
+
195
+ #pragma endregion - Integer Sets
196
+
197
+ #pragma region - Stateful Streaming
198
+
199
+ typedef struct nk_hamming_u1x64_state_haswell_t {
200
+ nk_u32_t intersection_count;
201
+ } nk_hamming_u1x64_state_haswell_t;
202
+
203
+ NK_INTERNAL void nk_hamming_u1x64_init_haswell(nk_hamming_u1x64_state_haswell_t *state) {
204
+ state->intersection_count = 0;
205
+ }
206
+
207
+ NK_INTERNAL void nk_hamming_u1x64_update_haswell(nk_hamming_u1x64_state_haswell_t *state, nk_b64_vec_t a,
208
+ nk_b64_vec_t b, nk_size_t depth_offset, nk_size_t active_dimensions) {
209
+ nk_unused_(depth_offset);
210
+ nk_unused_(active_dimensions);
211
+ state->intersection_count += (nk_u32_t)_mm_popcnt_u64(a.u64 ^ b.u64);
212
+ }
213
+
214
+ NK_INTERNAL void nk_hamming_u1x64_finalize_haswell( //
215
+ nk_hamming_u1x64_state_haswell_t const *state_a, nk_hamming_u1x64_state_haswell_t const *state_b,
216
+ nk_hamming_u1x64_state_haswell_t const *state_c, nk_hamming_u1x64_state_haswell_t const *state_d,
217
+ nk_size_t total_dimensions, nk_b128_vec_t *result) {
218
+ nk_unused_(total_dimensions);
219
+ result->u32s[0] = state_a->intersection_count;
220
+ result->u32s[1] = state_b->intersection_count;
221
+ result->u32s[2] = state_c->intersection_count;
222
+ result->u32s[3] = state_d->intersection_count;
223
+ }
224
+
225
+ typedef struct nk_jaccard_u1x64_state_haswell_t {
226
+ nk_u32_t intersection_count;
227
+ } nk_jaccard_u1x64_state_haswell_t;
228
+
229
+ NK_INTERNAL void nk_jaccard_u1x64_init_haswell(nk_jaccard_u1x64_state_haswell_t *state) {
230
+ state->intersection_count = 0;
231
+ }
232
+
233
+ NK_INTERNAL void nk_jaccard_u1x64_update_haswell(nk_jaccard_u1x64_state_haswell_t *state, nk_b64_vec_t a,
234
+ nk_b64_vec_t b, nk_size_t depth_offset, nk_size_t active_dimensions) {
235
+ nk_unused_(depth_offset);
236
+ nk_unused_(active_dimensions);
237
+ state->intersection_count += (nk_u32_t)_mm_popcnt_u64(a.u64 & b.u64);
238
+ }
239
+
240
+ NK_INTERNAL void nk_jaccard_u1x64_finalize_haswell( //
241
+ nk_jaccard_u1x64_state_haswell_t const *state_a, nk_jaccard_u1x64_state_haswell_t const *state_b,
242
+ nk_jaccard_u1x64_state_haswell_t const *state_c, nk_jaccard_u1x64_state_haswell_t const *state_d,
243
+ nk_f32_t query_popcount, nk_f32_t target_popcount_a, nk_f32_t target_popcount_b, nk_f32_t target_popcount_c,
244
+ nk_f32_t target_popcount_d, nk_size_t total_dimensions, nk_b128_vec_t *result) {
245
+ nk_unused_(total_dimensions);
246
+
247
+ // 4-way SIMD Jaccard computation with fast reciprocal.
248
+ //
249
+ // Haswell port analysis:
250
+ // - `_mm_setr_ps`: p5, 1cy (INSERTPS chain)
251
+ // - `_mm_add_ps`: p01, 3cy latency
252
+ // - `_mm_sub_ps`: p01, 3cy latency
253
+ // - `_mm_rcp_ps`: p0, 5cy latency, 1cy throughput
254
+ // - `_mm_mul_ps`: p01, 5cy latency, 0.5cy throughput
255
+ // - `_mm_blendv_ps`: p015, 2cy latency
256
+
257
+ // Pack intersection counts and convert to float
258
+ nk_f32_t intersection_a_f32 = (nk_f32_t)state_a->intersection_count;
259
+ nk_f32_t intersection_b_f32 = (nk_f32_t)state_b->intersection_count;
260
+ nk_f32_t intersection_c_f32 = (nk_f32_t)state_c->intersection_count;
261
+ nk_f32_t intersection_d_f32 = (nk_f32_t)state_d->intersection_count;
262
+
263
+ __m128 intersection_f32x4 = _mm_setr_ps(intersection_a_f32, intersection_b_f32, intersection_c_f32,
264
+ intersection_d_f32);
265
+ __m128 query_f32x4 = _mm_set1_ps(query_popcount);
266
+ __m128 targets_f32x4 = _mm_setr_ps(target_popcount_a, target_popcount_b, target_popcount_c, target_popcount_d);
267
+ __m128 union_f32x4 = _mm_sub_ps(_mm_add_ps(query_f32x4, targets_f32x4), intersection_f32x4);
268
+
269
+ // Handle zero-union edge case
270
+ __m128 zero_union_mask = _mm_cmpeq_ps(union_f32x4, _mm_setzero_ps());
271
+ __m128 one_f32x4 = _mm_set1_ps(1.0f);
272
+ __m128 two_f32x4 = _mm_set1_ps(2.0f);
273
+ __m128 safe_union_f32x4 = _mm_blendv_ps(union_f32x4, one_f32x4, zero_union_mask);
274
+
275
+ // Fast reciprocal with Newton-Raphson refinement:
276
+ // - `_mm_rcp_ps`: ~12-bit precision, 5cy latency, 1cy throughput
277
+ // Newton-Raphson:
278
+ // rcp' = rcp × (2 - x × rcp), doubles precision to ~22-24 bits
279
+ // Total: ~10cy vs `_mm_div_ps` 13cy latency, but NR has better throughput
280
+ __m128 union_reciprocal_f32x4 = _mm_rcp_ps(safe_union_f32x4);
281
+ __m128 newton_raphson_correction = _mm_sub_ps(two_f32x4, _mm_mul_ps(safe_union_f32x4, union_reciprocal_f32x4));
282
+ union_reciprocal_f32x4 = _mm_mul_ps(union_reciprocal_f32x4, newton_raphson_correction);
283
+
284
+ __m128 ratio_f32x4 = _mm_mul_ps(intersection_f32x4, union_reciprocal_f32x4);
285
+ __m128 jaccard_f32x4 = _mm_sub_ps(one_f32x4, ratio_f32x4);
286
+ result->xmm_ps = _mm_blendv_ps(jaccard_f32x4, _mm_setzero_ps(), zero_union_mask);
287
+ }
288
+
289
+ /** @brief Hamming from_dot: computes pop_a + pop_b - 2*dot for 4 pairs (Haswell). */
290
+ NK_INTERNAL void nk_hamming_u32x4_from_dot_haswell_(nk_b128_vec_t dots, nk_u32_t query_pop, nk_b128_vec_t target_pops,
291
+ nk_b128_vec_t *results) {
292
+ __m128i dots_i32x4 = dots.xmm;
293
+ __m128i query_i32x4 = _mm_set1_epi32((int)query_pop);
294
+ __m128i target_i32x4 = target_pops.xmm;
295
+ results->xmm = _mm_sub_epi32(_mm_add_epi32(query_i32x4, target_i32x4), _mm_slli_epi32(dots_i32x4, 1));
296
+ }
297
+
298
+ /** @brief Jaccard from_dot: computes 1 - dot / (pop_a + pop_b - dot) for 4 pairs (Haswell). */
299
+ NK_INTERNAL void nk_jaccard_f32x4_from_dot_haswell_(nk_b128_vec_t dots, nk_u32_t query_pop, nk_b128_vec_t target_pops,
300
+ nk_b128_vec_t *results) {
301
+ __m128 dot_f32x4 = _mm_cvtepi32_ps(dots.xmm);
302
+ __m128 query_f32x4 = _mm_set1_ps((nk_f32_t)query_pop);
303
+ __m128 target_f32x4 = _mm_cvtepi32_ps(target_pops.xmm);
304
+ __m128 union_f32x4 = _mm_sub_ps(_mm_add_ps(query_f32x4, target_f32x4), dot_f32x4);
305
+
306
+ __m128 zero_union_mask = _mm_cmpeq_ps(union_f32x4, _mm_setzero_ps());
307
+ __m128 one_f32x4 = _mm_set1_ps(1.0f);
308
+ __m128 two_f32x4 = _mm_set1_ps(2.0f);
309
+ __m128 safe_union_f32x4 = _mm_blendv_ps(union_f32x4, one_f32x4, zero_union_mask);
310
+
311
+ __m128 union_reciprocal_f32x4 = _mm_rcp_ps(safe_union_f32x4);
312
+ __m128 nr_correction = _mm_sub_ps(two_f32x4, _mm_mul_ps(safe_union_f32x4, union_reciprocal_f32x4));
313
+ union_reciprocal_f32x4 = _mm_mul_ps(union_reciprocal_f32x4, nr_correction);
314
+
315
+ __m128 ratio_f32x4 = _mm_mul_ps(dot_f32x4, union_reciprocal_f32x4);
316
+ __m128 jaccard_f32x4 = _mm_sub_ps(one_f32x4, ratio_f32x4);
317
+ results->xmm_ps = _mm_blendv_ps(jaccard_f32x4, _mm_setzero_ps(), zero_union_mask);
318
+ }
319
+
320
+ #pragma endregion - Stateful Streaming
321
+
322
+ #if defined(__clang__)
323
+ #pragma clang attribute pop
324
+ #elif defined(__GNUC__)
325
+ #pragma GCC pop_options
326
+ #endif
327
+
328
+ #if defined(__cplusplus)
329
+ } // extern "C"
330
+ #endif
331
+
332
+ #endif // NK_TARGET_HASWELL
333
+ #endif // NK_TARGET_X86_
334
+ #endif // NK_SET_HASWELL_H