numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,485 @@
1
+ /**
2
+ * @brief SIMD-accelerated Set Similarity Measures for Ice Lake.
3
+ * @file include/numkong/set/icelake.h
4
+ * @author Ash Vardanian
5
+ * @date December 27, 2025
6
+ *
7
+ * @sa include/numkong/set.h
8
+ *
9
+ * @section set_icelake_instructions Key AVX-512 Set Instructions
10
+ *
11
+ * Intrinsic Instruction Latency Throughput Ports
12
+ * _mm512_popcnt_epi64 VPOPCNTQ (ZMM, ZMM) 3cy 1/cy p5
13
+ * _mm512_and_si512 VPANDQ (ZMM, ZMM, ZMM) 1cy 0.33/cy p05
14
+ * _mm512_or_si512 VPORQ (ZMM, ZMM, ZMM) 1cy 0.33/cy p05
15
+ * _mm512_xor_si512 VPXORQ (ZMM, ZMM, ZMM) 1cy 0.33/cy p05
16
+ * _mm512_maskz_loadu_epi8 VMOVDQU8 (ZMM, mem, k1) 7cy 0.5/cy p23
17
+ *
18
+ * Ice Lake has a native VPOPCNTQ instruction via the AVX-512 VPOPCNTDQ extension, enabling
19
+ * efficient 64-bit element-wise popcount. We process 512 bits per iteration.
20
+ *
21
+ * @section set_icelake_stateful Stateful Streaming Logic
22
+ *
23
+ * To build memory-optimal tiled algorithms, this file defines:
24
+ *
25
+ * - nk_hamming_u1x512_state_icelake_t for streaming Hamming distance
26
+ * - nk_jaccard_u1x512_state_icelake_t for streaming Jaccard similarity
27
+ *
28
+ * @code{c}
29
+ * nk_jaccard_u1x512_state_icelake_t state_first, state_second, state_third, state_fourth;
30
+ * nk_jaccard_u1x512_init_icelake(&state_first);
31
+ * // ... stream through packed binary vectors ...
32
+ * nk_jaccard_u1x512_finalize_icelake(&state_first, &state_second, &state_third, &state_fourth,
33
+ * query_popcount, target_popcount_a, target_popcount_b, target_popcount_c, target_popcount_d,
34
+ * total_dimensions, &results);
35
+ * @endcode
36
+ */
37
+ #ifndef NK_SET_ICELAKE_H
38
+ #define NK_SET_ICELAKE_H
39
+
40
+ #if NK_TARGET_X86_
41
+ #if NK_TARGET_ICELAKE
42
+
43
+ #include "numkong/types.h"
44
+
45
+ #if defined(__cplusplus)
46
+ extern "C" {
47
+ #endif
48
+
49
+ #if defined(__clang__)
50
+ #pragma clang attribute push( \
51
+ __attribute__((target("avx2,avx512f,avx512vl,avx512bw,avx512vpopcntdq,f16c,fma,bmi,bmi2"))), apply_to = function)
52
+ #elif defined(__GNUC__)
53
+ #pragma GCC push_options
54
+ #pragma GCC target("avx2", "avx512f", "avx512vl", "avx512bw", "avx512vpopcntdq", "f16c", "fma", "bmi", "bmi2")
55
+ #endif
56
+
57
+ #pragma region - Binary Sets
58
+
59
+ NK_PUBLIC void nk_hamming_u1_icelake(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result) {
60
+ nk_size_t n_bytes = nk_size_divide_round_up_(n, NK_BITS_PER_BYTE);
61
+
62
+ nk_u32_t xor_count;
63
+ // It's harder to squeeze out performance from tiny representations, so we unroll the loops for binary metrics.
64
+ if (n_bytes <= 64) { // Up to 512 bits.
65
+ __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_bytes);
66
+ __m512i a_u8x64 = _mm512_maskz_loadu_epi8(mask, a);
67
+ __m512i b_u8x64 = _mm512_maskz_loadu_epi8(mask, b);
68
+ __m512i xor_popcount_u64x8 = _mm512_popcnt_epi64(_mm512_xor_si512(a_u8x64, b_u8x64));
69
+ xor_count = _mm512_reduce_add_epi64(xor_popcount_u64x8);
70
+ }
71
+ else if (n_bytes <= 128) { // Up to 1024 bits.
72
+ __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_bytes - 64);
73
+ __m512i a_one_u8x64 = _mm512_loadu_epi8(a);
74
+ __m512i b_one_u8x64 = _mm512_loadu_epi8(b);
75
+ __m512i a_two_u8x64 = _mm512_maskz_loadu_epi8(mask, a + 64);
76
+ __m512i b_two_u8x64 = _mm512_maskz_loadu_epi8(mask, b + 64);
77
+ __m512i xor_popcount_one_u64x8 = _mm512_popcnt_epi64(_mm512_xor_si512(a_one_u8x64, b_one_u8x64));
78
+ __m512i xor_popcount_two_u64x8 = _mm512_popcnt_epi64(_mm512_xor_si512(a_two_u8x64, b_two_u8x64));
79
+ xor_count = _mm512_reduce_add_epi64(_mm512_add_epi64(xor_popcount_two_u64x8, xor_popcount_one_u64x8));
80
+ }
81
+ else if (n_bytes <= 192) { // Up to 1536 bits.
82
+ __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_bytes - 128);
83
+ __m512i a_one_u8x64 = _mm512_loadu_epi8(a);
84
+ __m512i b_one_u8x64 = _mm512_loadu_epi8(b);
85
+ __m512i a_two_u8x64 = _mm512_loadu_epi8(a + 64);
86
+ __m512i b_two_u8x64 = _mm512_loadu_epi8(b + 64);
87
+ __m512i a_three_u8x64 = _mm512_maskz_loadu_epi8(mask, a + 128);
88
+ __m512i b_three_u8x64 = _mm512_maskz_loadu_epi8(mask, b + 128);
89
+ __m512i xor_popcount_one_u64x8 = _mm512_popcnt_epi64(_mm512_xor_si512(a_one_u8x64, b_one_u8x64));
90
+ __m512i xor_popcount_two_u64x8 = _mm512_popcnt_epi64(_mm512_xor_si512(a_two_u8x64, b_two_u8x64));
91
+ __m512i xor_popcount_three_u64x8 = _mm512_popcnt_epi64(_mm512_xor_si512(a_three_u8x64, b_three_u8x64));
92
+ xor_count = _mm512_reduce_add_epi64(_mm512_add_epi64(
93
+ xor_popcount_three_u64x8, _mm512_add_epi64(xor_popcount_two_u64x8, xor_popcount_one_u64x8)));
94
+ }
95
+ else if (n_bytes <= 256) { // Up to 2048 bits.
96
+ __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_bytes - 192);
97
+ __m512i a_one_u8x64 = _mm512_loadu_epi8(a);
98
+ __m512i b_one_u8x64 = _mm512_loadu_epi8(b);
99
+ __m512i a_two_u8x64 = _mm512_loadu_epi8(a + 64);
100
+ __m512i b_two_u8x64 = _mm512_loadu_epi8(b + 64);
101
+ __m512i a_three_u8x64 = _mm512_loadu_epi8(a + 128);
102
+ __m512i b_three_u8x64 = _mm512_loadu_epi8(b + 128);
103
+ __m512i a_four_u8x64 = _mm512_maskz_loadu_epi8(mask, a + 192);
104
+ __m512i b_four_u8x64 = _mm512_maskz_loadu_epi8(mask, b + 192);
105
+ __m512i xor_popcount_one_u64x8 = _mm512_popcnt_epi64(_mm512_xor_si512(a_one_u8x64, b_one_u8x64));
106
+ __m512i xor_popcount_two_u64x8 = _mm512_popcnt_epi64(_mm512_xor_si512(a_two_u8x64, b_two_u8x64));
107
+ __m512i xor_popcount_three_u64x8 = _mm512_popcnt_epi64(_mm512_xor_si512(a_three_u8x64, b_three_u8x64));
108
+ __m512i xor_popcount_four_u64x8 = _mm512_popcnt_epi64(_mm512_xor_si512(a_four_u8x64, b_four_u8x64));
109
+ xor_count = _mm512_reduce_add_epi64(
110
+ _mm512_add_epi64(_mm512_add_epi64(xor_popcount_four_u64x8, xor_popcount_three_u64x8),
111
+ _mm512_add_epi64(xor_popcount_two_u64x8, xor_popcount_one_u64x8)));
112
+ }
113
+ else {
114
+ __m512i xor_popcount_u64x8 = _mm512_setzero_si512();
115
+ __m512i a_u8x64, b_u8x64;
116
+
117
+ nk_hamming_u1_icelake_cycle:
118
+ if (n_bytes < 64) {
119
+ __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_bytes);
120
+ a_u8x64 = _mm512_maskz_loadu_epi8(mask, a);
121
+ b_u8x64 = _mm512_maskz_loadu_epi8(mask, b);
122
+ n_bytes = 0;
123
+ }
124
+ else {
125
+ a_u8x64 = _mm512_loadu_epi8(a);
126
+ b_u8x64 = _mm512_loadu_epi8(b);
127
+ a += 64, b += 64, n_bytes -= 64;
128
+ }
129
+ __m512i xor_u8x64 = _mm512_xor_si512(a_u8x64, b_u8x64);
130
+ xor_popcount_u64x8 = _mm512_add_epi64(xor_popcount_u64x8, _mm512_popcnt_epi64(xor_u8x64));
131
+ if (n_bytes) goto nk_hamming_u1_icelake_cycle;
132
+
133
+ xor_count = _mm512_reduce_add_epi64(xor_popcount_u64x8);
134
+ }
135
+ *result = xor_count;
136
+ }
137
+
138
+ NK_PUBLIC void nk_jaccard_u1_icelake(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_f32_t *result) {
139
+ nk_size_t n_bytes = nk_size_divide_round_up_(n, NK_BITS_PER_BYTE);
140
+
141
+ nk_u32_t intersection_count = 0, union_count = 0;
142
+ // It's harder to squeeze out performance from tiny representations, so we unroll the loops for binary metrics.
143
+ if (n_bytes <= 64) { // Up to 512 bits.
144
+ __mmask64 load_mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_bytes);
145
+ __m512i a_u8x64 = _mm512_maskz_loadu_epi8(load_mask, a);
146
+ __m512i b_u8x64 = _mm512_maskz_loadu_epi8(load_mask, b);
147
+ __m512i intersection_popcount_u64x8 = _mm512_popcnt_epi64(_mm512_and_si512(a_u8x64, b_u8x64));
148
+ __m512i union_popcount_u64x8 = _mm512_popcnt_epi64(_mm512_or_si512(a_u8x64, b_u8x64));
149
+ intersection_count = _mm512_reduce_add_epi64(intersection_popcount_u64x8);
150
+ union_count = _mm512_reduce_add_epi64(union_popcount_u64x8);
151
+ }
152
+ else if (n_bytes <= 128) { // Up to 1024 bits.
153
+ __mmask64 load_mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_bytes - 64);
154
+ __m512i a_one_u8x64 = _mm512_loadu_epi8(a);
155
+ __m512i b_one_u8x64 = _mm512_loadu_epi8(b);
156
+ __m512i a_two_u8x64 = _mm512_maskz_loadu_epi8(load_mask, a + 64);
157
+ __m512i b_two_u8x64 = _mm512_maskz_loadu_epi8(load_mask, b + 64);
158
+ __m512i intersection_popcount_one_u64x8 = _mm512_popcnt_epi64(_mm512_and_si512(a_one_u8x64, b_one_u8x64));
159
+ __m512i union_popcount_one_u64x8 = _mm512_popcnt_epi64(_mm512_or_si512(a_one_u8x64, b_one_u8x64));
160
+ __m512i intersection_popcount_two_u64x8 = _mm512_popcnt_epi64(_mm512_and_si512(a_two_u8x64, b_two_u8x64));
161
+ __m512i union_popcount_two_u64x8 = _mm512_popcnt_epi64(_mm512_or_si512(a_two_u8x64, b_two_u8x64));
162
+ intersection_count = _mm512_reduce_add_epi64(
163
+ _mm512_add_epi64(intersection_popcount_two_u64x8, intersection_popcount_one_u64x8));
164
+ union_count = _mm512_reduce_add_epi64(_mm512_add_epi64(union_popcount_two_u64x8, union_popcount_one_u64x8));
165
+ }
166
+ else if (n_bytes <= 192) { // Up to 1536 bits.
167
+ __mmask64 load_mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_bytes - 128);
168
+ __m512i a_one_u8x64 = _mm512_loadu_epi8(a);
169
+ __m512i b_one_u8x64 = _mm512_loadu_epi8(b);
170
+ __m512i a_two_u8x64 = _mm512_loadu_epi8(a + 64);
171
+ __m512i b_two_u8x64 = _mm512_loadu_epi8(b + 64);
172
+ __m512i a_three_u8x64 = _mm512_maskz_loadu_epi8(load_mask, a + 128);
173
+ __m512i b_three_u8x64 = _mm512_maskz_loadu_epi8(load_mask, b + 128);
174
+ __m512i intersection_popcount_one_u64x8 = _mm512_popcnt_epi64(_mm512_and_si512(a_one_u8x64, b_one_u8x64));
175
+ __m512i union_popcount_one_u64x8 = _mm512_popcnt_epi64(_mm512_or_si512(a_one_u8x64, b_one_u8x64));
176
+ __m512i intersection_popcount_two_u64x8 = _mm512_popcnt_epi64(_mm512_and_si512(a_two_u8x64, b_two_u8x64));
177
+ __m512i union_popcount_two_u64x8 = _mm512_popcnt_epi64(_mm512_or_si512(a_two_u8x64, b_two_u8x64));
178
+ __m512i intersection_popcount_three_u64x8 = _mm512_popcnt_epi64(_mm512_and_si512(a_three_u8x64, b_three_u8x64));
179
+ __m512i union_popcount_three_u64x8 = _mm512_popcnt_epi64(_mm512_or_si512(a_three_u8x64, b_three_u8x64));
180
+ intersection_count = _mm512_reduce_add_epi64( //
181
+ _mm512_add_epi64(intersection_popcount_three_u64x8,
182
+ _mm512_add_epi64(intersection_popcount_two_u64x8, intersection_popcount_one_u64x8)));
183
+ union_count = _mm512_reduce_add_epi64( //
184
+ _mm512_add_epi64(union_popcount_three_u64x8,
185
+ _mm512_add_epi64(union_popcount_two_u64x8, union_popcount_one_u64x8)));
186
+ }
187
+ else if (n_bytes <= 256) { // Up to 2048 bits.
188
+ __mmask64 load_mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_bytes - 192);
189
+ __m512i a_one_u8x64 = _mm512_loadu_epi8(a);
190
+ __m512i b_one_u8x64 = _mm512_loadu_epi8(b);
191
+ __m512i a_two_u8x64 = _mm512_loadu_epi8(a + 64);
192
+ __m512i b_two_u8x64 = _mm512_loadu_epi8(b + 64);
193
+ __m512i a_three_u8x64 = _mm512_loadu_epi8(a + 128);
194
+ __m512i b_three_u8x64 = _mm512_loadu_epi8(b + 128);
195
+ __m512i a_four_u8x64 = _mm512_maskz_loadu_epi8(load_mask, a + 192);
196
+ __m512i b_four_u8x64 = _mm512_maskz_loadu_epi8(load_mask, b + 192);
197
+ __m512i intersection_popcount_one_u64x8 = _mm512_popcnt_epi64(_mm512_and_si512(a_one_u8x64, b_one_u8x64));
198
+ __m512i union_popcount_one_u64x8 = _mm512_popcnt_epi64(_mm512_or_si512(a_one_u8x64, b_one_u8x64));
199
+ __m512i intersection_popcount_two_u64x8 = _mm512_popcnt_epi64(_mm512_and_si512(a_two_u8x64, b_two_u8x64));
200
+ __m512i union_popcount_two_u64x8 = _mm512_popcnt_epi64(_mm512_or_si512(a_two_u8x64, b_two_u8x64));
201
+ __m512i intersection_popcount_three_u64x8 = _mm512_popcnt_epi64(_mm512_and_si512(a_three_u8x64, b_three_u8x64));
202
+ __m512i union_popcount_three_u64x8 = _mm512_popcnt_epi64(_mm512_or_si512(a_three_u8x64, b_three_u8x64));
203
+ __m512i intersection_popcount_four_u64x8 = _mm512_popcnt_epi64(_mm512_and_si512(a_four_u8x64, b_four_u8x64));
204
+ __m512i union_popcount_four_u64x8 = _mm512_popcnt_epi64(_mm512_or_si512(a_four_u8x64, b_four_u8x64));
205
+ intersection_count = _mm512_reduce_add_epi64(
206
+ _mm512_add_epi64(_mm512_add_epi64(intersection_popcount_four_u64x8, intersection_popcount_three_u64x8),
207
+ _mm512_add_epi64(intersection_popcount_two_u64x8, intersection_popcount_one_u64x8)));
208
+ union_count = _mm512_reduce_add_epi64(
209
+ _mm512_add_epi64(_mm512_add_epi64(union_popcount_four_u64x8, union_popcount_three_u64x8),
210
+ _mm512_add_epi64(union_popcount_two_u64x8, union_popcount_one_u64x8)));
211
+ }
212
+ else {
213
+ __m512i intersection_popcount_u64x8 = _mm512_setzero_si512();
214
+ __m512i union_popcount_u64x8 = _mm512_setzero_si512();
215
+ __m512i a_u8x64, b_u8x64;
216
+
217
+ nk_jaccard_u1_icelake_cycle:
218
+ if (n_bytes < 64) {
219
+ __mmask64 load_mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_bytes);
220
+ a_u8x64 = _mm512_maskz_loadu_epi8(load_mask, a);
221
+ b_u8x64 = _mm512_maskz_loadu_epi8(load_mask, b);
222
+ n_bytes = 0;
223
+ }
224
+ else {
225
+ a_u8x64 = _mm512_loadu_epi8(a);
226
+ b_u8x64 = _mm512_loadu_epi8(b);
227
+ a += 64, b += 64, n_bytes -= 64;
228
+ }
229
+ __m512i intersection_u8x64 = _mm512_and_si512(a_u8x64, b_u8x64);
230
+ __m512i union_u8x64 = _mm512_or_si512(a_u8x64, b_u8x64);
231
+ intersection_popcount_u64x8 = _mm512_add_epi64(intersection_popcount_u64x8,
232
+ _mm512_popcnt_epi64(intersection_u8x64));
233
+ union_popcount_u64x8 = _mm512_add_epi64(union_popcount_u64x8, _mm512_popcnt_epi64(union_u8x64));
234
+ if (n_bytes) goto nk_jaccard_u1_icelake_cycle;
235
+
236
+ intersection_count = _mm512_reduce_add_epi64(intersection_popcount_u64x8);
237
+ union_count = _mm512_reduce_add_epi64(union_popcount_u64x8);
238
+ }
239
+ *result = (union_count != 0) ? 1.0f - (nk_f32_t)intersection_count / (nk_f32_t)union_count : 0.0f;
240
+ }
241
+
242
+ #pragma endregion - Binary Sets
243
+
244
+ #pragma region - Integer Sets
245
+
246
+ NK_PUBLIC void nk_jaccard_u32_icelake(nk_u32_t const *a, nk_u32_t const *b, nk_size_t n, nk_f32_t *result) {
247
+ nk_u32_t intersection_count = 0;
248
+ nk_size_t n_remaining = n;
249
+ for (; n_remaining >= 16; n_remaining -= 16, a += 16, b += 16) {
250
+ __m512i a_u32x16 = _mm512_loadu_epi32(a);
251
+ __m512i b_u32x16 = _mm512_loadu_epi32(b);
252
+ __mmask16 equality_mask = _mm512_cmpeq_epi32_mask(a_u32x16, b_u32x16);
253
+ intersection_count += _mm_popcnt_u32((unsigned int)equality_mask);
254
+ }
255
+ if (n_remaining) {
256
+ __mmask16 load_mask = (__mmask16)_bzhi_u32(0xFFFF, n_remaining);
257
+ __m512i a_u32x16 = _mm512_maskz_loadu_epi32(load_mask, a);
258
+ __m512i b_u32x16 = _mm512_maskz_loadu_epi32(load_mask, b);
259
+ __mmask16 equality_mask = _mm512_mask_cmpeq_epi32_mask(load_mask, a_u32x16, b_u32x16);
260
+ intersection_count += _mm_popcnt_u32((unsigned int)equality_mask);
261
+ }
262
+ *result = (n != 0) ? 1.0f - (nk_f32_t)intersection_count / (nk_f32_t)n : 0.0f;
263
+ }
264
+
265
+ NK_PUBLIC void nk_hamming_u8_icelake(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result) {
266
+ nk_u32_t differences = 0;
267
+ nk_size_t n_remaining = n;
268
+ for (; n_remaining >= 64; n_remaining -= 64, a += 64, b += 64) {
269
+ __m512i a_u8x64 = _mm512_loadu_si512((__m512i const *)a);
270
+ __m512i b_u8x64 = _mm512_loadu_si512((__m512i const *)b);
271
+ __mmask64 neq_mask = _mm512_cmpneq_epi8_mask(a_u8x64, b_u8x64);
272
+ differences += _mm_popcnt_u64(neq_mask);
273
+ }
274
+ if (n_remaining) {
275
+ __mmask64 load_mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_remaining);
276
+ __m512i a_u8x64 = _mm512_maskz_loadu_epi8(load_mask, a);
277
+ __m512i b_u8x64 = _mm512_maskz_loadu_epi8(load_mask, b);
278
+ __mmask64 neq_mask = _mm512_mask_cmpneq_epi8_mask(load_mask, a_u8x64, b_u8x64);
279
+ differences += _mm_popcnt_u64(neq_mask);
280
+ }
281
+ *result = differences;
282
+ }
283
+
284
+ NK_PUBLIC void nk_jaccard_u16_icelake(nk_u16_t const *a, nk_u16_t const *b, nk_size_t n, nk_f32_t *result) {
285
+ nk_u32_t matches = 0;
286
+ nk_size_t n_remaining = n;
287
+ for (; n_remaining >= 32; n_remaining -= 32, a += 32, b += 32) {
288
+ __m512i a_u16x32 = _mm512_loadu_si512((__m512i const *)a);
289
+ __m512i b_u16x32 = _mm512_loadu_si512((__m512i const *)b);
290
+ __mmask32 equality_mask = _mm512_cmpeq_epi16_mask(a_u16x32, b_u16x32);
291
+ matches += _mm_popcnt_u32(equality_mask);
292
+ }
293
+ if (n_remaining) {
294
+ __mmask32 load_mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n_remaining);
295
+ __m512i a_u16x32 = _mm512_maskz_loadu_epi16(load_mask, a);
296
+ __m512i b_u16x32 = _mm512_maskz_loadu_epi16(load_mask, b);
297
+ __mmask32 equality_mask = _mm512_mask_cmpeq_epi16_mask(load_mask, a_u16x32, b_u16x32);
298
+ matches += _mm_popcnt_u32(equality_mask);
299
+ }
300
+ *result = (n != 0) ? 1.0f - (nk_f32_t)matches / (nk_f32_t)n : 0.0f;
301
+ }
302
+
303
+ #pragma endregion - Integer Sets
304
+
305
+ #pragma region - Stateful Streaming
306
+
307
/**
 *  @brief Accumulator for the streaming 512-bit Hamming kernels: eight 64-bit lane counters.
 *  @note Despite its name, `intersection_count_i64x8` is fed with popcounts of `a ^ b` by
 *        `nk_hamming_u1x512_update_icelake`, i.e. it counts differing bits.
 */
struct nk_hamming_u1x512_state_icelake_t {
    __m512i intersection_count_i64x8;
};
typedef struct nk_hamming_u1x512_state_icelake_t nk_hamming_u1x512_state_icelake_t;
310
+
311
+ NK_INTERNAL void nk_hamming_u1x512_init_icelake(nk_hamming_u1x512_state_icelake_t *state) {
312
+ state->intersection_count_i64x8 = _mm512_setzero_si512();
313
+ }
314
+
315
+ NK_INTERNAL void nk_hamming_u1x512_update_icelake(nk_hamming_u1x512_state_icelake_t *state, nk_b512_vec_t a,
316
+ nk_b512_vec_t b, nk_size_t depth_offset,
317
+ nk_size_t active_dimensions) {
318
+ nk_unused_(depth_offset);
319
+ nk_unused_(active_dimensions);
320
+ state->intersection_count_i64x8 = _mm512_add_epi64(state->intersection_count_i64x8,
321
+ _mm512_popcnt_epi64(_mm512_xor_si512(a.zmm, b.zmm)));
322
+ }
323
+
324
+ NK_INTERNAL void nk_hamming_u1x512_finalize_icelake( //
325
+ nk_hamming_u1x512_state_icelake_t const *state_a, nk_hamming_u1x512_state_icelake_t const *state_b,
326
+ nk_hamming_u1x512_state_icelake_t const *state_c, nk_hamming_u1x512_state_icelake_t const *state_d,
327
+ nk_size_t total_dimensions, nk_b128_vec_t *result) {
328
+ nk_unused_(total_dimensions);
329
+
330
+ // Port-optimized 4-way horizontal reduction, matching the Jaccard finalizer pattern.
331
+ // Truncate i64 → i32 early so we can use `VPHADDD` (p01) instead of shuffle-heavy i64 reductions (p5).
332
+
333
+ // Step 1: Truncate 8×i64 → 8×i32 per state via VPMOVQD (p01, 4cy, 0.5/cy)
334
+ __m256i a_i32x8 = _mm512_cvtepi64_epi32(state_a->intersection_count_i64x8);
335
+ __m256i b_i32x8 = _mm512_cvtepi64_epi32(state_b->intersection_count_i64x8);
336
+ __m256i c_i32x8 = _mm512_cvtepi64_epi32(state_c->intersection_count_i64x8);
337
+ __m256i d_i32x8 = _mm512_cvtepi64_epi32(state_d->intersection_count_i64x8);
338
+
339
+ // Step 2: Fold 8×i32 → 4×i32 (add high 128-bit lane to low)
340
+ __m128i a_i32x4 = _mm_add_epi32(_mm256_castsi256_si128(a_i32x8), _mm256_extracti128_si256(a_i32x8, 1));
341
+ __m128i b_i32x4 = _mm_add_epi32(_mm256_castsi256_si128(b_i32x8), _mm256_extracti128_si256(b_i32x8, 1));
342
+ __m128i c_i32x4 = _mm_add_epi32(_mm256_castsi256_si128(c_i32x8), _mm256_extracti128_si256(c_i32x8, 1));
343
+ __m128i d_i32x4 = _mm_add_epi32(_mm256_castsi256_si128(d_i32x8), _mm256_extracti128_si256(d_i32x8, 1));
344
+
345
+ // Step 3: Interleaved horizontal adds — 4×i32 → 2×i32 via VPHADDD (p01, 3cy, 0.5/cy)
346
+ __m128i ab_i32x4 = _mm_hadd_epi32(a_i32x4, b_i32x4); // [a01, a23, b01, b23]
347
+ __m128i cd_i32x4 = _mm_hadd_epi32(c_i32x4, d_i32x4); // [c01, c23, d01, d23]
348
+
349
+ // Step 4: Final horizontal add — 2×i32 → 1×i32 per state
350
+ result->xmm = _mm_hadd_epi32(ab_i32x4, cd_i32x4); // [sum_a, sum_b, sum_c, sum_d]
351
+ }
352
+
353
/**
 *  @brief Accumulator for the streaming 512-bit Jaccard kernels: eight 64-bit lane counters
 *         that `nk_jaccard_u1x512_update_icelake` feeds with popcounts of `a & b`.
 */
struct nk_jaccard_u1x512_state_icelake_t {
    __m512i intersection_count_i64x8;
};
typedef struct nk_jaccard_u1x512_state_icelake_t nk_jaccard_u1x512_state_icelake_t;
356
+
357
+ NK_INTERNAL void nk_jaccard_u1x512_init_icelake(nk_jaccard_u1x512_state_icelake_t *state) {
358
+ state->intersection_count_i64x8 = _mm512_setzero_si512();
359
+ }
360
+
361
+ NK_INTERNAL void nk_jaccard_u1x512_update_icelake(nk_jaccard_u1x512_state_icelake_t *state, nk_b512_vec_t a,
362
+ nk_b512_vec_t b, nk_size_t depth_offset,
363
+ nk_size_t active_dimensions) {
364
+ nk_unused_(depth_offset);
365
+ nk_unused_(active_dimensions);
366
+ state->intersection_count_i64x8 = _mm512_add_epi64(state->intersection_count_i64x8,
367
+ _mm512_popcnt_epi64(_mm512_and_si512(a.zmm, b.zmm)));
368
+ }
369
+
370
+ NK_INTERNAL void nk_jaccard_u1x512_finalize_icelake( //
371
+ nk_jaccard_u1x512_state_icelake_t const *state_a, nk_jaccard_u1x512_state_icelake_t const *state_b,
372
+ nk_jaccard_u1x512_state_icelake_t const *state_c, nk_jaccard_u1x512_state_icelake_t const *state_d,
373
+ nk_f32_t query_popcount, nk_f32_t target_popcount_a, nk_f32_t target_popcount_b, nk_f32_t target_popcount_c,
374
+ nk_f32_t target_popcount_d, nk_size_t total_dimensions, nk_b128_vec_t *result) {
375
+ nk_unused_(total_dimensions);
376
+
377
+ // Port-optimized 4-way horizontal reduction using early i64 → i32 truncation.
378
+ //
379
+ // Key insight: `_mm_hadd_epi32` uses ports p01, not p5, avoiding the shuffle bottleneck.
380
+ // By truncating to i32 early, we can use hadd for reduction instead of expensive shuffles.
381
+ //
382
+ // Ice Lake execution ports:
383
+ // - p0: Division, reciprocal (`VRCP14PS`: 4cy latency, 1/cy throughput)
384
+ // - p01: FP mul/add/fma, hadd (`VMULPS`/`VPHADDD`: 3cy latency, 0.5/cy throughput)
385
+ // - p015: Integer add (`VPADDD`: 1cy latency, 0.33/cy throughput)
386
+ // - p5: Shuffles/extracts (`VEXTRACTI128`: 3cy latency, 1/cy throughput)
387
+
388
+ // Step 1: Truncate 8x i64 → 8x i32 per state (fits in YMM)
389
+ // `VPMOVQD` (ZMM → YMM): 4cy latency, 0.5/cy throughput, port p01
390
+ __m256i a_i32x8 = _mm512_cvtepi64_epi32(state_a->intersection_count_i64x8);
391
+ __m256i b_i32x8 = _mm512_cvtepi64_epi32(state_b->intersection_count_i64x8);
392
+ __m256i c_i32x8 = _mm512_cvtepi64_epi32(state_c->intersection_count_i64x8);
393
+ __m256i d_i32x8 = _mm512_cvtepi64_epi32(state_d->intersection_count_i64x8);
394
+
395
+ // Step 2: Reduce 8x i32 → 4x i32 (add high 128-bit lane to low)
396
+ // - `VEXTRACTI128`: 3cy latency, 1/cy throughput, port p5
397
+ // - `VPADDD` (XMM): 1cy latency, 0.33/cy throughput, ports p015
398
+ __m128i a_i32x4 = _mm_add_epi32(_mm256_castsi256_si128(a_i32x8), _mm256_extracti128_si256(a_i32x8, 1));
399
+ __m128i b_i32x4 = _mm_add_epi32(_mm256_castsi256_si128(b_i32x8), _mm256_extracti128_si256(b_i32x8, 1));
400
+ __m128i c_i32x4 = _mm_add_epi32(_mm256_castsi256_si128(c_i32x8), _mm256_extracti128_si256(c_i32x8, 1));
401
+ __m128i d_i32x4 = _mm_add_epi32(_mm256_castsi256_si128(d_i32x8), _mm256_extracti128_si256(d_i32x8, 1));
402
+
403
+ // Step 3: Reduce 4x i32 → 2x i32 using horizontal add (uses p01, not p5!)
404
+ // - `VPHADDD` (XMM): 3cy latency, 0.5/cy throughput, ports p01
405
+ __m128i ab_i32x4 = _mm_hadd_epi32(a_i32x4, b_i32x4); // [a01, a23, b01, b23]
406
+ __m128i cd_i32x4 = _mm_hadd_epi32(c_i32x4, d_i32x4); // [c01, c23, d01, d23]
407
+
408
+ // Step 4: Reduce 2x i32 → 1x i32 per state (final horizontal add)
409
+ __m128i intersection_i32x4 = _mm_hadd_epi32(ab_i32x4, cd_i32x4); // [a, b, c, d]
410
+
411
+ // Step 5: Direct i32 → f32 conversion (simpler than i64 → f64 → f32 path)
412
+ // - `VCVTDQ2PS` (XMM): 4cy latency, 0.5/cy throughput, port p01
413
+ __m128 intersection_f32x4 = _mm_cvtepi32_ps(intersection_i32x4);
414
+
415
+ // Compute Jaccard distance: 1 - intersection ÷ union
416
+ // where union = query_popcount + target_popcount - intersection
417
+ __m128 query_f32x4 = _mm_set1_ps(query_popcount);
418
+ __m128 targets_f32x4 = _mm_setr_ps(target_popcount_a, target_popcount_b, target_popcount_c, target_popcount_d);
419
+ __m128 union_f32x4 = _mm_sub_ps(_mm_add_ps(query_f32x4, targets_f32x4), intersection_f32x4);
420
+
421
+ // Handle zero-union edge case: if union == 0, result = 0.0
422
+ __m128 zero_union_mask = _mm_cmpeq_ps(union_f32x4, _mm_setzero_ps());
423
+ __m128 one_f32x4 = _mm_set1_ps(1.0f);
424
+ __m128 safe_union_f32x4 = _mm_blendv_ps(union_f32x4, one_f32x4, zero_union_mask);
425
+
426
+ // Fast reciprocal with Newton-Raphson refinement:
427
+ // - `VRCP14PS`: 4cy latency, 1/cy throughput, port p0 (~14-bit precision)
428
+ // Newton-Raphson: rcp' = rcp × (2 - x × rcp) doubles precision to ~28 bits
429
+ // - `VFNMADD`: 4cy latency, 0.5/cy throughput, ports p01
430
+ // - `VMULPS`: 4cy latency, 0.5/cy throughput, ports p01
431
+ // Total: ~12cy vs `VDIVPS` 11cy latency but 3cy throughput - NR wins on throughput
432
+ __m128 union_reciprocal_f32x4 = _mm_rcp14_ps(safe_union_f32x4);
433
+ union_reciprocal_f32x4 = _mm_mul_ps(union_reciprocal_f32x4,
434
+ _mm_fnmadd_ps(safe_union_f32x4, union_reciprocal_f32x4, _mm_set1_ps(2.0f)));
435
+
436
+ __m128 ratio_f32x4 = _mm_mul_ps(intersection_f32x4, union_reciprocal_f32x4);
437
+ __m128 jaccard_f32x4 = _mm_sub_ps(one_f32x4, ratio_f32x4);
438
+ result->xmm_ps = _mm_blendv_ps(jaccard_f32x4, _mm_setzero_ps(), zero_union_mask);
439
+ }
440
+
441
+ /** @brief Hamming from_dot: computes pop_a + pop_b - 2*dot for 4 pairs (IceLake). */
442
+ NK_INTERNAL void nk_hamming_u32x4_from_dot_icelake_(nk_b128_vec_t dots, nk_u32_t query_pop, nk_b128_vec_t target_pops,
443
+ nk_b128_vec_t *results) {
444
+ __m128i dots_i32x4 = dots.xmm;
445
+ __m128i query_i32x4 = _mm_set1_epi32((int)query_pop);
446
+ __m128i target_i32x4 = target_pops.xmm;
447
+ results->xmm = _mm_sub_epi32(_mm_add_epi32(query_i32x4, target_i32x4), _mm_slli_epi32(dots_i32x4, 1));
448
+ }
449
+
450
+ /** @brief Jaccard from_dot: computes 1 - dot / (pop_a + pop_b - dot) for 4 pairs (IceLake). */
451
+ NK_INTERNAL void nk_jaccard_f32x4_from_dot_icelake_(nk_b128_vec_t dots, nk_u32_t query_pop, nk_b128_vec_t target_pops,
452
+ nk_b128_vec_t *results) {
453
+ __m128 dot_f32x4 = _mm_cvtepi32_ps(dots.xmm);
454
+ __m128 query_f32x4 = _mm_set1_ps((nk_f32_t)query_pop);
455
+ __m128 target_f32x4 = _mm_cvtepi32_ps(target_pops.xmm);
456
+ __m128 union_f32x4 = _mm_sub_ps(_mm_add_ps(query_f32x4, target_f32x4), dot_f32x4);
457
+
458
+ __m128 zero_union_mask = _mm_cmpeq_ps(union_f32x4, _mm_setzero_ps());
459
+ __m128 one_f32x4 = _mm_set1_ps(1.0f);
460
+ __m128 safe_union_f32x4 = _mm_blendv_ps(union_f32x4, one_f32x4, zero_union_mask);
461
+
462
+ __m128 union_reciprocal_f32x4 = _mm_rcp14_ps(safe_union_f32x4);
463
+ union_reciprocal_f32x4 = _mm_mul_ps(union_reciprocal_f32x4,
464
+ _mm_fnmadd_ps(safe_union_f32x4, union_reciprocal_f32x4, _mm_set1_ps(2.0f)));
465
+
466
+ __m128 ratio_f32x4 = _mm_mul_ps(dot_f32x4, union_reciprocal_f32x4);
467
+ __m128 jaccard_f32x4 = _mm_sub_ps(one_f32x4, ratio_f32x4);
468
+ results->xmm_ps = _mm_blendv_ps(jaccard_f32x4, _mm_setzero_ps(), zero_union_mask);
469
+ }
470
+
471
+ #pragma endregion - Stateful Streaming
472
+
473
+ #if defined(__clang__)
474
+ #pragma clang attribute pop
475
+ #elif defined(__GNUC__)
476
+ #pragma GCC pop_options
477
+ #endif
478
+
479
+ #if defined(__cplusplus)
480
+ } // extern "C"
481
+ #endif
482
+
483
+ #endif // NK_TARGET_ICELAKE
484
+ #endif // NK_TARGET_X86_
485
+ #endif // NK_SET_ICELAKE_H