numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315) hide show
  1. package/README.md +197 -124
  2. package/binding.gyp +34 -484
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -0,0 +1,181 @@
1
+ /**
2
+ * @brief SIMD-accelerated Set Similarity Measures for LoongArch LASX (256-bit).
3
+ * @file include/numkong/set/loongsonasx.h
4
+ * @author Ash Vardanian
5
+ * @date March 23, 2026
6
+ *
7
+ * @sa include/numkong/set.h
8
+ *
9
+ * @section set_loongsonasx_instructions Key LASX Set Instructions
10
+ *
11
+ * Intrinsic Instruction
12
+ * __lasx_xvld XVLD (256-bit unaligned load)
13
+ * __lasx_xvxor_v XVXOR.V (bitwise XOR)
14
+ * __lasx_xvor_v XVOR.V (bitwise OR)
15
+ * __lasx_xvand_v XVAND.V (bitwise AND)
16
+ * __lasx_xvpcnt_d XVPCNT.D (popcount per u64 element)
17
+ * __lasx_xvseq_b XVSEQ.B (byte-wise equality, 0xFF/0x00)
18
+ * __lasx_xvmin_bu XVMIN.BU (unsigned byte minimum)
19
+ * __lasx_xvhaddw_hu_bu XVHADDW.HU.BU (horizontal pairwise add u8->u16)
20
+ * __lasx_xvhaddw_wu_hu XVHADDW.WU.HU (horizontal pairwise add u16->u32)
21
+ * __lasx_xvhaddw_du_wu XVHADDW.DU.WU (horizontal pairwise add u32->u64)
22
+ * __lasx_xvadd_d XVADD.D (i64 addition)
23
+ * __lasx_xvpermi_q XVPERMI.Q (extract/permute 128-bit lanes)
24
+ *
25
+ * LASX provides per-element popcount at multiple widths (`xvpcnt_b/h/w/d`).
26
+ * For binary set operations we use `xvpcnt_d` which gives 4 x u64 popcount values
27
+ * directly, eliminating the need for horizontal byte-sum reduction chains.
28
+ *
29
+ * For sorted integer set operations (jaccard_u16, jaccard_u32), SIMD provides limited
30
+ * benefit due to the inherently serial merge-based algorithm, so we delegate to the
31
+ * serial implementations.
32
+ */
33
+ #ifndef NK_SET_LOONGSONASX_H
34
+ #define NK_SET_LOONGSONASX_H
35
+
36
+ #if NK_TARGET_LOONGARCH_
37
+ #if NK_TARGET_LOONGSONASX
38
+
39
+ #include "numkong/types.h"
40
+ #include "numkong/set/serial.h" // `nk_u1x8_popcount_`, serial fallbacks
41
+ #include "numkong/dot/loongsonasx.h" // `nk_reduce_add_i32x8_loongsonasx_`
42
+
43
+ #if defined(__cplusplus)
44
+ extern "C" {
45
+ #endif
46
+
47
+ #pragma region Reduction Helpers
48
+
49
+ /** @brief Horizontal sum of 4 u64 lanes in a 256-bit LASX register, returned as a scalar `nk_u64_t`. */
50
+ NK_INTERNAL nk_u64_t nk_reduce_add_u64x4_loongsonasx_(__m256i sum_u64x4) {
51
+ __m256i high_u64x4 = __lasx_xvpermi_q(sum_u64x4, sum_u64x4, 0x11); // presumably 0x11 places the upper 128-bit half in both halves - confirm against the XVPERMI.Q encoding
52
+ __m256i sum_u64x2 = __lasx_xvadd_d(sum_u64x4, high_u64x4); // fold the upper half onto the lower half, leaving 2 meaningful u64 partial sums
53
+ __m256i swapped_u64x2 = __lasx_xvshuf4i_d(sum_u64x2, sum_u64x2, 0b0001); // presumably swaps the two u64 elements within each 128-bit half
54
+ __m256i reduced_u64x2 = __lasx_xvadd_d(sum_u64x2, swapped_u64x2); // add the swapped copy so element 0 carries the total of all 4 lanes
55
+ return (nk_u64_t)__lasx_xvpickve2gr_du(reduced_u64x2, 0); // read element 0 out of the vector as an unsigned 64-bit scalar
56
+ }
57
+
58
+ /** @brief Horizontally sum all 32 bytes in a 256-bit register as unsigned values, returned as `nk_u64_t`.
59
+ *
60
+ * Chains pairwise widening additions: u8→u16→u32→u64, then reduces 4 u64 lanes via `nk_reduce_add_u64x4_loongsonasx_`.
61
+ */
62
+ NK_INTERNAL nk_u64_t nk_reduce_add_u8x32_loongsonasx_(__m256i v_u8x32) {
63
+ __m256i sum_u16x16 = __lasx_xvhaddw_hu_bu(v_u8x32, v_u8x32); // same vector passed twice, so adjacent byte pairs are summed into 16 u16 values
64
+ __m256i sum_u32x8 = __lasx_xvhaddw_wu_hu(sum_u16x16, sum_u16x16); // adjacent u16 pairs -> 8 u32 partial sums
65
+ __m256i sum_u64x4 = __lasx_xvhaddw_du_wu(sum_u32x8, sum_u32x8); // adjacent u32 pairs -> 4 u64 partial sums
66
+ return nk_reduce_add_u64x4_loongsonasx_(sum_u64x4); // final cross-lane reduction to one scalar
67
+ }
68
+
69
+ #pragma endregion Reduction Helpers
70
+
71
+ #pragma region Binary Sets
72
+
73
+ NK_PUBLIC void nk_hamming_u1_loongsonasx(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result) { // Hamming distance of two bit-packed vectors: popcount(a XOR b), written to *result
74
+ nk_size_t n_bytes = nk_size_divide_round_up_(n, NK_BITS_PER_BYTE); // presumably n counts bits; rounded up to whole bytes, so the entire final byte is counted
75
+ __m256i count_u64x4 = __lasx_xvreplgr2vr_d(0); // four u64 running popcount accumulators, zero-initialized
76
+ nk_size_t i = 0;
77
+
78
+ for (; i + 32 <= n_bytes; i += 32) { // vector body: consumes 32 bytes of each input per iteration
79
+ __m256i a_u8x32 = __lasx_xvld(a + i, 0); // 256-bit load; the trailing 0 is the immediate offset
80
+ __m256i b_u8x32 = __lasx_xvld(b + i, 0);
81
+ __m256i xor_u8x32 = __lasx_xvxor_v(a_u8x32, b_u8x32); // set bits mark positions where a and b differ
82
+ count_u64x4 = __lasx_xvadd_d(count_u64x4, __lasx_xvpcnt_d(xor_u8x32)); // popcount per u64 element, accumulated per lane
83
+ }
84
+
85
+ nk_u64_t count = nk_reduce_add_u64x4_loongsonasx_(count_u64x4); // collapse the 4 lane totals into one scalar
86
+
87
+ for (; i < n_bytes; ++i) count += nk_u1x8_popcount_(a[i] ^ b[i]); // scalar tail for the remaining (< 32) bytes
88
+ *result = (nk_u32_t)count; // NOTE(review): the u64 total is narrowed to u32 here
89
+ }
90
+
91
+ NK_PUBLIC void nk_jaccard_u1_loongsonasx(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_f32_t *result) { // Jaccard distance of two bit-packed vectors: popcount(a XOR b) / popcount(a OR b), written to *result
92
+ nk_size_t n_bytes = nk_size_divide_round_up_(n, NK_BITS_PER_BYTE); // presumably n counts bits; rounded up to whole bytes
93
+ __m256i xor_count_u64x4 = __lasx_xvreplgr2vr_d(0); // per-lane accumulator for differing bits
94
+ __m256i or_count_u64x4 = __lasx_xvreplgr2vr_d(0); // per-lane accumulator for union bits
95
+ nk_size_t i = 0;
96
+
97
+ for (; i + 32 <= n_bytes; i += 32) { // vector body: consumes 32 bytes of each input per iteration
98
+ __m256i a_u8x32 = __lasx_xvld(a + i, 0);
99
+ __m256i b_u8x32 = __lasx_xvld(b + i, 0);
100
+ __m256i xor_u8x32 = __lasx_xvxor_v(a_u8x32, b_u8x32); // bits set in exactly one input
101
+ __m256i or_u8x32 = __lasx_xvor_v(a_u8x32, b_u8x32); // bits set in either input
102
+ xor_count_u64x4 = __lasx_xvadd_d(xor_count_u64x4, __lasx_xvpcnt_d(xor_u8x32));
103
+ or_count_u64x4 = __lasx_xvadd_d(or_count_u64x4, __lasx_xvpcnt_d(or_u8x32));
104
+ }
105
+
106
+ nk_u64_t xor_count = nk_reduce_add_u64x4_loongsonasx_(xor_count_u64x4); // collapse lane totals to scalars
107
+ nk_u64_t or_count = nk_reduce_add_u64x4_loongsonasx_(or_count_u64x4);
108
+
109
+ for (; i < n_bytes; ++i) { // scalar tail for the remaining (< 32) bytes
110
+ xor_count += nk_u1x8_popcount_(a[i] ^ b[i]);
111
+ or_count += nk_u1x8_popcount_(a[i] | b[i]);
112
+ }
113
+ *result = (or_count != 0) ? (nk_f32_t)xor_count / (nk_f32_t)or_count : 0.0f; // xor/or == 1 - |A∩B|/|A∪B|; an empty union yields 0.0 instead of dividing by zero
114
+ }
115
+
116
+ #pragma endregion Binary Sets
117
+
118
+ #pragma region Integer Sets
119
+
120
+ NK_PUBLIC void nk_hamming_u8_loongsonasx(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result) { // Byte-level Hamming distance: number of indices i < n where a[i] != b[i], written to *result
121
+ __m256i count_u64x4 = __lasx_xvreplgr2vr_d(0); // four u64 mismatch accumulators, zero-initialized
122
+ __m256i ones_u8x32 = __lasx_xvreplgr2vr_b(1); // every byte set to 1; used to clamp mismatches to a 0/1 flag
123
+ nk_size_t i = 0;
124
+
125
+ for (; i + 32 <= n; i += 32) { // vector body: compares 32 byte pairs per iteration
126
+ __m256i a_u8x32 = __lasx_xvld(a + i, 0);
127
+ __m256i b_u8x32 = __lasx_xvld(b + i, 0);
128
+ __m256i xor_u8x32 = __lasx_xvxor_v(a_u8x32, b_u8x32); // non-zero byte wherever the inputs differ
129
+ __m256i min_u8x32 = __lasx_xvmin_bu(xor_u8x32, ones_u8x32); // unsigned min with 1: differing bytes become 1, equal bytes stay 0
130
+ __m256i sum_u16x16 = __lasx_xvhaddw_hu_bu(min_u8x32, min_u8x32); // widening pairwise adds u8->u16->u32->u64, same chain as nk_reduce_add_u8x32_loongsonasx_
131
+ __m256i sum_u32x8 = __lasx_xvhaddw_wu_hu(sum_u16x16, sum_u16x16);
132
+ __m256i sum_u64x4 = __lasx_xvhaddw_du_wu(sum_u32x8, sum_u32x8);
133
+ count_u64x4 = __lasx_xvadd_d(count_u64x4, sum_u64x4); // accumulate this iteration's mismatch counts per lane
134
+ }
135
+
136
+ nk_u64_t count = nk_reduce_add_u64x4_loongsonasx_(count_u64x4); // collapse the 4 lane totals into one scalar
137
+
138
+ for (; i < n; ++i) count += (a[i] != b[i]); // scalar tail for the remaining (< 32) elements
139
+ *result = (nk_u32_t)count; // NOTE(review): the u64 total is narrowed to u32 here
140
+ }
141
+
142
+ #pragma endregion Integer Sets
143
+
144
+ #pragma region Batched Finalizers
145
+
146
+ /** @brief Hamming from_dot: computes pop_a + pop_b − 2 × dot for 4 pairs (LSX), pairing one query popcount with four target popcounts. */
147
+ NK_INTERNAL void nk_hamming_u32x4_from_dot_loongsonasx_(nk_b128_vec_t dots, nk_u32_t query_pop,
148
+ nk_b128_vec_t target_pops, nk_b128_vec_t *results) {
149
+ __m128i dots_u32x4 = dots.xmm; // four dot products, one per 32-bit element
150
+ __m128i query_u32x4 = __lsx_vreplgr2vr_w((int)query_pop); // broadcast the single query popcount to all 4 elements; the cast is presumably only to match the intrinsic's parameter type
151
+ __m128i target_u32x4 = target_pops.xmm; // four target popcounts
152
+ results->xmm = __lsx_vsub_w(__lsx_vadd_w(query_u32x4, target_u32x4), __lsx_vslli_w(dots_u32x4, 1)); // (query + target) - (dot << 1), i.e. the doubling is done with a left shift by one
153
+ }
154
+
155
+ /** @brief Jaccard from_dot: computes 1 − dot / (pop_a + pop_b − dot) for 4 pairs (LSX); elements whose union is zero produce 0. */
156
+ NK_INTERNAL void nk_jaccard_f32x4_from_dot_loongsonasx_(nk_b128_vec_t dots, nk_u32_t query_pop,
157
+ nk_b128_vec_t target_pops, nk_b128_vec_t *results) {
158
+ __m128 dot_f32x4 = __lsx_vffint_s_wu(dots.xmm); // convert the four unsigned 32-bit dot products to f32
159
+ __m128 query_f32x4 = nk_xvreplgr2vr_s_128_((nk_f32_t)query_pop); // presumably broadcasts the query popcount as f32 to all 4 elements - helper is defined outside this file
160
+ __m128 target_f32x4 = __lsx_vffint_s_wu(target_pops.xmm); // convert the four target popcounts to f32
161
+ __m128 union_f32x4 = __lsx_vfsub_s(__lsx_vfadd_s(query_f32x4, target_f32x4), dot_f32x4); // union = query + target - dot
162
+
163
+ __m128 zero_f32x4 = (__m128)__lsx_vreplgr2vr_w(0); // all-zero bits reinterpreted as f32
164
+ __m128 one_f32x4 = nk_xvreplgr2vr_s_128_(1.0f);
165
+ __m128i zero_union_mask_u32x4 = __lsx_vfcmp_ceq_s(union_f32x4, zero_f32x4); // per-element mask marking elements whose union equals 0
166
+ __m128 safe_union_f32x4 = (__m128)__lsx_vbitsel_v((__m128i)union_f32x4, (__m128i)one_f32x4, zero_union_mask_u32x4); // presumably takes 1.0 where the mask is set so the divide below never sees a zero denominator
167
+
168
+ __m128 ratio_f32x4 = __lsx_vfdiv_s(dot_f32x4, safe_union_f32x4); // dot / union
169
+ __m128 jaccard_f32x4 = __lsx_vfsub_s(one_f32x4, ratio_f32x4); // 1 - dot / union
170
+ results->xmm_ps = (__m128)__lsx_vbitsel_v((__m128i)jaccard_f32x4, (__m128i)zero_f32x4, zero_union_mask_u32x4); // presumably overwrites the zero-union elements with 0.0 and keeps the computed value elsewhere
171
+ }
172
+
173
+ #pragma endregion Batched Finalizers
174
+
175
+ #if defined(__cplusplus)
176
+ } // extern "C"
177
+ #endif
178
+
179
+ #endif // NK_TARGET_LOONGSONASX
180
+ #endif // NK_TARGET_LOONGARCH_
181
+ #endif // NK_SET_LOONGSONASX_H
@@ -10,13 +10,13 @@
10
10
  *
11
11
  * Key NEON instructions for binary/bitwise operations (Cortex-A76 class):
12
12
  *
13
- * Intrinsic Instruction Latency Throughput
14
- * vcntq_u8 CNT (V.16B, V.16B) 2cy 2/cy
15
- * veorq_u8 EOR (V.16B, V.16B, V.16B) 1cy 4/cy
16
- * vandq_u8 AND (V.16B, V.16B, V.16B) 1cy 4/cy
17
- * vorrq_u8 ORR (V.16B, V.16B, V.16B) 1cy 4/cy
18
- * vpaddlq_u8 UADDLP (V.8H, V.16B) 2cy 2/cy
19
- * vaddvq_u32 ADDV (S, V.4S) 3cy 1/cy
13
+ * Intrinsic Instruction A76 M5
14
+ * vcntq_u8 CNT (V.16B, V.16B) 2cy @ 2p 2cy @ 4p
15
+ * veorq_u8 EOR (V.16B, V.16B, V.16B) 1cy @ 2p 2cy @ 4p
16
+ * vandq_u8 AND (V.16B, V.16B, V.16B) 1cy @ 2p 2cy @ 4p
17
+ * vorrq_u8 ORR (V.16B, V.16B, V.16B) 1cy @ 2p 2cy @ 4p
18
+ * vpaddlq_u8 UADDLP (V.8H, V.16B) 2cy @ 2p 2cy @ 4p
19
+ * vaddvq_u32 ADDV (S, V.4S) 4cy @ 1p 5cy @ 1p
20
20
  *
21
21
  * According to the available literature, the throughput for those basic integer ops is
22
22
  * identical across most Apple, Qualcomm, and AWS Graviton chips. As long as we avoid widening
@@ -58,7 +58,7 @@ extern "C" {
58
58
  #pragma GCC target("arch=armv8-a+simd")
59
59
  #endif
60
60
 
61
- #pragma region - Binary Sets
61
+ #pragma region Binary Sets
62
62
 
63
63
  NK_PUBLIC void nk_hamming_u1_neon(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result) {
64
64
  nk_size_t n_bytes = nk_size_divide_round_up_(n, NK_BITS_PER_BYTE);
@@ -109,9 +109,9 @@ NK_PUBLIC void nk_jaccard_u1_neon(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_siz
109
109
  *result = (union_count != 0) ? 1.0f - (nk_f32_t)intersection_count / (nk_f32_t)union_count : 0.0f;
110
110
  }
111
111
 
112
- #pragma endregion - Binary Sets
112
+ #pragma endregion Binary Sets
113
113
 
114
- #pragma region - Integer Sets
114
+ #pragma region Integer Sets
115
115
 
116
116
  NK_PUBLIC void nk_jaccard_u32_neon(nk_u32_t const *a, nk_u32_t const *b, nk_size_t n, nk_f32_t *result) {
117
117
  nk_u32_t intersection_count = 0;
@@ -174,9 +174,9 @@ NK_PUBLIC void nk_jaccard_u16_neon(nk_u16_t const *a, nk_u16_t const *b, nk_size
174
174
  *result = (n != 0) ? 1.0f - (nk_f32_t)matches / (nk_f32_t)n : 0.0f;
175
175
  }
176
176
 
177
- #pragma endregion - Integer Sets
177
+ #pragma endregion Integer Sets
178
178
 
179
- #pragma region - Stateful Streaming
179
+ #pragma region Stateful Streaming
180
180
 
181
181
  typedef struct nk_hamming_u1x128_state_neon_t {
182
182
  uint32x4_t intersection_count_u32x4;
@@ -290,12 +290,11 @@ NK_INTERNAL void nk_jaccard_u1x128_finalize_neon( //
290
290
  float32x4_t intersection_f32x4 = vcvtq_f32_u32(intersection_u32x4);
291
291
 
292
292
  // Compute union using |A ∪ B| = |A| + |B| - |A ∩ B|
293
- // Build target popcounts vector using lane insertion (avoids union store/load round-trip).
293
+ // Build target popcounts vector from two independent halves (avoids serial lane insertion chain).
294
294
  float32x4_t query_f32x4 = vdupq_n_f32(query_popcount);
295
- float32x4_t targets_f32x4 = vdupq_n_f32(target_popcount_a);
296
- targets_f32x4 = vsetq_lane_f32(target_popcount_b, targets_f32x4, 1);
297
- targets_f32x4 = vsetq_lane_f32(target_popcount_c, targets_f32x4, 2);
298
- targets_f32x4 = vsetq_lane_f32(target_popcount_d, targets_f32x4, 3);
295
+ float32x2_t targets_ab_f32x2 = vset_lane_f32(target_popcount_b, vdup_n_f32(target_popcount_a), 1);
296
+ float32x2_t targets_cd_f32x2 = vset_lane_f32(target_popcount_d, vdup_n_f32(target_popcount_c), 1);
297
+ float32x4_t targets_f32x4 = vcombine_f32(targets_ab_f32x2, targets_cd_f32x2);
299
298
  float32x4_t union_f32x4 = vsubq_f32(vaddq_f32(query_f32x4, targets_f32x4), intersection_f32x4);
300
299
 
301
300
  // Handle zero-union edge case (empty vectors → distance = 0.0, matching scipy convention)
@@ -347,7 +346,7 @@ NK_INTERNAL void nk_jaccard_f32x4_from_dot_neon_(nk_b128_vec_t dots, nk_u32_t qu
347
346
  results->f32x4 = vbslq_f32(zero_union_mask, vdupq_n_f32(0.0f), jaccard_f32x4);
348
347
  }
349
348
 
350
- #pragma endregion - Stateful Streaming
349
+ #pragma endregion Stateful Streaming
351
350
 
352
351
  #if defined(__clang__)
353
352
  #pragma clang attribute pop
@@ -0,0 +1,326 @@
1
+ /**
2
+ * @brief SIMD-accelerated Set Similarity Measures for Power ISA VSX.
3
+ * @file include/numkong/set/powervsx.h
4
+ * @author Ash Vardanian
5
+ * @date March 23, 2026
6
+ *
7
+ * @sa include/numkong/set.h
8
+ *
9
+ * @section set_powervsx_instructions Power9 VSX Set Instructions
10
+ *
11
+ * Key Power9 VSX instructions for binary/bitwise operations:
12
+ *
13
+ * Intrinsic Instruction P9
14
+ * vec_popcnt vpopcntb/h/w/d 2cy @ 2p element-wise popcount
15
+ * vec_xor xxlxor 1cy @ 4p
16
+ * vec_and xxland 1cy @ 4p
17
+ * vec_or xxlor 1cy @ 4p
18
+ * vec_cmpne vcmpneb/h/w 2cy @ 2p byte/half/word not-equal
19
+ * vec_xl_len lxvll 6cy @ 1p partial vector load
20
+ *
21
+ * Power9 has native doubleword `vpopcntd` instruction, providing efficient SIMD popcount
22
+ * with minimal data flow complexity. `vec_xl_len` enables branchless tail handling.
23
+ *
24
+ * @section set_powervsx_stateful Stateful Streaming Logic
25
+ *
26
+ * To build memory-optimal tiled algorithms, this file defines:
27
+ *
28
+ * - nk_hamming_u1x128_state_powervsx_t for streaming Hamming distance
29
+ * - nk_jaccard_u1x128_state_powervsx_t for streaming Jaccard distance
30
+ *
31
+ * @code{c}
32
+ * nk_jaccard_u1x128_state_powervsx_t state_first, state_second, state_third, state_fourth;
33
+ * nk_jaccard_u1x128_init_powervsx(&state_first);
34
+ * // ... stream through packed binary vectors ...
35
+ * nk_jaccard_u1x128_finalize_powervsx(&state_first, &state_second, &state_third, &state_fourth,
36
+ * query_popcount, target_popcount_a, target_popcount_b, target_popcount_c, target_popcount_d,
37
+ * total_dimensions, &results);
38
+ * @endcode
39
+ */
40
+ #ifndef NK_SET_POWERVSX_H
41
+ #define NK_SET_POWERVSX_H
42
+
43
+ #if NK_TARGET_POWER_
44
+ #if NK_TARGET_POWERVSX
45
+
46
+ #include "numkong/types.h"
47
+ #include "numkong/set/serial.h" // `nk_u1x8_popcount_`
48
+ #include "numkong/dot/powervsx.h" // `nk_hsum_u32x4_powervsx_`, `nk_hsum_u64x2_powervsx_`
49
+
50
+ #if defined(__cplusplus)
51
+ extern "C" {
52
+ #endif
53
+
54
+ #if defined(__clang__)
55
+ #pragma clang attribute push(__attribute__((target("power9-vector"))), apply_to = function)
56
+ #elif defined(__GNUC__)
57
+ #pragma GCC push_options
58
+ #pragma GCC target("power9-vector")
59
+ #endif
60
+
61
+ NK_PUBLIC void nk_hamming_u1_powervsx(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result) { // Hamming distance over `n` packed bits; the differing-bit count is written to `*result`
62
+ nk_size_t n_bytes = nk_size_divide_round_up_(n, NK_BITS_PER_BYTE); // whole bytes are compared: unused bits of the last byte are not masked out
63
+ nk_vu64x2_t differences_u64x2 = vec_splats((nk_u64_t)0);
64
+ nk_size_t i = 0;
65
+ // Main loop: XOR 16 bytes at a time and count the set bits per doubleword (vpopcntd)
66
+ for (; i + 16 <= n_bytes; i += 16) {
67
+ nk_vu8x16_t a_u8x16 = vec_xl(0, (nk_u8_t const *)(a + i));
68
+ nk_vu8x16_t b_u8x16 = vec_xl(0, (nk_u8_t const *)(b + i));
69
+ nk_vu8x16_t xor_u8x16 = vec_xor(a_u8x16, b_u8x16);
70
+ nk_vu64x2_t popcnt_u64x2 = vec_popcnt((nk_vu64x2_t)xor_u8x16);
71
+ differences_u64x2 = vec_add(differences_u64x2, popcnt_u64x2);
72
+ }
73
+ // Branchless tail: vec_xl_len zero-fills beyond remaining_bytes, so the padding XORs to zero and adds nothing
74
+ nk_size_t remaining_bytes = n_bytes - i;
75
+ nk_vu8x16_t a_u8x16 = vec_xl_len((nk_u8_t *)(a + i), remaining_bytes);
76
+ nk_vu8x16_t b_u8x16 = vec_xl_len((nk_u8_t *)(b + i), remaining_bytes);
77
+ nk_vu8x16_t xor_u8x16 = vec_xor(a_u8x16, b_u8x16);
78
+ nk_vu64x2_t popcnt_u64x2 = vec_popcnt((nk_vu64x2_t)xor_u8x16);
79
+ differences_u64x2 = vec_add(differences_u64x2, popcnt_u64x2);
80
+ *result = (nk_u32_t)nk_hsum_u64x2_powervsx_(differences_u64x2); // narrowing cast: presumably the helper returns a 64-bit sum (TODO confirm)
81
+ }
82
+
83
+ NK_PUBLIC void nk_jaccard_u1_powervsx(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_f32_t *result) { // Jaccard distance (1 - |A AND B| / |A OR B|) over `n` packed bits, written to `*result`
84
+ nk_size_t n_bytes = nk_size_divide_round_up_(n, NK_BITS_PER_BYTE); // whole bytes are processed: unused bits of the last byte are not masked out
85
+ nk_vu64x2_t intersection_u64x2 = vec_splats((nk_u64_t)0);
86
+ nk_vu64x2_t union_u64x2 = vec_splats((nk_u64_t)0);
87
+ nk_size_t i = 0;
88
+ for (; i + 16 <= n_bytes; i += 16) { // main loop: 16 bytes per iteration, popcounts accumulated per doubleword
89
+ nk_vu8x16_t a_u8x16 = vec_xl(0, (nk_u8_t const *)(a + i));
90
+ nk_vu8x16_t b_u8x16 = vec_xl(0, (nk_u8_t const *)(b + i));
91
+ nk_vu64x2_t and_popcnt_u64x2 = vec_popcnt((nk_vu64x2_t)vec_and(a_u8x16, b_u8x16));
92
+ nk_vu64x2_t or_popcnt_u64x2 = vec_popcnt((nk_vu64x2_t)vec_or(a_u8x16, b_u8x16));
93
+ intersection_u64x2 = vec_add(intersection_u64x2, and_popcnt_u64x2);
94
+ union_u64x2 = vec_add(union_u64x2, or_popcnt_u64x2);
95
+ }
96
+ // Branchless tail: only `remaining_bytes` bytes are loaded; the rest of each register is expected to be zero, adding nothing to either count
97
+ nk_size_t remaining_bytes = n_bytes - i;
98
+ nk_vu8x16_t a_u8x16 = vec_xl_len((nk_u8_t *)(a + i), remaining_bytes);
99
+ nk_vu8x16_t b_u8x16 = vec_xl_len((nk_u8_t *)(b + i), remaining_bytes);
100
+ nk_vu64x2_t and_popcnt_u64x2 = vec_popcnt((nk_vu64x2_t)vec_and(a_u8x16, b_u8x16));
101
+ nk_vu64x2_t or_popcnt_u64x2 = vec_popcnt((nk_vu64x2_t)vec_or(a_u8x16, b_u8x16));
102
+ intersection_u64x2 = vec_add(intersection_u64x2, and_popcnt_u64x2);
103
+ union_u64x2 = vec_add(union_u64x2, or_popcnt_u64x2);
104
+ nk_u32_t intersection_count = (nk_u32_t)nk_hsum_u64x2_powervsx_(intersection_u64x2);
105
+ nk_u32_t union_count = (nk_u32_t)nk_hsum_u64x2_powervsx_(union_u64x2);
106
+ *result = (union_count != 0) ? 1.0f - (nk_f32_t)intersection_count / (nk_f32_t)union_count : 0.0f; // an empty union yields distance 0.0
107
+ }
108
+
109
+ NK_PUBLIC void nk_hamming_u8_powervsx(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result) { // Counts the byte positions (out of `n`) where `a` and `b` differ; the count is written to `*result`
110
+ nk_vu32x4_t differences_u32x4 = vec_splats((nk_u32_t)0);
111
+ nk_vu8x16_t ones_u8x16 = vec_splats((nk_u8_t)1);
112
+ nk_size_t i = 0;
113
+ // Process 16 bytes at a time using vec_cmpne
114
+ for (; i + 16 <= n; i += 16) {
115
+ nk_vu8x16_t a_u8x16 = vec_xl(0, (nk_u8_t const *)(a + i));
116
+ nk_vu8x16_t b_u8x16 = vec_xl(0, (nk_u8_t const *)(b + i));
117
+ // vec_cmpne returns 0xFF for not-equal, 0x00 for equal
118
+ // AND with 1 to get 0x01 for not-equal, then sum groups of 4 bytes → u32
119
+ nk_vu8x16_t not_equal_u8x16 = vec_and((nk_vu8x16_t)vec_cmpne(a_u8x16, b_u8x16), ones_u8x16);
120
+ differences_u32x4 = vec_sum4s(not_equal_u8x16, differences_u32x4);
121
+ }
122
+ // Branchless tail: bytes beyond `remaining_bytes` are expected to be zero-filled in both registers, so they compare equal and add nothing
123
+ nk_size_t remaining_bytes = n - i;
124
+ nk_vu8x16_t a_u8x16 = vec_xl_len((nk_u8_t *)(a + i), remaining_bytes);
125
+ nk_vu8x16_t b_u8x16 = vec_xl_len((nk_u8_t *)(b + i), remaining_bytes);
126
+ nk_vu8x16_t not_equal_u8x16 = vec_and((nk_vu8x16_t)vec_cmpne(a_u8x16, b_u8x16), ones_u8x16);
127
+ differences_u32x4 = vec_sum4s(not_equal_u8x16, differences_u32x4);
128
+ *result = nk_hsum_u32x4_powervsx_(differences_u32x4);
129
+ }
130
+
131
+ typedef struct nk_hamming_u1x128_state_powervsx_t { // Running accumulator for the streaming Hamming kernels
132
+ nk_vu32x4_t intersection_count_u32x4; // despite the name, holds per-lane popcounts of A XOR B (differing bits)
133
+ } nk_hamming_u1x128_state_powervsx_t;
134
+
135
+ NK_INTERNAL void nk_hamming_u1x128_init_powervsx(nk_hamming_u1x128_state_powervsx_t *state) { // Resets the accumulator before streaming
136
+ state->intersection_count_u32x4 = vec_splats((nk_u32_t)0); // all four u32 lanes start at zero
137
+ }
138
+
139
+ NK_INTERNAL void nk_hamming_u1x128_update_powervsx(nk_hamming_u1x128_state_powervsx_t *state, nk_b128_vec_t a,
140
+ nk_b128_vec_t b, nk_size_t depth_offset,
141
+ nk_size_t active_dimensions) {
142
+ nk_unused_(depth_offset); // ignored by this kernel
143
+ nk_unused_(active_dimensions); // ignored by this kernel: all 128 bits of `a` and `b` are always processed
144
+
145
+ // Process one 128-bit chunk (native VSX register size).
146
+ // Uses vector accumulation → horizontal sum deferred to finalize.
147
+ //
148
+ // Power9 VSX instruction characteristics:
149
+ // - `vec_xor`: xxlxor (XT, XA, XB) 1cy, bitwise XOR
150
+ // - `vec_popcnt`: vpopcntw (VRT, VRB) 3cy, word popcount. NOTE(review): the file header table lists 2cy, confirm
151
+ // - `vec_add`: vadduwm (VRT, VRA, VRB) 2cy, u32 add
152
+ // Total: ~6cy per 128-bit chunk (horizontal sum deferred to finalize)
153
+
154
+ // Step 1: Compute difference bits (A XOR B)
155
+ nk_vu8x16_t a_u8x16 = *(nk_vu8x16_t *)&a;
156
+ nk_vu8x16_t b_u8x16 = *(nk_vu8x16_t *)&b;
157
+ nk_vu8x16_t xor_u8x16 = vec_xor(a_u8x16, b_u8x16);
158
+
159
+ // Step 2: Word popcount → each u32 lane contains set bits for 4 bytes
160
+ nk_vu32x4_t popcnt_u32x4 = vec_popcnt((nk_vu32x4_t)xor_u8x16);
161
+
162
+ // Step 3: Vector accumulation (defers horizontal sum to finalize)
163
+ state->intersection_count_u32x4 = vec_add(state->intersection_count_u32x4, popcnt_u32x4); // field name says `intersection`, but here it accumulates difference counts
164
+ }
165
+
166
+ NK_INTERNAL void nk_hamming_u1x128_finalize_powervsx( //
167
+ nk_hamming_u1x128_state_powervsx_t const *state_a, nk_hamming_u1x128_state_powervsx_t const *state_b,
168
+ nk_hamming_u1x128_state_powervsx_t const *state_c, nk_hamming_u1x128_state_powervsx_t const *state_d,
169
+ nk_size_t total_dimensions, nk_b128_vec_t *result) { // Reduces four accumulators at once: each u32 lane of `result->vu32x4` receives the total of one state
170
+ nk_unused_(total_dimensions); // ignored by this kernel
171
+
172
+ nk_vu32x4_t a_u32x4 = state_a->intersection_count_u32x4, b_u32x4 = state_b->intersection_count_u32x4,
173
+ c_u32x4 = state_c->intersection_count_u32x4, d_u32x4 = state_d->intersection_count_u32x4;
174
+ nk_vu32x4_t transpose_ab_low_u32x4 = vec_mergeh(a_u32x4, b_u32x4); // 4x4 transpose, stage 1: interleave the u32 lanes of a/b and of c/d
175
+ nk_vu32x4_t transpose_cd_low_u32x4 = vec_mergeh(c_u32x4, d_u32x4);
176
+ nk_vu32x4_t transpose_ab_high_u32x4 = vec_mergel(a_u32x4, b_u32x4);
177
+ nk_vu32x4_t transpose_cd_high_u32x4 = vec_mergel(c_u32x4, d_u32x4);
178
+ nk_vu32x4_t sum_lane0_u32x4 = (nk_vu32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_low_u32x4, // stage 2: pair one doubleword of ab with one of cd, so each vector holds the same lane index of a, b, c, d
179
+ (nk_vu64x2_t)transpose_cd_low_u32x4, 0);
180
+ nk_vu32x4_t sum_lane1_u32x4 = (nk_vu32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_low_u32x4,
181
+ (nk_vu64x2_t)transpose_cd_low_u32x4, 3);
182
+ nk_vu32x4_t sum_lane2_u32x4 = (nk_vu32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_high_u32x4,
183
+ (nk_vu64x2_t)transpose_cd_high_u32x4, 0);
184
+ nk_vu32x4_t sum_lane3_u32x4 = (nk_vu32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_high_u32x4,
185
+ (nk_vu64x2_t)transpose_cd_high_u32x4, 3);
186
+ result->vu32x4 = vec_add(vec_add(sum_lane0_u32x4, sum_lane1_u32x4), vec_add(sum_lane2_u32x4, sum_lane3_u32x4)); // adding the four transposed rows completes the per-state horizontal sums
187
+ }
188
+
189
+ typedef struct nk_jaccard_u1x128_state_powervsx_t { // Running accumulator for the streaming Jaccard kernels
190
+ nk_vu32x4_t intersection_count_u32x4; // per-lane popcounts of A AND B; the union is derived from popcounts at finalize time
191
+ } nk_jaccard_u1x128_state_powervsx_t;
192
+
193
+ NK_INTERNAL void nk_jaccard_u1x128_init_powervsx(nk_jaccard_u1x128_state_powervsx_t *state) { // Resets the accumulator before streaming
194
+ state->intersection_count_u32x4 = vec_splats((nk_u32_t)0); // all four u32 lanes start at zero
195
+ }
196
+
197
+ NK_INTERNAL void nk_jaccard_u1x128_update_powervsx(nk_jaccard_u1x128_state_powervsx_t *state, nk_b128_vec_t a,
198
+ nk_b128_vec_t b, nk_size_t depth_offset,
199
+ nk_size_t active_dimensions) {
200
+ nk_unused_(depth_offset); // ignored by this kernel
201
+ nk_unused_(active_dimensions); // ignored by this kernel: all 128 bits of `a` and `b` are always processed
202
+
203
+ // Process one 128-bit chunk (native VSX register size).
204
+ // Uses vector accumulation → horizontal sum deferred to finalize.
205
+ //
206
+ // Power9 VSX instruction characteristics:
207
+ // - `vec_and`: xxland (XT, XA, XB) 1cy, bitwise AND
208
+ // - `vec_popcnt`: vpopcntw (VRT, VRB) 3cy, word popcount. NOTE(review): the file header table lists 2cy, confirm
209
+ // - `vec_add`: vadduwm (VRT, VRA, VRB) 2cy, u32 add
210
+ // Total: ~6cy per 128-bit chunk (horizontal sum deferred to finalize)
211
+
212
+ // Step 1: Compute intersection bits (A AND B)
213
+ nk_vu8x16_t a_u8x16 = *(nk_vu8x16_t *)&a;
214
+ nk_vu8x16_t b_u8x16 = *(nk_vu8x16_t *)&b;
215
+ nk_vu8x16_t intersection_u8x16 = vec_and(a_u8x16, b_u8x16);
216
+
217
+ // Step 2: Word popcount → each u32 lane contains set bits for 4 bytes
218
+ nk_vu32x4_t popcnt_u32x4 = vec_popcnt((nk_vu32x4_t)intersection_u8x16);
219
+
220
+ // Step 3: Vector accumulation (defers horizontal sum to finalize)
221
+ state->intersection_count_u32x4 = vec_add(state->intersection_count_u32x4, popcnt_u32x4);
222
+ }
223
+
224
+ NK_INTERNAL void nk_jaccard_u1x128_finalize_powervsx( //
225
+ nk_jaccard_u1x128_state_powervsx_t const *state_a, nk_jaccard_u1x128_state_powervsx_t const *state_b,
226
+ nk_jaccard_u1x128_state_powervsx_t const *state_c, nk_jaccard_u1x128_state_powervsx_t const *state_d,
227
+ nk_f32_t query_popcount, nk_f32_t target_popcount_a, nk_f32_t target_popcount_b, nk_f32_t target_popcount_c,
228
+ nk_f32_t target_popcount_d, nk_size_t total_dimensions, nk_b128_vec_t *result) { // Turns four intersection accumulators into four Jaccard distances stored in `result->vf32x4`
229
+ nk_unused_(total_dimensions); // ignored by this kernel
230
+
231
+ // Transpose-based 4-way horizontal sum of u32x4 intersection counts: one total per state, one state per lane
232
+ nk_vu32x4_t a_u32x4 = state_a->intersection_count_u32x4, b_u32x4 = state_b->intersection_count_u32x4,
233
+ c_u32x4 = state_c->intersection_count_u32x4, d_u32x4 = state_d->intersection_count_u32x4;
234
+ nk_vu32x4_t transpose_ab_low_u32x4 = vec_mergeh(a_u32x4, b_u32x4);
235
+ nk_vu32x4_t transpose_cd_low_u32x4 = vec_mergeh(c_u32x4, d_u32x4);
236
+ nk_vu32x4_t transpose_ab_high_u32x4 = vec_mergel(a_u32x4, b_u32x4);
237
+ nk_vu32x4_t transpose_cd_high_u32x4 = vec_mergel(c_u32x4, d_u32x4);
238
+ nk_vu32x4_t sum_lane0_u32x4 = (nk_vu32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_low_u32x4,
239
+ (nk_vu64x2_t)transpose_cd_low_u32x4, 0);
240
+ nk_vu32x4_t sum_lane1_u32x4 = (nk_vu32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_low_u32x4,
241
+ (nk_vu64x2_t)transpose_cd_low_u32x4, 3);
242
+ nk_vu32x4_t sum_lane2_u32x4 = (nk_vu32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_high_u32x4,
243
+ (nk_vu64x2_t)transpose_cd_high_u32x4, 0);
244
+ nk_vu32x4_t sum_lane3_u32x4 = (nk_vu32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_high_u32x4,
245
+ (nk_vu64x2_t)transpose_cd_high_u32x4, 3);
246
+ nk_vu32x4_t intersection_u32x4 = vec_add(vec_add(sum_lane0_u32x4, sum_lane1_u32x4),
247
+ vec_add(sum_lane2_u32x4, sum_lane3_u32x4));
248
+ nk_vf32x4_t intersection_f32x4 = vec_ctf(intersection_u32x4, 0); // scale argument 0: plain u32 → f32 conversion
249
+
250
+ // Build target popcounts vector via vec_insert (lanes 0..3 = targets a..d)
251
+ nk_vf32x4_t targets_f32x4 = vec_splats(0.0f);
252
+ targets_f32x4 = vec_insert(target_popcount_a, targets_f32x4, 0);
253
+ targets_f32x4 = vec_insert(target_popcount_b, targets_f32x4, 1);
254
+ targets_f32x4 = vec_insert(target_popcount_c, targets_f32x4, 2);
255
+ targets_f32x4 = vec_insert(target_popcount_d, targets_f32x4, 3);
256
+ nk_vf32x4_t query_f32x4 = vec_splats(query_popcount);
257
+
258
+ // Compute union using |A ∪ B| = |A| + |B| - |A ∩ B|
259
+ nk_vf32x4_t union_f32x4 = vec_sub(vec_add(query_f32x4, targets_f32x4), intersection_f32x4);
260
+
261
+ // Handle zero-union edge case (empty vectors → distance = 0.0)
262
+ nk_vf32x4_t one_f32x4 = vec_splats(1.0f);
263
+ nk_vf32x4_t zero_f32x4 = vec_splats(0.0f);
264
+ nk_vu32x4_t zero_union_mask_u32x4 = (nk_vu32x4_t)vec_cmpeq(union_f32x4, zero_f32x4);
265
+ nk_vf32x4_t safe_union_f32x4 = vec_sel(union_f32x4, one_f32x4, zero_union_mask_u32x4); // zero unions are replaced by 1.0 so the reciprocal below stays finite
266
+
267
+ // Fast reciprocal estimate, refined below
268
+ nk_vf32x4_t union_reciprocal_f32x4 = vec_re(safe_union_f32x4);
269
+ // One Newton-Raphson step: reciprocal = reciprocal × (2 - value × reciprocal)
270
+ nk_vf32x4_t two_f32x4 = vec_splats(2.0f);
271
+ union_reciprocal_f32x4 = vec_mul(union_reciprocal_f32x4,
272
+ vec_sub(two_f32x4, vec_mul(safe_union_f32x4, union_reciprocal_f32x4)));
273
+
274
+ // Compute Jaccard distance = 1 - intersection / union
275
+ nk_vf32x4_t ratio_f32x4 = vec_mul(intersection_f32x4, union_reciprocal_f32x4);
276
+ nk_vf32x4_t jaccard_f32x4 = vec_sub(one_f32x4, ratio_f32x4);
277
+ result->vf32x4 = vec_sel(jaccard_f32x4, zero_f32x4, zero_union_mask_u32x4); // lanes whose union was zero are forced to 0.0
278
+ }
279
+
280
+ /** @brief Hamming from_dot: computes `query_pop + target_pops - 2 × dots` per u32 lane for 4 pairs (Power VSX). */
281
+ NK_INTERNAL void nk_hamming_u32x4_from_dot_powervsx_(nk_b128_vec_t dots, nk_u32_t query_pop, nk_b128_vec_t target_pops,
282
+ nk_b128_vec_t *results) {
283
+ nk_vu32x4_t dots_u32x4 = dots.vu32x4;
284
+ nk_vu32x4_t query_u32x4 = vec_splats(query_pop); // the same query popcount is used for all four pairs
285
+ nk_vu32x4_t target_u32x4 = target_pops.vu32x4;
286
+ nk_vu32x4_t two_dots_u32x4 = vec_add(dots_u32x4, dots_u32x4); // 2 × dot computed as dot + dot
287
+ results->vu32x4 = vec_sub(vec_add(query_u32x4, target_u32x4), two_dots_u32x4);
288
+ }
289
+
290
+ /** @brief Jaccard from_dot: computes 1 - dot / (pop_a + pop_b - dot) for 4 pairs, or 0 where that union is 0 (Power VSX). */
291
+ NK_INTERNAL void nk_jaccard_f32x4_from_dot_powervsx_(nk_b128_vec_t dots, nk_u32_t query_pop, nk_b128_vec_t target_pops,
292
+ nk_b128_vec_t *results) {
293
+ nk_vf32x4_t dot_f32x4 = vec_ctf(dots.vu32x4, 0); // scale argument 0: plain u32 → f32 conversion
294
+ nk_vf32x4_t query_f32x4 = vec_splats((nk_f32_t)query_pop);
295
+ nk_vf32x4_t target_f32x4 = vec_ctf(target_pops.vu32x4, 0);
296
+ nk_vf32x4_t union_f32x4 = vec_sub(vec_add(query_f32x4, target_f32x4), dot_f32x4); // union = query + target - intersection
297
+
298
+ nk_vf32x4_t one_f32x4 = vec_splats(1.0f);
299
+ nk_vf32x4_t zero_f32x4 = vec_splats(0.0f);
300
+ nk_vu32x4_t zero_union_mask_u32x4 = (nk_vu32x4_t)vec_cmpeq(union_f32x4, zero_f32x4);
301
+ nk_vf32x4_t safe_union_f32x4 = vec_sel(union_f32x4, one_f32x4, zero_union_mask_u32x4); // zero unions are replaced by 1.0 so the reciprocal below stays finite
302
+
303
+ // Fast reciprocal estimate refined by one Newton-Raphson step: reciprocal × (2 - value × reciprocal)
304
+ nk_vf32x4_t union_reciprocal_f32x4 = vec_re(safe_union_f32x4);
305
+ nk_vf32x4_t two_f32x4 = vec_splats(2.0f);
306
+ union_reciprocal_f32x4 = vec_mul(union_reciprocal_f32x4,
307
+ vec_sub(two_f32x4, vec_mul(safe_union_f32x4, union_reciprocal_f32x4)));
308
+
309
+ nk_vf32x4_t ratio_f32x4 = vec_mul(dot_f32x4, union_reciprocal_f32x4);
310
+ nk_vf32x4_t jaccard_f32x4 = vec_sub(one_f32x4, ratio_f32x4);
311
+ results->vf32x4 = vec_sel(jaccard_f32x4, zero_f32x4, zero_union_mask_u32x4); // lanes whose union was zero are forced to 0.0
312
+ }
313
+
314
+ #if defined(__clang__)
315
+ #pragma clang attribute pop
316
+ #elif defined(__GNUC__)
317
+ #pragma GCC pop_options
318
+ #endif
319
+
320
+ #if defined(__cplusplus)
321
+ } // extern "C"
322
+ #endif
323
+
324
+ #endif // NK_TARGET_POWERVSX
325
+ #endif // NK_TARGET_POWER_
326
+ #endif // NK_SET_POWERVSX_H
@@ -50,7 +50,7 @@
50
50
  extern "C" {
51
51
  #endif
52
52
 
53
- #pragma region - Binary Sets
53
+ #pragma region Binary Sets
54
54
 
55
55
  /**
56
56
  * @brief Compute byte-level popcount using arithmetic SWAR.
@@ -142,9 +142,9 @@ NK_PUBLIC void nk_jaccard_u1_rvv(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size
142
142
  *result = (union_count_u32 != 0) ? 1.0f - (nk_f32_t)intersection_count_u32 / (nk_f32_t)union_count_u32 : 0.0f;
143
143
  }
144
144
 
145
- #pragma endregion - Binary Sets
145
+ #pragma endregion Binary Sets
146
146
 
147
- #pragma region - Integer Sets
147
+ #pragma region Integer Sets
148
148
 
149
149
  NK_PUBLIC void nk_hamming_u8_rvv(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result) {
150
150
  vuint32m1_t difference_count_u32m1 = __riscv_vmv_v_x_u32m1(0, 1);
@@ -209,7 +209,7 @@ NK_PUBLIC void nk_jaccard_u16_rvv(nk_u16_t const *a, nk_u16_t const *b, nk_size_
209
209
  *result = (n != 0) ? 1.0f - (nk_f32_t)match_count_u32 / (nk_f32_t)n : 0.0f;
210
210
  }
211
211
 
212
- #pragma endregion - Integer Sets
212
+ #pragma endregion Integer Sets
213
213
 
214
214
  #if defined(__cplusplus)
215
215
  } // extern "C"
@@ -35,7 +35,7 @@
35
35
  extern "C" {
36
36
  #endif
37
37
 
38
- #pragma region - Binary Sets
38
+ #pragma region Binary Sets
39
39
 
40
40
  NK_PUBLIC void nk_hamming_u1_serial(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result) {
41
41
  nk_size_t n_bytes = nk_size_divide_round_up_(n, NK_BITS_PER_BYTE);
@@ -52,9 +52,9 @@ NK_PUBLIC void nk_jaccard_u1_serial(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_s
52
52
  *result = (union_count != 0) ? 1.0f - (nk_f32_t)intersection_count / (nk_f32_t)union_count : 0.0f;
53
53
  }
54
54
 
55
- #pragma endregion - Binary Sets
55
+ #pragma endregion Binary Sets
56
56
 
57
- #pragma region - Integer Sets
57
+ #pragma region Integer Sets
58
58
 
59
59
  NK_PUBLIC void nk_jaccard_u32_serial(nk_u32_t const *a, nk_u32_t const *b, nk_size_t n, nk_f32_t *result) {
60
60
  nk_u32_t intersection_count = 0;
@@ -74,9 +74,9 @@ NK_PUBLIC void nk_jaccard_u16_serial(nk_u16_t const *a, nk_u16_t const *b, nk_si
74
74
  *result = (n != 0) ? 1.0f - (nk_f32_t)matches / (nk_f32_t)n : 0.0f;
75
75
  }
76
76
 
77
- #pragma endregion - Integer Sets
77
+ #pragma endregion Integer Sets
78
78
 
79
- #pragma region - Stateful Streaming
79
+ #pragma region Stateful Streaming
80
80
 
81
81
  typedef struct nk_jaccard_u1x128_state_serial_t {
82
82
  nk_u64_t intersection_count;
@@ -165,7 +165,7 @@ NK_INTERNAL void nk_jaccard_f32x4_from_dot_serial_(nk_b128_vec_t dots, nk_u32_t
165
165
  }
166
166
  }
167
167
 
168
- #pragma endregion - Stateful Streaming
168
+ #pragma endregion Stateful Streaming
169
169
 
170
170
  #if defined(__cplusplus)
171
171
  } // extern "C"