numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315) hide show
  1. package/README.md +197 -124
  2. package/binding.gyp +34 -484
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -8,23 +8,23 @@
8
8
  *
9
9
  * @section set_sve_instructions ARM SVE Instructions
10
10
  *
11
- * Intrinsic Instruction Latency Throughput
12
- * svld1_u8 LD1B (Z.B, P/Z, [Xn]) 4-6cy 2/cy
13
- * svld1_u32 LD1W (Z.S, P/Z, [Xn]) 4-6cy 2/cy
14
- * sveor_u8_m EOR (Z.B, P/M, Z.B, Z.B) 1cy 2/cy
15
- * svand_u8_m AND (Z.B, P/M, Z.B, Z.B) 1cy 2/cy
16
- * svorr_u8_m ORR (Z.B, P/M, Z.B, Z.B) 1cy 2/cy
17
- * svcnt_u8_x CNT (Z.B, P/M, Z.B) 2cy 2/cy
18
- * svadd_u8_z ADD (Z.B, P/M, Z.B, Z.B) 1cy 2/cy
19
- * svaddv_u8 UADDV (D, P, Z.B) 6cy 1/cy
20
- * svcmpeq_u32 CMPEQ (P.S, P/Z, Z.S, Z.S) 2cy 1/cy
21
- * svcntp_b32 CNTP (Xd, P, P.S) 2cy 1/cy
22
- * svdup_n_u8 DUP (Z.B, #imm) 1cy 2/cy
23
- * svwhilelt_b8 WHILELT (P.B, Xn, Xm) 2cy 1/cy
24
- * svwhilelt_b32 WHILELT (P.S, Xn, Xm) 2cy 1/cy
25
- * svptrue_b8 PTRUE (P.B, pattern) 1cy 2/cy
26
- * svcntb CNTB (Xd) 1cy 2/cy
27
- * svcntw CNTW (Xd) 1cy 2/cy
11
+ * Intrinsic Instruction V1
12
+ * svld1_u8 LD1B (Z.B, P/Z, [Xn]) 4-6cy @ 2p
13
+ * svld1_u32 LD1W (Z.S, P/Z, [Xn]) 4-6cy @ 2p
14
+ * sveor_u8_m EOR (Z.B, P/M, Z.B, Z.B) 1cy @ 2p
15
+ * svand_u8_m AND (Z.B, P/M, Z.B, Z.B) 1cy @ 2p
16
+ * svorr_u8_m ORR (Z.B, P/M, Z.B, Z.B) 1cy @ 2p
17
+ * svcnt_u8_x CNT (Z.B, P/M, Z.B) 2cy @ 2p
18
+ * svadd_u8_z ADD (Z.B, P/M, Z.B, Z.B) 1cy @ 2p
19
+ * svaddv_u8 UADDV (D, P, Z.B) 6cy @ 1p
20
+ * svcmpeq_u32 CMPEQ (P.S, P/Z, Z.S, Z.S) 2cy @ 1p
21
+ * svcntp_b32 CNTP (Xd, P, P.S) 2cy @ 1p
22
+ * svdup_n_u8 DUP (Z.B, #imm) 1cy @ 2p
23
+ * svwhilelt_b8 WHILELT (P.B, Xn, Xm) 2cy @ 1p
24
+ * svwhilelt_b32 WHILELT (P.S, Xn, Xm) 2cy @ 1p
25
+ * svptrue_b8 PTRUE (P.B, pattern) 1cy @ 2p
26
+ * svcntb CNTB (Xd) 1cy @ 2p
27
+ * svcntw CNTW (Xd) 1cy @ 2p
28
28
  */
29
29
  #ifndef NK_SET_SVE_H
30
30
  #define NK_SET_SVE_H
@@ -46,7 +46,7 @@ extern "C" {
46
46
  #pragma GCC target("arch=armv8.2-a+sve")
47
47
  #endif
48
48
 
49
- #pragma region - Binary Sets
49
+ #pragma region Binary Sets
50
50
 
51
51
  NK_PUBLIC void nk_hamming_u1_sve(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result) {
52
52
  nk_size_t n_bytes = nk_size_divide_round_up_(n, NK_BITS_PER_BYTE);
@@ -61,20 +61,20 @@ NK_PUBLIC void nk_hamming_u1_sve(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size
61
61
  // On larger register sizes, SVE is faster.
62
62
  nk_size_t i = 0, cycle = 0;
63
63
  nk_u32_t differences = 0;
64
- svuint8_t popcount_u8 = svdup_n_u8(0);
65
- svbool_t const all_predicate = svptrue_b8();
64
+ svuint8_t popcount_u8x = svdup_n_u8(0);
65
+ svbool_t const all_predicate_b8x = svptrue_b8();
66
66
  while (i < n_bytes) {
67
67
  do {
68
- svbool_t active_predicate = svwhilelt_b8_u64(i, n_bytes);
69
- svuint8_t a_u8 = svld1_u8(active_predicate, a + i);
70
- svuint8_t b_u8 = svld1_u8(active_predicate, b + i);
71
- popcount_u8 = svadd_u8_z(all_predicate, popcount_u8,
72
- svcnt_u8_x(all_predicate, sveor_u8_m(all_predicate, a_u8, b_u8)));
68
+ svbool_t active_predicate_b8x = svwhilelt_b8_u64(i, n_bytes);
69
+ svuint8_t a_u8x = svld1_u8(active_predicate_b8x, a + i);
70
+ svuint8_t b_u8x = svld1_u8(active_predicate_b8x, b + i);
71
+ popcount_u8x = svadd_u8_z(all_predicate_b8x, popcount_u8x,
72
+ svcnt_u8_x(all_predicate_b8x, sveor_u8_m(all_predicate_b8x, a_u8x, b_u8x)));
73
73
  i += words_per_register;
74
74
  ++cycle;
75
75
  } while (i < n_bytes && cycle < 31);
76
- differences += svaddv_u8(all_predicate, popcount_u8);
77
- popcount_u8 = svdup_n_u8(0);
76
+ differences += svaddv_u8(all_predicate_b8x, popcount_u8x);
77
+ popcount_u8x = svdup_n_u8(0);
78
78
  cycle = 0; // Reset the cycle counter.
79
79
  }
80
80
 
@@ -94,45 +94,46 @@ NK_PUBLIC void nk_jaccard_u1_sve(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size
94
94
  // On larger register sizes, SVE is faster.
95
95
  nk_size_t i = 0, cycle = 0;
96
96
  nk_u32_t intersection_count = 0, union_count = 0;
97
- svuint8_t intersection_popcount_u8 = svdup_n_u8(0);
98
- svuint8_t union_popcount_u8 = svdup_n_u8(0);
99
- svbool_t const all_predicate = svptrue_b8();
97
+ svuint8_t intersection_popcount_u8x = svdup_n_u8(0);
98
+ svuint8_t union_popcount_u8x = svdup_n_u8(0);
99
+ svbool_t const all_predicate_b8x = svptrue_b8();
100
100
  while (i < n_bytes) {
101
101
  do {
102
- svbool_t active_predicate = svwhilelt_b8_u64(i, n_bytes);
103
- svuint8_t a_u8 = svld1_u8(active_predicate, a + i);
104
- svuint8_t b_u8 = svld1_u8(active_predicate, b + i);
105
- intersection_popcount_u8 = svadd_u8_z(all_predicate, intersection_popcount_u8,
106
- svcnt_u8_x(all_predicate, svand_u8_m(all_predicate, a_u8, b_u8)));
107
- union_popcount_u8 = svadd_u8_z(all_predicate, union_popcount_u8,
108
- svcnt_u8_x(all_predicate, svorr_u8_m(all_predicate, a_u8, b_u8)));
102
+ svbool_t active_predicate_b8x = svwhilelt_b8_u64(i, n_bytes);
103
+ svuint8_t a_u8x = svld1_u8(active_predicate_b8x, a + i);
104
+ svuint8_t b_u8x = svld1_u8(active_predicate_b8x, b + i);
105
+ intersection_popcount_u8x = svadd_u8_z(
106
+ all_predicate_b8x, intersection_popcount_u8x,
107
+ svcnt_u8_x(all_predicate_b8x, svand_u8_m(all_predicate_b8x, a_u8x, b_u8x)));
108
+ union_popcount_u8x = svadd_u8_z(all_predicate_b8x, union_popcount_u8x,
109
+ svcnt_u8_x(all_predicate_b8x, svorr_u8_m(all_predicate_b8x, a_u8x, b_u8x)));
109
110
  i += words_per_register;
110
111
  ++cycle;
111
112
  } while (i < n_bytes && cycle < 31);
112
- intersection_count += svaddv_u8(all_predicate, intersection_popcount_u8);
113
- intersection_popcount_u8 = svdup_n_u8(0);
114
- union_count += svaddv_u8(all_predicate, union_popcount_u8);
115
- union_popcount_u8 = svdup_n_u8(0);
113
+ intersection_count += svaddv_u8(all_predicate_b8x, intersection_popcount_u8x);
114
+ intersection_popcount_u8x = svdup_n_u8(0);
115
+ union_count += svaddv_u8(all_predicate_b8x, union_popcount_u8x);
116
+ union_popcount_u8x = svdup_n_u8(0);
116
117
  cycle = 0; // Reset the cycle counter.
117
118
  }
118
119
 
119
120
  *result = (union_count != 0) ? 1.0f - (nk_f32_t)intersection_count / (nk_f32_t)union_count : 0.0f;
120
121
  }
121
122
 
122
- #pragma endregion - Binary Sets
123
+ #pragma endregion Binary Sets
123
124
 
124
- #pragma region - Integer Sets
125
+ #pragma region Integer Sets
125
126
 
126
127
  NK_PUBLIC void nk_jaccard_u32_sve(nk_u32_t const *a, nk_u32_t const *b, nk_size_t n, nk_f32_t *result) {
127
128
  nk_size_t const words_per_register = svcntw();
128
129
  nk_size_t i = 0;
129
130
  nk_u32_t intersection_count = 0;
130
131
  while (i < n) {
131
- svbool_t active_predicate = svwhilelt_b32_u64(i, n);
132
- svuint32_t a_u32 = svld1_u32(active_predicate, a + i);
133
- svuint32_t b_u32 = svld1_u32(active_predicate, b + i);
134
- svbool_t equality_predicate = svcmpeq_u32(active_predicate, a_u32, b_u32);
135
- intersection_count += svcntp_b32(active_predicate, equality_predicate);
132
+ svbool_t active_predicate_b32x = svwhilelt_b32_u64(i, n);
133
+ svuint32_t a_u32x = svld1_u32(active_predicate_b32x, a + i);
134
+ svuint32_t b_u32x = svld1_u32(active_predicate_b32x, b + i);
135
+ svbool_t equality_predicate_b32x = svcmpeq_u32(active_predicate_b32x, a_u32x, b_u32x);
136
+ intersection_count += svcntp_b32(active_predicate_b32x, equality_predicate_b32x);
136
137
  i += words_per_register;
137
138
  }
138
139
  *result = (n != 0) ? 1.0f - (nk_f32_t)intersection_count / (nk_f32_t)n : 0.0f;
@@ -143,11 +144,11 @@ NK_PUBLIC void nk_hamming_u8_sve(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n
143
144
  nk_size_t i = 0;
144
145
  nk_u32_t differences = 0;
145
146
  while (i < n) {
146
- svbool_t active_predicate = svwhilelt_b8_u64(i, n);
147
- svuint8_t a_u8 = svld1_u8(active_predicate, a + i);
148
- svuint8_t b_u8 = svld1_u8(active_predicate, b + i);
149
- svbool_t neq_predicate = svcmpne_u8(active_predicate, a_u8, b_u8);
150
- differences += svcntp_b8(active_predicate, neq_predicate);
147
+ svbool_t active_predicate_b8x = svwhilelt_b8_u64(i, n);
148
+ svuint8_t a_u8x = svld1_u8(active_predicate_b8x, a + i);
149
+ svuint8_t b_u8x = svld1_u8(active_predicate_b8x, b + i);
150
+ svbool_t neq_predicate_b8x = svcmpne_u8(active_predicate_b8x, a_u8x, b_u8x);
151
+ differences += svcntp_b8(active_predicate_b8x, neq_predicate_b8x);
151
152
  i += bytes_per_register;
152
153
  }
153
154
  *result = differences;
@@ -158,17 +159,17 @@ NK_PUBLIC void nk_jaccard_u16_sve(nk_u16_t const *a, nk_u16_t const *b, nk_size_
158
159
  nk_size_t i = 0;
159
160
  nk_u32_t intersection_count = 0;
160
161
  while (i < n) {
161
- svbool_t active_predicate = svwhilelt_b16_u64(i, n);
162
- svuint16_t a_u16 = svld1_u16(active_predicate, a + i);
163
- svuint16_t b_u16 = svld1_u16(active_predicate, b + i);
164
- svbool_t equality_predicate = svcmpeq_u16(active_predicate, a_u16, b_u16);
165
- intersection_count += svcntp_b16(active_predicate, equality_predicate);
162
+ svbool_t active_predicate_b16x = svwhilelt_b16_u64(i, n);
163
+ svuint16_t a_u16x = svld1_u16(active_predicate_b16x, a + i);
164
+ svuint16_t b_u16x = svld1_u16(active_predicate_b16x, b + i);
165
+ svbool_t equality_predicate_b16x = svcmpeq_u16(active_predicate_b16x, a_u16x, b_u16x);
166
+ intersection_count += svcntp_b16(active_predicate_b16x, equality_predicate_b16x);
166
167
  i += halfwords_per_register;
167
168
  }
168
169
  *result = (n != 0) ? 1.0f - (nk_f32_t)intersection_count / (nk_f32_t)n : 0.0f;
169
170
  }
170
171
 
171
- #pragma endregion - Integer Sets
172
+ #pragma endregion Integer Sets
172
173
 
173
174
  #if defined(__clang__)
174
175
  #pragma clang attribute pop
@@ -30,7 +30,7 @@ extern "C" {
30
30
  #pragma clang attribute push(__attribute__((target("relaxed-simd"))), apply_to = function)
31
31
  #endif
32
32
 
33
- #pragma region - Binary Sets
33
+ #pragma region Binary Sets
34
34
 
35
35
  NK_PUBLIC void nk_hamming_u1_v128relaxed(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result) {
36
36
  nk_u8_t const *a_bytes = (nk_u8_t const *)a;
@@ -121,9 +121,9 @@ NK_PUBLIC void nk_jaccard_u1_v128relaxed(nk_u1x8_t const *a, nk_u1x8_t const *b,
121
121
  *result = union_count > 0 ? 1.0f - ((nk_f32_t)intersection / (nk_f32_t)union_count) : 0.0f;
122
122
  }
123
123
 
124
- #pragma endregion - Binary Sets
124
+ #pragma endregion Binary Sets
125
125
 
126
- #pragma region - Integer Sets
126
+ #pragma region Integer Sets
127
127
 
128
128
  NK_PUBLIC void nk_hamming_u8_v128relaxed(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result) {
129
129
  nk_u32_t sum_total = 0;
@@ -197,9 +197,9 @@ NK_PUBLIC void nk_jaccard_u16_v128relaxed(nk_u16_t const *a, nk_u16_t const *b,
197
197
  *result = (n != 0) ? 1.0f - (nk_f32_t)matches / (nk_f32_t)n : 0.0f;
198
198
  }
199
199
 
200
- #pragma endregion - Integer Sets
200
+ #pragma endregion Integer Sets
201
201
 
202
- #pragma region - Binary Sets from Dot
202
+ #pragma region Binary Sets from Dot
203
203
 
204
204
  NK_INTERNAL void nk_hamming_u32x4_from_dot_v128relaxed_( //
205
205
  nk_b128_vec_t dots, nk_u32_t query_pop, nk_b128_vec_t target_pops, nk_b128_vec_t *results) {
@@ -226,7 +226,7 @@ NK_INTERNAL void nk_jaccard_f32x4_from_dot_v128relaxed_( //
226
226
  results->v128 = wasm_i32x4_relaxed_laneselect(zero_f32x4, jaccard_f32x4, zero_mask_u32x4);
227
227
  }
228
228
 
229
- #pragma endregion - Binary Sets from Dot
229
+ #pragma endregion Binary Sets from Dot
230
230
 
231
231
  #if defined(__clang__)
232
232
  #pragma clang attribute pop
@@ -41,12 +41,12 @@
41
41
  * On binary vectors, when computing Jaccard distance, the CPU often struggles to compute the
42
42
  * large number of required population counts. There are several instructions we should keep in mind:
43
43
  *
44
- * Intrinsic Instruction Ice Genoa
45
- * _mm512_popcnt_epi64 VPOPCNTQ (ZMM, K, ZMM) 3cy @ p5 2cy @ p01
46
- * _mm512_shuffle_epi8 VPSHUFB (ZMM, ZMM, ZMM) 1cy @ p5 2cy @ p12
47
- * _mm512_sad_epu8 VPSADBW (ZMM, ZMM, ZMM) 3cy @ p5 3cy @ p01
48
- * _mm512_ternarylogic_epi64 VPTERNLOGQ (ZMM, ZMM, ZMM, I8) 1cy @ p05 1cy @ p0123
49
- * _mm512_gf2p8mul_epi8 VGF2P8MULB (ZMM, ZMM, ZMM) 5cy @ p0 3cy @ p01
44
+ * Intrinsic Instruction Icelake Genoa
45
+ * _mm512_popcnt_epi64 VPOPCNTQ (ZMM, K, ZMM) 3cy @ p5 2cy @ p01
46
+ * _mm512_shuffle_epi8 VPSHUFB (ZMM, ZMM, ZMM) 1cy @ p5 2cy @ p12
47
+ * _mm512_sad_epu8 VPSADBW (ZMM, ZMM, ZMM) 3cy @ p5 3cy @ p01
48
+ * _mm512_ternarylogic_epi64 VPTERNLOGQ (ZMM, ZMM, ZMM, I8) 1cy @ p05 1cy @ p0123
49
+ * _mm512_gf2p8mul_epi8 VGF2P8MULB (ZMM, ZMM, ZMM) 5cy @ p0 3cy @ p01
50
50
  *
51
51
  * On Ice Lake, VPOPCNTQ bottlenecks on port 5. On AMD Genoa/Turin, it dual-issues
52
52
  * on ports 0-1, making native popcount significantly faster without CSA tricks.
@@ -123,7 +123,7 @@
123
123
  *
124
124
  * @section references References
125
125
  *
126
- * - Intel Intrinsics Guide: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/
126
+ * - Intel Intrinsics Guide: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
127
127
  * - Arm Intrinsics Reference: https://developer.arm.com/architectures/instruction-sets/intrinsics/
128
128
  * - Muła et al. "Faster Population Counts": https://arxiv.org/pdf/1611.07612
129
129
  * - Muła SSE POPCOUNT experiments: https://github.com/WojciechMula/sse-popcount
@@ -344,9 +344,11 @@ NK_INTERNAL nk_dtype_t nk_jaccard_output_dtype(nk_dtype_t dtype) {
344
344
  #include "numkong/set/sve.h"
345
345
  #include "numkong/set/icelake.h"
346
346
  #include "numkong/set/haswell.h"
347
+ #include "numkong/set/powervsx.h"
347
348
  #include "numkong/set/v128relaxed.h"
348
349
  #include "numkong/set/rvv.h"
349
350
  #include "numkong/set/rvvbb.h"
351
+ #include "numkong/set/loongsonasx.h"
350
352
 
351
353
  #if defined(__cplusplus)
352
354
  extern "C" {
@@ -365,6 +367,10 @@ NK_PUBLIC void nk_hamming_u1(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n
365
367
  nk_hamming_u1_icelake(a, b, n, result);
366
368
  #elif NK_TARGET_HASWELL
367
369
  nk_hamming_u1_haswell(a, b, n, result);
370
+ #elif NK_TARGET_POWERVSX
371
+ nk_hamming_u1_powervsx(a, b, n, result);
372
+ #elif NK_TARGET_LOONGSONASX
373
+ nk_hamming_u1_loongsonasx(a, b, n, result);
368
374
  #elif NK_TARGET_RVVBB
369
375
  nk_hamming_u1_rvvbb(a, b, n, result);
370
376
  #elif NK_TARGET_RVV
@@ -385,6 +391,10 @@ NK_PUBLIC void nk_jaccard_u1(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n
385
391
  nk_jaccard_u1_icelake(a, b, n, result);
386
392
  #elif NK_TARGET_HASWELL
387
393
  nk_jaccard_u1_haswell(a, b, n, result);
394
+ #elif NK_TARGET_POWERVSX
395
+ nk_jaccard_u1_powervsx(a, b, n, result);
396
+ #elif NK_TARGET_LOONGSONASX
397
+ nk_jaccard_u1_loongsonasx(a, b, n, result);
388
398
  #elif NK_TARGET_RVVBB
389
399
  nk_jaccard_u1_rvvbb(a, b, n, result);
390
400
  #elif NK_TARGET_RVV
@@ -423,6 +433,10 @@ NK_PUBLIC void nk_hamming_u8(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk
423
433
  nk_hamming_u8_icelake(a, b, n, result);
424
434
  #elif NK_TARGET_HASWELL
425
435
  nk_hamming_u8_haswell(a, b, n, result);
436
+ #elif NK_TARGET_POWERVSX
437
+ nk_hamming_u8_powervsx(a, b, n, result);
438
+ #elif NK_TARGET_LOONGSONASX
439
+ nk_hamming_u8_loongsonasx(a, b, n, result);
426
440
  #elif NK_TARGET_RVV
427
441
  nk_hamming_u8_rvv(a, b, n, result);
428
442
  #else
@@ -4,17 +4,17 @@ NumKong implements batched M×N Hamming and Jaccard distance matrices for binary
4
4
 
5
5
  Hamming distance from batched dot products:
6
6
 
7
- ```math
7
+ $$
8
8
  D_{ij} = \|A_i\|_1 + \|B_j\|_1 - 2 \cdot \text{dot}(A_i, B_j)
9
- ```
9
+ $$
10
10
 
11
11
  Where dot = popcount(AND), measuring intersection size.
12
12
 
13
13
  Jaccard distance from batched dot products:
14
14
 
15
- ```math
15
+ $$
16
16
  D_{ij} = 1 - \frac{\text{dot}(A_i, B_j)}{\|A_i\|_1 + \|B_j\|_1 - \text{dot}(A_i, B_j)}
17
- ```
17
+ $$
18
18
 
19
19
  Reformulating as Python pseudocode:
20
20
 
@@ -112,38 +112,38 @@ Measured with Wasmtime v42 (Cranelift backend).
112
112
  | `nk_jaccards_symmetric_u1_serial` | 3.57 gso/s, 0 ulp | 13.3 gso/s, 0 ulp | 40.6 gso/s, 0 ulp |
113
113
  | `nk_jaccards_symmetric_u1_v128relaxed` | 3.65 gso/s, 0 ulp | 13.9 gso/s, 0 ulp | 42.2 gso/s, 0 ulp |
114
114
 
115
- ### Apple M4
115
+ ### Apple M5
116
116
 
117
117
  #### Native
118
118
 
119
119
  | Kernel | 256³ | 1024³ | 4096³ |
120
120
  | :--------------------------------- | -----------------------: | -----------------------: | -----------------------: |
121
121
  | __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
122
- | `nk_hammings_packed_u1_serial` | 154 gso/s | 204 gso/s | 221 gso/s |
123
- | `nk_hammings_symmetric_u1_serial` | 101 gso/s | 159 gso/s | 172 gso/s |
124
- | `nk_jaccards_packed_u1_serial` | 116 gso/s, 0 ulp | 203 gso/s, 0 ulp | 232 gso/s, 0 ulp |
125
- | `nk_jaccards_symmetric_u1_serial` | 86.3 gso/s, 0 ulp | 157 gso/s, 0 ulp | 176 gso/s, 0 ulp |
126
- | `nk_hammings_packed_u1_neon` | 315 gso/s | 428 gso/s | 481 gso/s |
127
- | `nk_hammings_symmetric_u1_neon` | 132 gso/s | 240 gso/s | 294 gso/s |
128
- | `nk_jaccards_packed_u1_neon` | 266 gso/s, 8.6 ulp | 416 gso/s, 8.6 ulp | 488 gso/s, 8.6 ulp |
129
- | `nk_jaccards_symmetric_u1_neon` | 129 gso/s, 8.5 ulp | 242 gso/s, 8.5 ulp | 294 gso/s, 8.5 ulp |
130
- | `nk_hammings_packed_u1_smebi32` | 1,420 gso/s | 2,928 gso/s | 4,027 gso/s |
131
- | `nk_hammings_symmetric_u1_smebi32` | 629 gso/s | 1,438 gso/s | 1,111 gso/s |
132
- | `nk_jaccards_packed_u1_smebi32` | 273 gso/s, 0 ulp | 1,381 gso/s, 0 ulp | 3,280 gso/s, 0 ulp |
133
- | `nk_jaccards_symmetric_u1_smebi32` | 45.1 gso/s, 0 ulp | 267 gso/s, 0 ulp | 618 gso/s, 0 ulp |
122
+ | `nk_hammings_packed_u1_serial` | 156 gso/s | 231 gso/s | 262 gso/s |
123
+ | `nk_hammings_symmetric_u1_serial` | 106 gso/s | 196 gso/s | 246 gso/s |
124
+ | `nk_jaccards_packed_u1_serial` | 136 gso/s, 0 ulp | 221 gso/s, 0 ulp | 262 gso/s, 0 ulp |
125
+ | `nk_jaccards_symmetric_u1_serial` | 96.5 gso/s, 0 ulp | 183 gso/s, 0 ulp | 244 gso/s, 0 ulp |
126
+ | `nk_hammings_packed_u1_neon` | 321 gso/s | 436 gso/s | 508 gso/s |
127
+ | `nk_hammings_symmetric_u1_neon` | 126 gso/s | 239 gso/s | 318 gso/s |
128
+ | `nk_jaccards_packed_u1_neon` | 271 gso/s, 0 ulp | 423 gso/s, 0 ulp | 503 gso/s, 0 ulp |
129
+ | `nk_jaccards_symmetric_u1_neon` | 120 gso/s, 0 ulp | 233 gso/s, 0 ulp | 316 gso/s, 0 ulp |
130
+ | `nk_hammings_packed_u1_smebi32` | 3,286 gso/s | 7,303 gso/s | 11,269 gso/s |
131
+ | `nk_hammings_symmetric_u1_smebi32` | 1,872 gso/s | 5,332 gso/s | 4,079 gso/s |
132
+ | `nk_jaccards_packed_u1_smebi32` | 371 gso/s, 0 ulp | 1,735 gso/s, 0 ulp | 4,348 gso/s, 0 ulp |
133
+ | `nk_jaccards_symmetric_u1_smebi32` | 83.1 gso/s, 0 ulp | 358 gso/s, 0 ulp | 1,005 gso/s, 0 ulp |
134
134
 
135
135
  #### WASM
136
136
 
137
- Measured with Wasmtime v42 (Cranelift backend).
137
+ Measured with Wasmtime v43 (Cranelift backend).
138
138
 
139
139
  | Kernel | 256³ | 1024³ | 4096³ |
140
140
  | :------------------------------------- | -----------------------: | -----------------------: | -----------------------: |
141
141
  | __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
142
- | `nk_hammings_packed_u1_serial` | 35.2 gso/s | 47.6 gso/s | 52.8 gso/s |
143
- | `nk_hammings_symmetric_u1_serial` | 25.4 gso/s | 51.5 gso/s | 129 gso/s |
144
- | `nk_jaccards_packed_u1_serial` | 30.9 gso/s, 0 ulp | 46.0 gso/s, 0 ulp | 52.7 gso/s, 0 ulp |
145
- | `nk_jaccards_symmetric_u1_serial` | 22.8 gso/s, 0 ulp | 48.9 gso/s, 0 ulp | 123 gso/s, 0 ulp |
146
- | `nk_hammings_packed_u1_v128relaxed` | 102 gso/s | 144 gso/s | 160 gso/s |
147
- | `nk_hammings_symmetric_u1_v128relaxed` | 28.2 gso/s | 61.7 gso/s | 175 gso/s |
148
- | `nk_jaccards_packed_u1_v128relaxed` | 91.2 gso/s, 0 ulp | 140 gso/s, 0 ulp | 172 gso/s, 0 ulp |
149
- | `nk_jaccards_symmetric_u1_v128relaxed` | 26.9 gso/s, 0 ulp | 60.3 gso/s, 0 ulp | 177 gso/s, 0 ulp |
142
+ | `nk_hammings_packed_u1_serial` | 99.3 gso/s | 127 gso/s | 154 gso/s |
143
+ | `nk_hammings_symmetric_u1_serial` | 63.7 gso/s | 142 gso/s | 210 gso/s |
144
+ | `nk_jaccards_packed_u1_serial` | 92.2 gso/s, 0 ulp | 123 gso/s, 0 ulp | 153 gso/s, 0 ulp |
145
+ | `nk_jaccards_symmetric_u1_serial` | 59.3 gso/s, 0 ulp | 142 gso/s, 0 ulp | 207 gso/s, 0 ulp |
146
+ | `nk_hammings_packed_u1_v128relaxed` | 266 gso/s | 378 gso/s | 426 gso/s |
147
+ | `nk_hammings_symmetric_u1_v128relaxed` | 72.2 gso/s | 185 gso/s | 259 gso/s |
148
+ | `nk_jaccards_packed_u1_v128relaxed` | 243 gso/s, 0 ulp | 370 gso/s, 0 ulp | 424 gso/s, 0 ulp |
149
+ | `nk_jaccards_symmetric_u1_v128relaxed` | 72.9 gso/s, 0 ulp | 183 gso/s, 0 ulp | 257 gso/s, 0 ulp |
@@ -0,0 +1,52 @@
1
+ /**
2
+ * @brief Batched Set Operations for LoongArch LASX (256-bit).
3
+ * @file include/numkong/sets/loongsonasx.h
4
+ * @author Ash Vardanian
5
+ * @date March 25, 2026
6
+ *
7
+ * @sa include/numkong/sets.h
8
+ */
9
+ #ifndef NK_SETS_LOONGSONASX_H
10
+ #define NK_SETS_LOONGSONASX_H
11
+
12
+ #if NK_TARGET_LOONGARCH_
13
+ #if NK_TARGET_LOONGSONASX
14
+
15
+ #include "numkong/set/loongsonasx.h"
16
+ #include "numkong/dots/loongsonasx.h"
17
+
18
+ #if defined(__cplusplus)
19
+ extern "C" {
20
+ #endif
21
+ // Packed u1 Hamming instantiation. NOTE(review): partial load/store are the *_serial_ helpers - confirm intended.
22
+ nk_define_cross_normalized_packed_(hamming, u1, loongsonasx, u1x8, u1x8, u32, /*norm_value_type=*/u32, u32,
23
+ nk_b128_vec_t, nk_dots_packed_u1_loongsonasx, nk_hamming_u32x4_from_dot_loongsonasx_,
24
+ nk_dots_reduce_sum_u1_, nk_load_b128_loongsonasx_, nk_partial_load_b32x4_serial_,
25
+ nk_store_b128_loongsonasx_, nk_partial_store_b32x4_serial_,
26
+ /*dimensions_per_value=*/8)
27
+ // Packed u1 Jaccard instantiation: 8th argument is f32 (u32 for Hamming) and the from-dot helper is the f32x4 one.
28
+ nk_define_cross_normalized_packed_(jaccard, u1, loongsonasx, u1x8, u1x8, u32, /*norm_value_type=*/u32, f32,
29
+ nk_b128_vec_t, nk_dots_packed_u1_loongsonasx, nk_jaccard_f32x4_from_dot_loongsonasx_,
30
+ nk_dots_reduce_sum_u1_, nk_load_b128_loongsonasx_, nk_partial_load_b32x4_serial_,
31
+ nk_store_b128_loongsonasx_, nk_partial_store_b32x4_serial_,
32
+ /*dimensions_per_value=*/8)
33
+ // Symmetric u1 Hamming instantiation: one u1x8 type argument (packed form passes two), nk_dots_symmetric_u1_* dots.
34
+ nk_define_cross_normalized_symmetric_(hamming, u1, loongsonasx, u1x8, u32, /*norm_value_type=*/u32, u32, nk_b128_vec_t,
35
+ nk_dots_symmetric_u1_loongsonasx, nk_hamming_u32x4_from_dot_loongsonasx_,
36
+ nk_dots_reduce_sum_u1_, nk_load_b128_loongsonasx_, nk_partial_load_b32x4_serial_,
37
+ nk_store_b128_loongsonasx_, nk_partial_store_b32x4_serial_,
38
+ /*dimensions_per_value=*/8)
39
+ // Symmetric u1 Jaccard instantiation: 7th argument is f32 (u32 for Hamming), from-dot helper is the f32x4 one.
40
+ nk_define_cross_normalized_symmetric_(jaccard, u1, loongsonasx, u1x8, u32, /*norm_value_type=*/u32, f32, nk_b128_vec_t,
41
+ nk_dots_symmetric_u1_loongsonasx, nk_jaccard_f32x4_from_dot_loongsonasx_,
42
+ nk_dots_reduce_sum_u1_, nk_load_b128_loongsonasx_, nk_partial_load_b32x4_serial_,
43
+ nk_store_b128_loongsonasx_, nk_partial_store_b32x4_serial_,
44
+ /*dimensions_per_value=*/8)
45
+
46
+ #if defined(__cplusplus)
47
+ } // extern "C"
48
+ #endif
49
+
50
+ #endif // NK_TARGET_LOONGSONASX
51
+ #endif // NK_TARGET_LOONGARCH_
52
+ #endif // NK_SETS_LOONGSONASX_H
@@ -0,0 +1,65 @@
1
+ /**
2
+ * @brief Batched Set Operations for Power VSX.
3
+ * @file include/numkong/sets/powervsx.h
4
+ * @author Ash Vardanian
5
+ * @date March 23, 2026
6
+ *
7
+ * @sa include/numkong/sets.h
8
+ */
9
+ #ifndef NK_SETS_POWERVSX_H
10
+ #define NK_SETS_POWERVSX_H
11
+
12
+ #if NK_TARGET_POWER_
13
+ #if NK_TARGET_POWERVSX
14
+
15
+ #include "numkong/set/powervsx.h"
16
+ #include "numkong/dots/powervsx.h"
17
+
18
+ #if defined(__clang__)
19
+ #pragma clang attribute push(__attribute__((target("power9-vector"))), apply_to = function)
20
+ #elif defined(__GNUC__)
21
+ #pragma GCC push_options
22
+ #pragma GCC target("power9-vector")
23
+ #endif
24
+
25
+ #if defined(__cplusplus)
26
+ extern "C" {
27
+ #endif
28
+ // Packed u1 Hamming instantiation; all four load/store helpers passed here are the *_powervsx_ variants.
29
+ nk_define_cross_normalized_packed_(hamming, u1, powervsx, u1x8, u1x8, u32, /*norm_value_type=*/u32, u32, nk_b128_vec_t,
30
+ nk_dots_packed_u1_powervsx, nk_hamming_u32x4_from_dot_powervsx_,
31
+ nk_dots_reduce_sum_u1_, nk_load_b128_powervsx_, nk_partial_load_b32x4_powervsx_,
32
+ nk_store_b128_powervsx_, nk_partial_store_b32x4_powervsx_,
33
+ /*dimensions_per_value=*/8)
34
+ // Packed u1 Jaccard instantiation: 8th argument is f32 (u32 for Hamming) and the from-dot helper is the f32x4 one.
35
+ nk_define_cross_normalized_packed_(jaccard, u1, powervsx, u1x8, u1x8, u32, /*norm_value_type=*/u32, f32, nk_b128_vec_t,
36
+ nk_dots_packed_u1_powervsx, nk_jaccard_f32x4_from_dot_powervsx_,
37
+ nk_dots_reduce_sum_u1_, nk_load_b128_powervsx_, nk_partial_load_b32x4_powervsx_,
38
+ nk_store_b128_powervsx_, nk_partial_store_b32x4_powervsx_,
39
+ /*dimensions_per_value=*/8)
40
+ // Symmetric u1 Hamming instantiation: one u1x8 type argument (packed form passes two), nk_dots_symmetric_u1_* dots.
41
+ nk_define_cross_normalized_symmetric_(hamming, u1, powervsx, u1x8, u32, /*norm_value_type=*/u32, u32, nk_b128_vec_t,
42
+ nk_dots_symmetric_u1_powervsx, nk_hamming_u32x4_from_dot_powervsx_,
43
+ nk_dots_reduce_sum_u1_, nk_load_b128_powervsx_, nk_partial_load_b32x4_powervsx_,
44
+ nk_store_b128_powervsx_, nk_partial_store_b32x4_powervsx_,
45
+ /*dimensions_per_value=*/8)
46
+ // Symmetric u1 Jaccard instantiation: 7th argument is f32 (u32 for Hamming), from-dot helper is the f32x4 one.
47
+ nk_define_cross_normalized_symmetric_(jaccard, u1, powervsx, u1x8, u32, /*norm_value_type=*/u32, f32, nk_b128_vec_t,
48
+ nk_dots_symmetric_u1_powervsx, nk_jaccard_f32x4_from_dot_powervsx_,
49
+ nk_dots_reduce_sum_u1_, nk_load_b128_powervsx_, nk_partial_load_b32x4_powervsx_,
50
+ nk_store_b128_powervsx_, nk_partial_store_b32x4_powervsx_,
51
+ /*dimensions_per_value=*/8)
52
+
53
+ #if defined(__cplusplus)
54
+ } // extern "C"
55
+ #endif
56
+
57
+ #if defined(__clang__)
58
+ #pragma clang attribute pop
59
+ #elif defined(__GNUC__)
60
+ #pragma GCC pop_options
61
+ #endif
62
+
63
+ #endif // NK_TARGET_POWERVSX
64
+ #endif // NK_TARGET_POWER_
65
+ #endif // NK_SETS_POWERVSX_H