numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315) hide show
  1. package/README.md +197 -124
  2. package/binding.gyp +34 -484
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -8,13 +8,13 @@
8
8
  *
9
9
  * @section scalars_neon_instructions Key NEON Scalar Instructions
10
10
  *
11
- * Intrinsic Instruction Latency Throughput
12
- * vsqrt_f32 FSQRT (S) 9-12cy 0.25/cy
13
- * vsqrt_f64 FSQRT (D) 12-18cy 0.25/cy
14
- * vfmas_f32 FMADD (S, S, S, S) 4cy 2/cy
15
- * vfmad_f64 FMADD (D, D, D, D) 4cy 2/cy
16
- * vqaddb_u8 UQADD (B) 1cy 4/cy
17
- * vqaddb_s8 SQADD (B) 1cy 4/cy
11
+ * Intrinsic Instruction A76 M5
12
+ * vsqrt_f32 FSQRT (S) 12cy @ 1p 9cy @ 1p
13
+ * vsqrt_f64 FSQRT (D) 12cy @ 1p 9cy @ 1p
14
+ * vfmas_f32 FMADD (S, S, S, S) 4cy @ 2p 3cy @ 4p
15
+ * vfmad_f64 FMADD (D, D, D, D) 4cy @ 2p 3cy @ 4p
16
+ * vqaddb_u8 UQADD (B) 2cy @ 2p 3cy @ 2p
17
+ * vqaddb_s8 SQADD (B) 2cy @ 2p 3cy @ 2p
18
18
  */
19
19
  #ifndef NK_SCALAR_NEON_H
20
20
  #define NK_SCALAR_NEON_H
@@ -98,8 +98,8 @@ NK_PUBLIC nk_u64_t nk_u64_saturating_mul_neon(nk_u64_t a, nk_u64_t b) {
98
98
  }
99
99
  NK_PUBLIC nk_i64_t nk_i64_saturating_mul_neon(nk_i64_t a, nk_i64_t b) {
100
100
  int sign = (a < 0) ^ (b < 0);
101
- nk_u64_t abs_a = a < 0 ? -(nk_u64_t)a : (nk_u64_t)a;
102
- nk_u64_t abs_b = b < 0 ? -(nk_u64_t)b : (nk_u64_t)b;
101
+ nk_u64_t abs_a = a < 0 ? (0u - (nk_u64_t)a) : (nk_u64_t)a;
102
+ nk_u64_t abs_b = b < 0 ? (0u - (nk_u64_t)b) : (nk_u64_t)b;
103
103
  nk_u64_t high = nk_u64_mulhigh_neon_(abs_a, abs_b);
104
104
  nk_u64_t low = abs_a * abs_b;
105
105
  if (high || (sign && low > 9223372036854775808ull) || (!sign && low > 9223372036854775807ull))
@@ -0,0 +1,96 @@
1
+ /**
2
+ * @brief SIMD-accelerated Scalar Math Helpers for Power VSX.
3
+ * @file include/numkong/scalar/powervsx.h
4
+ * @author Ash Vardanian
5
+ * @date March 24, 2026
6
+ *
7
+ * @sa include/numkong/scalar.h
8
+ *
9
+ * @section scalars_powervsx_instructions Key Power VSX Scalar Instructions
10
+ *
11
+ * Instruction Description Latency
12
+ * xssqrtsp Scalar √ (f32) 26cy
13
+ * xssqrtdp Scalar √ (f64) 33cy
14
+ * xsrsqrtesp Scalar 1/√ estimate (f32) 6cy
15
+ * xsrsqrtedp Scalar 1/√ estimate (f64) 6cy
16
+ * xsmaddadp Scalar FMA (f64) 5cy
17
+ * xsmaddasp Scalar FMA (f32) 5cy
18
+ */
19
+ #ifndef NK_SCALAR_POWERVSX_H
20
+ #define NK_SCALAR_POWERVSX_H
21
+
22
+ #if NK_TARGET_POWER_
23
+ #if NK_TARGET_POWERVSX
24
+
25
+ #include "numkong/types.h"
26
+
27
+ #if defined(__cplusplus)
28
+ extern "C" {
29
+ #endif
30
+
31
+ #if defined(__clang__)
32
+ #pragma clang attribute push(__attribute__((target("power9-vector"))), apply_to = function)
33
+ #elif defined(__GNUC__)
34
+ #pragma GCC push_options
35
+ #pragma GCC target("power9-vector")
36
+ #endif
37
+
38
+ NK_PUBLIC nk_f32_t nk_f32_sqrt_powervsx(nk_f32_t x) { // Square root of an f32 scalar via one VSX instruction
39
+ nk_f32_t result;
40
+ __asm__("xssqrtsp %0, %1" : "=f"(result) : "f"(x)); // xssqrtsp: scalar f32 square root, x is the input and result the output
41
+ return result;
42
+ }
43
+ NK_PUBLIC nk_f64_t nk_f64_sqrt_powervsx(nk_f64_t x) { // Square root of an f64 scalar via one VSX instruction
44
+ nk_f64_t result;
45
+ __asm__("xssqrtdp %0, %1" : "=d"(result) : "d"(x)); // xssqrtdp: scalar f64 square root, x is the input and result the output
46
+ return result;
47
+ }
48
+ NK_PUBLIC nk_f32_t nk_f32_rsqrt_powervsx(nk_f32_t x) { // Approximate 1/√x for an f32 scalar
49
+ // xsrsqrtesp → ~12-bit estimate, then 2 Newton-Raphson iterations → ~24-bit precision
50
+ nk_f32_t r;
51
+ __asm__("xsrsqrtesp %0, %1" : "=f"(r) : "f"(x));
52
+ // Newton-Raphson step: r = r * (3 - x * r * r) / 2, evaluated below as r * (1.5 - (x / 2) * r * r)
53
+ nk_f32_t half_x = x * 0.5f;
54
+ nk_f32_t three_half = 1.5f;
55
+ r = r * (three_half - half_x * r * r); // refinement 1 of 2
56
+ r = r * (three_half - half_x * r * r); // refinement 2 of 2
57
+ return r; // NOTE(review): no special-casing of zero, negative, or non-finite x is visible here
58
+ }
59
+ NK_PUBLIC nk_f64_t nk_f64_rsqrt_powervsx(nk_f64_t x) { // Approximate 1/√x for an f64 scalar
60
+ // xsrsqrtedp → ~14-bit estimate, then 3 Newton-Raphson iterations → ~48-bit precision
61
+ nk_f64_t r;
62
+ __asm__("xsrsqrtedp %0, %1" : "=d"(r) : "d"(x));
63
+ // Newton-Raphson step: r = r * (3 - x * r * r) / 2, evaluated below as r * (1.5 - (x / 2) * r * r)
64
+ nk_f64_t half_x = x * 0.5;
65
+ nk_f64_t three_half = 1.5;
66
+ r = r * (three_half - half_x * r * r); // refinement 1 of 3
67
+ r = r * (three_half - half_x * r * r); // refinement 2 of 3
68
+ r = r * (three_half - half_x * r * r); // refinement 3 of 3
69
+ return r; // NOTE(review): no special-casing of zero, negative, or non-finite x is visible here
70
+ }
71
+ NK_PUBLIC nk_f32_t nk_f32_fma_powervsx(nk_f32_t a, nk_f32_t b, nk_f32_t c) {
72
+ // xsmaddasp (scalar f32 FMA): the read-write operand r is preloaded with c, so the result is a * b + c
73
+ nk_f32_t r = c;
74
+ __asm__("xsmaddasp %0, %1, %2" : "+f"(r) : "f"(a), "f"(b)); // "+f" marks r as both input and output
75
+ return r;
76
+ }
77
+ NK_PUBLIC nk_f64_t nk_f64_fma_powervsx(nk_f64_t a, nk_f64_t b, nk_f64_t c) {
78
+ // xsmaddadp (scalar f64 FMA): the read-write operand r is preloaded with c, so the result is a * b + c
79
+ nk_f64_t r = c;
80
+ __asm__("xsmaddadp %0, %1, %2" : "+d"(r) : "d"(a), "d"(b)); // "+d" marks r as both input and output
81
+ return r;
82
+ }
83
+
84
+ #if defined(__clang__)
85
+ #pragma clang attribute pop
86
+ #elif defined(__GNUC__)
87
+ #pragma GCC pop_options
88
+ #endif
89
+
90
+ #if defined(__cplusplus)
91
+ } // extern "C"
92
+ #endif
93
+
94
+ #endif // NK_TARGET_POWERVSX
95
+ #endif // NK_TARGET_POWER_
96
+ #endif // NK_SCALAR_POWERVSX_H
@@ -185,8 +185,8 @@ NK_PUBLIC nk_u64_t nk_u64_saturating_mul_rvv(nk_u64_t a, nk_u64_t b) {
185
185
  }
186
186
  NK_PUBLIC nk_i64_t nk_i64_saturating_mul_rvv(nk_i64_t a, nk_i64_t b) {
187
187
  int sign = (a < 0) ^ (b < 0);
188
- nk_u64_t abs_a = a < 0 ? -(nk_u64_t)a : (nk_u64_t)a;
189
- nk_u64_t abs_b = b < 0 ? -(nk_u64_t)b : (nk_u64_t)b;
188
+ nk_u64_t abs_a = a < 0 ? (0u - (nk_u64_t)a) : (nk_u64_t)a;
189
+ nk_u64_t abs_b = b < 0 ? (0u - (nk_u64_t)b) : (nk_u64_t)b;
190
190
  vuint64m1_t a_u64m1 = __riscv_vmv_v_x_u64m1(abs_a, 1);
191
191
  vuint64m1_t b_u64m1 = __riscv_vmv_v_x_u64m1(abs_b, 1);
192
192
  nk_u64_t high = __riscv_vmv_x_s_u64m1_u64(__riscv_vmulhu_vv_u64m1(a_u64m1, b_u64m1, 1));
@@ -29,23 +29,34 @@ extern "C" {
29
29
  #endif
30
30
 
31
31
  NK_PUBLIC int nk_f16_order_sapphire(nk_f16_t a, nk_f16_t b) {
32
- __m128h a_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(a));
33
- __m128h b_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(b));
32
+ nk_fui16_t a_fui, b_fui;
33
+ a_fui.f = a, b_fui.f = b;
34
+ __m128h a_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(a_fui.u));
35
+ __m128h b_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(b_fui.u));
34
36
  return _mm_comigt_sh(a_f16x8, b_f16x8) - _mm_comilt_sh(a_f16x8, b_f16x8);
35
37
  }
36
38
  NK_PUBLIC nk_f16_t nk_f16_sqrt_sapphire(nk_f16_t x) {
37
- __m128h x_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(x));
38
- return (nk_f16_t)_mm_cvtsi128_si32(_mm_castph_si128(_mm_sqrt_sh(x_f16x8, x_f16x8)));
39
+ nk_fui16_t x_fui, out_fui;
40
+ x_fui.f = x;
41
+ __m128h x_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(x_fui.u));
42
+ out_fui.u = (nk_u16_t)_mm_cvtsi128_si32(_mm_castph_si128(_mm_sqrt_sh(x_f16x8, x_f16x8)));
43
+ return out_fui.f;
39
44
  }
40
45
  NK_PUBLIC nk_f16_t nk_f16_rsqrt_sapphire(nk_f16_t x) {
41
- __m128h x_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(x));
42
- return (nk_f16_t)_mm_cvtsi128_si32(_mm_castph_si128(_mm_rsqrt_sh(x_f16x8, x_f16x8)));
46
+ nk_fui16_t x_fui, out_fui;
47
+ x_fui.f = x;
48
+ __m128h x_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(x_fui.u));
49
+ out_fui.u = (nk_u16_t)_mm_cvtsi128_si32(_mm_castph_si128(_mm_rsqrt_sh(x_f16x8, x_f16x8)));
50
+ return out_fui.f;
43
51
  }
44
52
  NK_PUBLIC nk_f16_t nk_f16_fma_sapphire(nk_f16_t a, nk_f16_t b, nk_f16_t c) {
45
- __m128h a_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(a));
46
- __m128h b_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(b));
47
- __m128h c_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(c));
48
- return (nk_f16_t)_mm_cvtsi128_si32(_mm_castph_si128(_mm_fmadd_sh(a_f16x8, b_f16x8, c_f16x8)));
53
+ nk_fui16_t a_fui, b_fui, c_fui, out_fui;
54
+ a_fui.f = a, b_fui.f = b, c_fui.f = c;
55
+ __m128h a_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(a_fui.u));
56
+ __m128h b_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(b_fui.u));
57
+ __m128h c_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(c_fui.u));
58
+ out_fui.u = (nk_u16_t)_mm_cvtsi128_si32(_mm_castph_si128(_mm_fmadd_sh(a_f16x8, b_f16x8, c_f16x8)));
59
+ return out_fui.f;
49
60
  }
50
61
 
51
62
  #if defined(__clang__)
@@ -74,16 +74,16 @@ NK_PUBLIC nk_f64_t nk_f64_fma_serial(nk_f64_t multiplicand, nk_f64_t multiplier,
74
74
  nk_f64_t product = multiplicand * multiplier;
75
75
  // Dekker splitting: break each operand into non-overlapping high and low halves
76
76
  nk_f64_t const dekker_split = 134217729.0; // 2^27 + 1 for double precision
77
- nk_f64_t multiplicand_hi = dekker_split * multiplicand;
78
- nk_f64_t multiplicand_lo = multiplicand - (multiplicand_hi - (multiplicand_hi - multiplicand));
79
- multiplicand_hi = multiplicand_hi - (multiplicand_hi - multiplicand);
80
- nk_f64_t multiplier_hi = dekker_split * multiplier;
81
- nk_f64_t multiplier_lo = multiplier - (multiplier_hi - (multiplier_hi - multiplier));
82
- multiplier_hi = multiplier_hi - (multiplier_hi - multiplier);
77
+ nk_f64_t multiplicand_high = dekker_split * multiplicand;
78
+ nk_f64_t multiplicand_low = multiplicand - (multiplicand_high - (multiplicand_high - multiplicand));
79
+ multiplicand_high = multiplicand_high - (multiplicand_high - multiplicand);
80
+ nk_f64_t multiplier_high = dekker_split * multiplier;
81
+ nk_f64_t multiplier_low = multiplier - (multiplier_high - (multiplier_high - multiplier));
82
+ multiplier_high = multiplier_high - (multiplier_high - multiplier);
83
83
  // Exact multiplication error from the four cross-products
84
- nk_f64_t product_error = ((multiplicand_hi * multiplier_hi - product) + multiplicand_hi * multiplier_lo +
85
- multiplicand_lo * multiplier_hi) +
86
- multiplicand_lo * multiplier_lo;
84
+ nk_f64_t product_error = ((multiplicand_high * multiplier_high - product) + multiplicand_high * multiplier_low +
85
+ multiplicand_low * multiplier_high) +
86
+ multiplicand_low * multiplier_low;
87
87
  // Knuth TwoSum: add the addend with error tracking
88
88
  nk_f64_t result = product + addend;
89
89
  nk_f64_t addend_recovered = result - product;
@@ -102,16 +102,16 @@ NK_PUBLIC nk_f32_t nk_f32_fma_serial(nk_f32_t multiplicand, nk_f32_t multiplier,
102
102
  nk_f32_t product = multiplicand * multiplier;
103
103
  // Dekker splitting: break each operand into non-overlapping high and low halves
104
104
  nk_f32_t const dekker_split = 4097.0f; // 2^12 + 1 for single precision
105
- nk_f32_t multiplicand_hi = dekker_split * multiplicand;
106
- nk_f32_t multiplicand_lo = multiplicand - (multiplicand_hi - (multiplicand_hi - multiplicand));
107
- multiplicand_hi = multiplicand_hi - (multiplicand_hi - multiplicand);
108
- nk_f32_t multiplier_hi = dekker_split * multiplier;
109
- nk_f32_t multiplier_lo = multiplier - (multiplier_hi - (multiplier_hi - multiplier));
110
- multiplier_hi = multiplier_hi - (multiplier_hi - multiplier);
105
+ nk_f32_t multiplicand_high = dekker_split * multiplicand;
106
+ nk_f32_t multiplicand_low = multiplicand - (multiplicand_high - (multiplicand_high - multiplicand));
107
+ multiplicand_high = multiplicand_high - (multiplicand_high - multiplicand);
108
+ nk_f32_t multiplier_high = dekker_split * multiplier;
109
+ nk_f32_t multiplier_low = multiplier - (multiplier_high - (multiplier_high - multiplier));
110
+ multiplier_high = multiplier_high - (multiplier_high - multiplier);
111
111
  // Exact multiplication error from the four cross-products
112
- nk_f32_t product_error = ((multiplicand_hi * multiplier_hi - product) + multiplicand_hi * multiplier_lo +
113
- multiplicand_lo * multiplier_hi) +
114
- multiplicand_lo * multiplier_lo;
112
+ nk_f32_t product_error = ((multiplicand_high * multiplier_high - product) + multiplicand_high * multiplier_low +
113
+ multiplicand_low * multiplier_high) +
114
+ multiplicand_low * multiplier_low;
115
115
  // Knuth TwoSum: add the addend with error tracking
116
116
  nk_f32_t result = product + addend;
117
117
  nk_f32_t addend_recovered = result - product;
@@ -125,7 +125,7 @@ NK_PUBLIC nk_f32_t nk_f32_fma_serial(nk_f32_t multiplicand, nk_f32_t multiplier,
125
125
  * Uses TwoProd (via FMA) and TwoSum error-free transformations.
126
126
  * @see Ogita, T., Rump, S.M., Oishi, S. (2005). "Accurate Sum and Dot Product"
127
127
  */
128
- NK_INTERNAL void nk_f64_dot2_(nk_f64_t *sum, nk_f64_t *compensation, nk_f64_t a, nk_f64_t b) {
128
+ NK_INTERNAL void nk_f64_dot2_(nk_f64_t *sum, nk_f64_t *compensation, nk_f64_t a, nk_f64_t b) NK_STREAMING_COMPATIBLE_ {
129
129
  nk_f64_t product = a * b;
130
130
  nk_f64_t product_error = nk_f64_fma_serial(a, b, -product);
131
131
  nk_f64_t running_sum = *sum + product;
@@ -238,8 +238,8 @@ NK_PUBLIC nk_i64_t nk_i64_saturating_mul_serial(nk_i64_t a, nk_i64_t b) {
238
238
  int sign = ((a < 0) ^ (b < 0)) ? -1 : 1; // Track the sign of the result
239
239
 
240
240
  // Take absolute values for easy multiplication and overflow detection
241
- nk_u64_t abs_a = (a < 0) ? -(nk_u64_t)a : (nk_u64_t)a;
242
- nk_u64_t abs_b = (b < 0) ? -(nk_u64_t)b : (nk_u64_t)b;
241
+ nk_u64_t abs_a = (a < 0) ? (0u - (nk_u64_t)a) : (nk_u64_t)a;
242
+ nk_u64_t abs_b = (b < 0) ? (0u - (nk_u64_t)b) : (nk_u64_t)b;
243
243
 
244
244
  // Split the absolute values into high and low 32-bit parts
245
245
  nk_u64_t a_high = abs_a >> 32;
@@ -383,6 +383,7 @@ NK_PUBLIC nk_f64_t nk_f64_fma_v128relaxed(nk_f64_t a, nk_f64_t b, nk_f64_t c);
383
383
  #include "numkong/scalar/haswell.h" // `nk_f32_sqrt_haswell`
384
384
  #include "numkong/scalar/sapphire.h" // `nk_f16_order_sapphire`
385
385
  #include "numkong/scalar/rvv.h" // `nk_f32_rsqrt_rvv`
386
+ #include "numkong/scalar/powervsx.h" // `nk_f32_sqrt_powervsx`
386
387
  #include "numkong/scalar/v128relaxed.h" // `nk_f32_sqrt_v128relaxed`
387
388
 
388
389
  #if defined(__cplusplus)
@@ -396,6 +397,8 @@ NK_PUBLIC nk_f32_t nk_f32_sqrt(nk_f32_t x) {
396
397
  return nk_f32_sqrt_haswell(x);
397
398
  #elif NK_TARGET_NEON
398
399
  return nk_f32_sqrt_neon(x);
400
+ #elif NK_TARGET_POWERVSX
401
+ return nk_f32_sqrt_powervsx(x);
399
402
  #elif NK_TARGET_RVV
400
403
  return nk_f32_sqrt_rvv(x);
401
404
  #elif NK_TARGET_V128RELAXED
@@ -410,6 +413,8 @@ NK_PUBLIC nk_f64_t nk_f64_sqrt(nk_f64_t x) {
410
413
  return nk_f64_sqrt_haswell(x);
411
414
  #elif NK_TARGET_NEON
412
415
  return nk_f64_sqrt_neon(x);
416
+ #elif NK_TARGET_POWERVSX
417
+ return nk_f64_sqrt_powervsx(x);
413
418
  #elif NK_TARGET_RVV
414
419
  return nk_f64_sqrt_rvv(x);
415
420
  #elif NK_TARGET_V128RELAXED
@@ -424,6 +429,8 @@ NK_PUBLIC nk_f32_t nk_f32_rsqrt(nk_f32_t x) {
424
429
  return nk_f32_rsqrt_haswell(x);
425
430
  #elif NK_TARGET_NEON
426
431
  return nk_f32_rsqrt_neon(x);
432
+ #elif NK_TARGET_POWERVSX
433
+ return nk_f32_rsqrt_powervsx(x);
427
434
  #elif NK_TARGET_RVV
428
435
  return nk_f32_rsqrt_rvv(x);
429
436
  #elif NK_TARGET_V128RELAXED
@@ -438,6 +445,8 @@ NK_PUBLIC nk_f64_t nk_f64_rsqrt(nk_f64_t x) {
438
445
  return nk_f64_rsqrt_haswell(x);
439
446
  #elif NK_TARGET_NEON
440
447
  return nk_f64_rsqrt_neon(x);
448
+ #elif NK_TARGET_POWERVSX
449
+ return nk_f64_rsqrt_powervsx(x);
441
450
  #elif NK_TARGET_RVV
442
451
  return nk_f64_rsqrt_rvv(x);
443
452
  #elif NK_TARGET_V128RELAXED
@@ -452,6 +461,8 @@ NK_PUBLIC nk_f32_t nk_f32_fma(nk_f32_t a, nk_f32_t b, nk_f32_t c) {
452
461
  return nk_f32_fma_haswell(a, b, c);
453
462
  #elif NK_TARGET_NEON
454
463
  return nk_f32_fma_neon(a, b, c);
464
+ #elif NK_TARGET_POWERVSX
465
+ return nk_f32_fma_powervsx(a, b, c);
455
466
  #elif NK_TARGET_RVV
456
467
  return nk_f32_fma_rvv(a, b, c);
457
468
  #elif NK_TARGET_V128RELAXED
@@ -466,6 +477,8 @@ NK_PUBLIC nk_f64_t nk_f64_fma(nk_f64_t a, nk_f64_t b, nk_f64_t c) {
466
477
  return nk_f64_fma_haswell(a, b, c);
467
478
  #elif NK_TARGET_NEON
468
479
  return nk_f64_fma_neon(a, b, c);
480
+ #elif NK_TARGET_POWERVSX
481
+ return nk_f64_fma_powervsx(a, b, c);
469
482
  #elif NK_TARGET_RVV
470
483
  return nk_f64_fma_rvv(a, b, c);
471
484
  #elif NK_TARGET_V128RELAXED
@@ -7,22 +7,22 @@ Hamming distance counts the number of positions where elements differ.
7
7
  For binary vectors packed as octets, this is the popcount of the XOR.
8
8
  For byte-level vectors, it counts the number of mismatched bytes:
9
9
 
10
- ```math
10
+ $$
11
11
  \text{hamming}(a, b) = \sum_{i=0}^{n-1} [a_i \neq b_i]
12
- ```
12
+ $$
13
13
 
14
14
  Jaccard distance measures the dissimilarity of two sets.
15
15
  For binary vectors, the intersection and union are computed via bitwise AND and OR with popcount:
16
16
 
17
- ```math
17
+ $$
18
18
  \text{jaccard}(a, b) = 1 - \frac{|A \cap B|}{|A \cup B|} = 1 - \frac{\text{popcount}(a \mathbin{\&} b)}{\text{popcount}(a \mathbin{|} b)}
19
- ```
19
+ $$
20
20
 
21
21
  For word-level vectors (MinHash signatures), Jaccard similarity is the fraction of matching elements, so the distance is its complement:
22
22
 
23
- ```math
23
+ $$
24
24
  \text{jaccard}(a, b) = 1 - \frac{\sum_{i=0}^{n-1} [a_i = b_i]}{n}
25
- ```
25
+ $$
26
26
 
27
27
  Reformulating as Python pseudocode:
28
28
 
@@ -136,44 +136,44 @@ Measured with Wasmtime v42 (Cranelift backend).
136
136
  | __u32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
137
137
  | `nk_jaccard_u32_v128relaxed` | 0.430 gb/s, 0 ulp | 2.46 gb/s, 0 ulp | 1.08 gb/s, 0 ulp |
138
138
 
139
- ### Apple M4
139
+ ### Apple M5
140
140
 
141
141
  #### Native
142
142
 
143
143
  | Kernel | 256 | 1024 | 4096 |
144
144
  | :---------------------- | -----------------------: | -----------------------: | -----------------------: |
145
145
  | __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
146
- | `nk_hamming_u1_serial` | 4.66 gb/s | 5.30 gb/s | 5.04 gb/s |
147
- | `nk_jaccard_u1_serial` | 3.03 gb/s, 0 ulp | 3.72 gb/s, 0 ulp | 3.65 gb/s, 0 ulp |
148
- | `nk_hamming_u1_neon` | 20.7 gb/s | 41.9 gb/s | 52.2 gb/s |
149
- | `nk_jaccard_u1_neon` | 15.8 gb/s, 0 ulp | 29.5 gb/s, 0 ulp | 34.8 gb/s, 0 ulp |
146
+ | `nk_hamming_u1_serial` | 6.79 gb/s | 7.48 gb/s | 6.92 gb/s |
147
+ | `nk_jaccard_u1_serial` | 4.36 gb/s, 0 ulp | 5.38 gb/s, 0 ulp | 5.45 gb/s, 0 ulp |
148
+ | `nk_hamming_u1_neon` | 31.6 gb/s | 65.6 gb/s | 90.9 gb/s |
149
+ | `nk_jaccard_u1_neon` | 28.4 gb/s, 0 ulp | 48.1 gb/s, 0 ulp | 51.0 gb/s, 0 ulp |
150
150
  | __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
151
- | `nk_hamming_u8_serial` | 20.7 gb/s | 21.9 gb/s | 18.1 gb/s |
152
- | `nk_hamming_u8_neon` | 49.1 gb/s | 43.9 gb/s | 32.5 gb/s |
151
+ | `nk_hamming_u8_serial` | 27.8 gb/s | 30.1 gb/s | 31.2 gb/s |
152
+ | `nk_hamming_u8_neon` | 96.9 gb/s | 79.5 gb/s | 56.3 gb/s |
153
153
  | __u16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
154
- | `nk_jaccard_u16_serial` | 42.5 gb/s, 0 ulp | 39.7 gb/s, 0 ulp | 36.1 gb/s, 0 ulp |
155
- | `nk_jaccard_u16_neon` | 43.3 gb/s, 0 ulp | 33.0 gb/s, 0 ulp | 29.2 gb/s, 0 ulp |
154
+ | `nk_jaccard_u16_serial` | 59.3 gb/s, 0 ulp | 69.4 gb/s, 0 ulp | 66.8 gb/s, 0 ulp |
155
+ | `nk_jaccard_u16_neon` | 67.8 gb/s, 0 ulp | 61.6 gb/s, 0 ulp | 50.8 gb/s, 0 ulp |
156
156
  | __u32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
157
- | `nk_jaccard_u32_serial` | 60.6 gb/s, 0 ulp | 49.0 gb/s, 0 ulp | 51.2 gb/s, 0 ulp |
158
- | `nk_jaccard_u32_neon` | 51.0 gb/s, 0 ulp | 39.9 gb/s, 0 ulp | 38.9 gb/s, 0 ulp |
157
+ | `nk_jaccard_u32_serial` | 105 gb/s, 0 ulp | 101 gb/s, 0 ulp | 89.1 gb/s, 0 ulp |
158
+ | `nk_jaccard_u32_neon` | 89.3 gb/s, 0 ulp | 72.8 gb/s, 0 ulp | 68.2 gb/s, 0 ulp |
159
159
 
160
160
  #### WASM
161
161
 
162
- Measured with Wasmtime v42 (Cranelift backend).
162
+ Measured with Wasmtime v43 (Cranelift backend).
163
163
 
164
164
  | Kernel | 256 | 1024 | 4096 |
165
165
  | :--------------------------- | -----------------------: | -----------------------: | -----------------------: |
166
166
  | __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
167
- | `nk_hamming_u1_serial` | 0.501 gb/s | 0.00424 gb/s | 0.0443 gb/s |
168
- | `nk_jaccard_u1_serial` | 0.315 gb/s, 0 ulp | 0.362 gb/s, 0 ulp | 0.382 gb/s, 0 ulp |
169
- | `nk_hamming_u1_v128relaxed` | 0.414 gb/s | 0.0294 gb/s | 0.233 gb/s |
170
- | `nk_jaccard_u1_v128relaxed` | 0.0141 gb/s, 0 ulp | 0.317 gb/s, 0 ulp | 0.249 gb/s, 0 ulp |
167
+ | `nk_hamming_u1_serial` | 5.18 gb/s | 5.66 gb/s | 6.52 gb/s |
168
+ | `nk_jaccard_u1_serial` | 1.74 gb/s, 0 ulp | 3.32 gb/s, 0 ulp | 3.61 gb/s, 0 ulp |
169
+ | `nk_hamming_u1_v128relaxed` | 22.6 gb/s | 46.5 gb/s | 67.9 gb/s |
170
+ | `nk_jaccard_u1_v128relaxed` | 16.1 gb/s, 0 ulp | 34.5 gb/s, 0 ulp | 50.8 gb/s, 0 ulp |
171
171
  | __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
172
- | `nk_hamming_u8_serial` | 0.551 gb/s | 0.352 gb/s | 0.154 gb/s |
173
- | `nk_hamming_u8_v128relaxed` | 0.702 gb/s | 0.409 gb/s | 0.464 gb/s |
172
+ | `nk_hamming_u8_serial` | 8.32 gb/s | 6.09 gb/s | 5.84 gb/s |
173
+ | `nk_hamming_u8_v128relaxed` | 47.7 gb/s | 68.5 gb/s | 72.1 gb/s |
174
174
  | __u16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
175
- | `nk_jaccard_u16_serial` | 0.647 gb/s, 0 ulp | 0.362 gb/s, 0 ulp | 0.174 gb/s, 0 ulp |
176
- | `nk_jaccard_u16_v128relaxed` | 0.409 gb/s, 0 ulp | 0.00109 gb/s, 0 ulp | 0.275 gb/s, 0 ulp |
175
+ | `nk_jaccard_u16_serial` | 19.2 gb/s, 0 ulp | 12.4 gb/s, 0 ulp | 11.9 gb/s, 0 ulp |
176
+ | `nk_jaccard_u16_v128relaxed` | 89.8 gb/s, 0 ulp | 74.0 gb/s, 0 ulp | 71.3 gb/s, 0 ulp |
177
177
  | __u32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
178
- | `nk_jaccard_u32_serial` | 0.320 gb/s, 0 ulp | 0.161 gb/s, 0 ulp | 0.607 gb/s, 0 ulp |
179
- | `nk_jaccard_u32_v128relaxed` | 0.397 gb/s, 0 ulp | 0.364 gb/s, 0 ulp | 0.0807 gb/s, 0 ulp |
178
+ | `nk_jaccard_u32_serial` | 91.6 gb/s, 0 ulp | 69.4 gb/s, 0 ulp | 68.4 gb/s, 0 ulp |
179
+ | `nk_jaccard_u32_v128relaxed` | 94.8 gb/s, 0 ulp | 76.2 gb/s, 0 ulp | 68.8 gb/s, 0 ulp |
@@ -8,12 +8,12 @@
8
8
  *
9
9
  * @section set_haswell_instructions Key POPCNT/AVX2 Set Instructions
10
10
  *
11
- * Intrinsic Instruction Latency Throughput Ports
12
- * _mm_popcnt_u64 POPCNT (R64, R64) 3cy 1/cy p1
13
- * _mm256_and_si256 VPAND (YMM, YMM, YMM) 1cy 0.33/cy p015
14
- * _mm256_or_si256 VPOR (YMM, YMM, YMM) 1cy 0.33/cy p015
15
- * _mm256_xor_si256 VPXOR (YMM, YMM, YMM) 1cy 0.33/cy p015
16
- * _mm256_extracti128_si256 VEXTRACTI128 (XMM, YMM, I8) 3cy 1/cy p5
11
+ * Intrinsic Instruction Haswell Genoa
12
+ * _mm_popcnt_u64 POPCNT (R64, R64) 3cy @ p1 1cy @ p0123
13
+ * _mm256_and_si256 VPAND (YMM, YMM, YMM) 1cy @ p015 1cy @ p0123
14
+ * _mm256_or_si256 VPOR (YMM, YMM, YMM) 1cy @ p015 1cy @ p0123
15
+ * _mm256_xor_si256 VPXOR (YMM, YMM, YMM) 1cy @ p015 1cy @ p0123
16
+ * _mm256_extracti128_si256 VEXTRACTI128 (XMM, YMM, I8) 3cy @ p5 1cy @ p0123
17
17
  *
18
18
  * Haswell lacks SIMD popcount; we extract 64-bit words and use scalar POPCNT. The p1 port
19
19
  * bottleneck limits throughput to 1 popcount/cycle. For Hamming distance, XOR + POPCNT;
@@ -55,7 +55,7 @@ extern "C" {
55
55
  #pragma GCC target("avx2", "sse4.1", "popcnt")
56
56
  #endif
57
57
 
58
- #pragma region - Binary Sets
58
+ #pragma region Binary Sets
59
59
 
60
60
  NK_PUBLIC void nk_hamming_u1_haswell(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result) {
61
61
  nk_size_t n_bytes = nk_size_divide_round_up_(n, NK_BITS_PER_BYTE);
@@ -79,9 +79,9 @@ NK_PUBLIC void nk_jaccard_u1_haswell(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_
79
79
  *result = (union_count != 0) ? 1.0f - (nk_f32_t)intersection_count / (nk_f32_t)union_count : 0.0f;
80
80
  }
81
81
 
82
- #pragma endregion - Binary Sets
82
+ #pragma endregion Binary Sets
83
83
 
84
- #pragma region - Integer Sets
84
+ #pragma region Integer Sets
85
85
 
86
86
  NK_PUBLIC void nk_jaccard_u32_haswell(nk_u32_t const *a, nk_u32_t const *b, nk_size_t n, nk_f32_t *result) {
87
87
  nk_u32_t intersection_count = 0;
@@ -192,9 +192,9 @@ NK_PUBLIC void nk_jaccard_u16_haswell(nk_u16_t const *a, nk_u16_t const *b, nk_s
192
192
  *result = (n != 0) ? 1.0f - (nk_f32_t)matches / (nk_f32_t)n : 0.0f;
193
193
  }
194
194
 
195
- #pragma endregion - Integer Sets
195
+ #pragma endregion Integer Sets
196
196
 
197
- #pragma region - Stateful Streaming
197
+ #pragma region Stateful Streaming
198
198
 
199
199
  typedef struct nk_hamming_u1x64_state_haswell_t {
200
200
  nk_u32_t intersection_count;
@@ -317,7 +317,7 @@ NK_INTERNAL void nk_jaccard_f32x4_from_dot_haswell_(nk_b128_vec_t dots, nk_u32_t
317
317
  results->xmm_ps = _mm_blendv_ps(jaccard_f32x4, _mm_setzero_ps(), zero_union_mask);
318
318
  }
319
319
 
320
- #pragma endregion - Stateful Streaming
320
+ #pragma endregion Stateful Streaming
321
321
 
322
322
  #if defined(__clang__)
323
323
  #pragma clang attribute pop
@@ -8,12 +8,12 @@
8
8
  *
9
9
  * @section set_icelake_instructions Key AVX-512 Set Instructions
10
10
  *
11
- * Intrinsic Instruction Latency Throughput Ports
12
- * _mm512_popcnt_epi64 VPOPCNTQ (ZMM, ZMM) 3cy 1/cy p5
13
- * _mm512_and_si512 VPANDQ (ZMM, ZMM, ZMM) 1cy 0.33/cy p05
14
- * _mm512_or_si512 VPORQ (ZMM, ZMM, ZMM) 1cy 0.33/cy p05
15
- * _mm512_xor_si512 VPXORQ (ZMM, ZMM, ZMM) 1cy 0.33/cy p05
16
- * _mm512_maskz_loadu_epi8 VMOVDQU8 (ZMM, mem, k1) 7cy 0.5/cy p23
11
+ * Intrinsic Instruction Ice Lake
12
+ * _mm512_popcnt_epi64 VPOPCNTQ (ZMM, ZMM) 3cy @ p5
13
+ * _mm512_and_si512 VPANDQ (ZMM, ZMM, ZMM) 1cy @ p05
14
+ * _mm512_or_si512 VPORQ (ZMM, ZMM, ZMM) 1cy @ p05
15
+ * _mm512_xor_si512 VPXORQ (ZMM, ZMM, ZMM) 1cy @ p05
16
+ * _mm512_maskz_loadu_epi8 VMOVDQU8 (ZMM, mem, k1) 7cy @ p23
17
17
  *
18
18
  * Ice Lake has native VPOPCNTQ instruction via AVX-512 VPOPCNTDQ extension, enabling
19
19
  * efficient 64-bit element-wise popcount. We process 512 bits per iteration.
@@ -54,7 +54,7 @@ extern "C" {
54
54
  #pragma GCC target("avx2", "avx512f", "avx512vl", "avx512bw", "avx512vpopcntdq", "f16c", "fma", "bmi", "bmi2")
55
55
  #endif
56
56
 
57
- #pragma region - Binary Sets
57
+ #pragma region Binary Sets
58
58
 
59
59
  NK_PUBLIC void nk_hamming_u1_icelake(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result) {
60
60
  nk_size_t n_bytes = nk_size_divide_round_up_(n, NK_BITS_PER_BYTE);
@@ -239,9 +239,9 @@ NK_PUBLIC void nk_jaccard_u1_icelake(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_
239
239
  *result = (union_count != 0) ? 1.0f - (nk_f32_t)intersection_count / (nk_f32_t)union_count : 0.0f;
240
240
  }
241
241
 
242
- #pragma endregion - Binary Sets
242
+ #pragma endregion Binary Sets
243
243
 
244
- #pragma region - Integer Sets
244
+ #pragma region Integer Sets
245
245
 
246
246
  NK_PUBLIC void nk_jaccard_u32_icelake(nk_u32_t const *a, nk_u32_t const *b, nk_size_t n, nk_f32_t *result) {
247
247
  nk_u32_t intersection_count = 0;
@@ -300,9 +300,9 @@ NK_PUBLIC void nk_jaccard_u16_icelake(nk_u16_t const *a, nk_u16_t const *b, nk_s
300
300
  *result = (n != 0) ? 1.0f - (nk_f32_t)matches / (nk_f32_t)n : 0.0f;
301
301
  }
302
302
 
303
- #pragma endregion - Integer Sets
303
+ #pragma endregion Integer Sets
304
304
 
305
- #pragma region - Stateful Streaming
305
+ #pragma region Stateful Streaming
306
306
 
307
307
  typedef struct nk_hamming_u1x512_state_icelake_t {
308
308
  __m512i intersection_count_i64x8;
@@ -438,7 +438,7 @@ NK_INTERNAL void nk_jaccard_u1x512_finalize_icelake( //
438
438
  result->xmm_ps = _mm_blendv_ps(jaccard_f32x4, _mm_setzero_ps(), zero_union_mask);
439
439
  }
440
440
 
441
- /** @brief Hamming from_dot: computes pop_a + pop_b - 2*dot for 4 pairs (IceLake). */
441
+ /** @brief Hamming from_dot: computes pop_a + pop_b - 2*dot for 4 pairs (Icelake). */
442
442
  NK_INTERNAL void nk_hamming_u32x4_from_dot_icelake_(nk_b128_vec_t dots, nk_u32_t query_pop, nk_b128_vec_t target_pops,
443
443
  nk_b128_vec_t *results) {
444
444
  __m128i dots_i32x4 = dots.xmm;
@@ -447,7 +447,7 @@ NK_INTERNAL void nk_hamming_u32x4_from_dot_icelake_(nk_b128_vec_t dots, nk_u32_t
447
447
  results->xmm = _mm_sub_epi32(_mm_add_epi32(query_i32x4, target_i32x4), _mm_slli_epi32(dots_i32x4, 1));
448
448
  }
449
449
 
450
- /** @brief Jaccard from_dot: computes 1 - dot / (pop_a + pop_b - dot) for 4 pairs (IceLake). */
450
+ /** @brief Jaccard from_dot: computes 1 - dot / (pop_a + pop_b - dot) for 4 pairs (Icelake). */
451
451
  NK_INTERNAL void nk_jaccard_f32x4_from_dot_icelake_(nk_b128_vec_t dots, nk_u32_t query_pop, nk_b128_vec_t target_pops,
452
452
  nk_b128_vec_t *results) {
453
453
  __m128 dot_f32x4 = _mm_cvtepi32_ps(dots.xmm);
@@ -468,7 +468,7 @@ NK_INTERNAL void nk_jaccard_f32x4_from_dot_icelake_(nk_b128_vec_t dots, nk_u32_t
468
468
  results->xmm_ps = _mm_blendv_ps(jaccard_f32x4, _mm_setzero_ps(), zero_union_mask);
469
469
  }
470
470
 
471
- #pragma endregion - Stateful Streaming
471
+ #pragma endregion Stateful Streaming
472
472
 
473
473
  #if defined(__clang__)
474
474
  #pragma clang attribute pop