numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315)
  1. package/README.md +197 -124
  2. package/binding.gyp +34 -484
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -73,14 +73,14 @@
73
73
  * BF16 dot products (VDPBF16PS) are Genoa-only, accumulating bf16 pairs directly to f32.
74
74
  * Genoa shows 40% faster integer multiply-add (3c vs 5c) than Ice Lake.
75
75
  *
76
- * Intrinsic Instruction Haswell Ice Genoa
77
- * _mm256_fmadd_ps VFMADD231PS (YMM, YMM, YMM) 5c @ p01 4c @ p01 4c @ p01
78
- * _mm256_fmadd_pd VFMADD231PD (YMM, YMM, YMM) 5c @ p01 4c @ p01 4c @ p01
79
- * _mm256_maddubs_epi16 VPMADDUBSW (YMM, YMM, YMM) 5c @ p0 5c @ p01 3c @ p01
80
- * _mm256_madd_epi16 VPMADDWD (YMM, YMM, YMM) 5c @ p0 5c @ p01 3c @ p01
81
- * _mm256_dpbusd_epi32 VPDPBUSD (YMM, YMM, YMM) N/A 5c @ p01 4c @ p01
82
- * _mm512_dpwssd_epi32 VPDPWSSD (ZMM, ZMM, ZMM) N/A 5c @ p0 4c @ p01
83
- * _mm512_dpbf16_ps VDPBF16PS (ZMM, ZMM, ZMM) N/A N/A 6c @ p01
76
+ * Intrinsic Instruction Haswell Icelake Genoa
77
+ * _mm256_fmadd_ps VFMADD231PS (YMM, YMM, YMM) 5cy @ p01 4cy @ p01 4cy @ p01
78
+ * _mm256_fmadd_pd VFMADD231PD (YMM, YMM, YMM) 5cy @ p01 4cy @ p01 4cy @ p01
79
+ * _mm256_maddubs_epi16 VPMADDUBSW (YMM, YMM, YMM) 5cy @ p0 5cy @ p01 3cy @ p01
80
+ * _mm256_madd_epi16 VPMADDWD (YMM, YMM, YMM) 5cy @ p0 5cy @ p01 3cy @ p01
81
+ * _mm256_dpbusd_epi32 VPDPBUSD (YMM, YMM, YMM) n/a 5cy @ p01 4cy @ p01
82
+ * _mm512_dpwssd_epi32 VPDPWSSD (ZMM, ZMM, ZMM) n/a 5cy @ p0 4cy @ p01
83
+ * _mm512_dpbf16_ps VDPBF16PS (ZMM, ZMM, ZMM) n/a n/a 6cy @ p01
84
84
  *
85
85
  * @section arm_neon_instructions Relevant ARM NEON Instructions
86
86
  *
@@ -89,13 +89,13 @@
89
89
  * provides native bf16 dot products on Graviton 3+. Complex dot products use LD2 for deinterleaved
90
90
  * loads of real/imag pairs, though its L01+V throughput can bottleneck on memory-bound workloads.
91
91
  *
92
- * Intrinsic Instruction M1 Firestorm Graviton 3 Graviton 4
93
- * vfmaq_f32 FMLA.S (vec) 4c @ V0123 4c @ V0123 4c @ V0123
94
- * vfmaq_f64 FMLA.D (vec) 4c @ V0123 4c @ V0123 4c @ V0123
95
- * vdotq_s32 SDOT (vec) 3c @ V0123 3c @ V0123 3c @ V0123
96
- * vdotq_u32 UDOT (vec) 3c @ V0123 3c @ V0123 3c @ V0123
97
- * vbfdotq_f32 BFDOT (vec) N/A 4c @ V0123 5c @ V0123
98
- * vld2q_f32 LD2 (Q-form) 5c @ L01+V 8c @ L01+V 8c @ L01+V
92
+ * Intrinsic Instruction M1 Firestorm Graviton 3 Graviton 4
93
+ * vfmaq_f32 FMLA.S (vec) 4cy @ V0123 4cy @ V0123 4cy @ V0123
94
+ * vfmaq_f64 FMLA.D (vec) 4cy @ V0123 4cy @ V0123 4cy @ V0123
95
+ * vdotq_s32 SDOT (vec) 3cy @ V0123 3cy @ V0123 3cy @ V0123
96
+ * vdotq_u32 UDOT (vec) 3cy @ V0123 3cy @ V0123 3cy @ V0123
97
+ * vbfdotq_f32 BFDOT (vec) N/A 4cy @ V0123 5cy @ V0123
98
+ * vld2q_f32 LD2 (Q-form) 5cy @ L01+V 8cy @ L01+V 8cy @ L01+V
99
99
  *
100
100
  * @section arm_sve_instructions Relevant ARM SVE Instructions
101
101
  *
@@ -103,12 +103,12 @@
103
103
  * scalar cleanup loops. FADDV performs horizontal reduction; notably 45% faster on Graviton 4
104
104
  * (6c) than Graviton 3 (11c). SVE complex dot products use svld2 for structure loads.
105
105
  *
106
- * Intrinsic Instruction Graviton 3 Graviton 4
107
- * svmla_f32_x FMLA (pred) 4c @ V0123 4c @ V0123
108
- * svmls_f32_x FMLS (pred) 4c @ V0123 4c @ V0123
109
- * svwhilelt_b32 WHILELT 3c @ M0 3c @ M0
110
- * svld2_f32 LD2 (SVE) 8c @ L01+V 8c @ L01+V
111
- * svaddv_f32 FADDV 11c @ V0123 6c @ V0123
106
+ * Intrinsic Instruction Graviton 3 Graviton 4
107
+ * svmla_f32_x FMLA (pred) 4cy @ V0123 4cy @ V0123
108
+ * svmls_f32_x FMLS (pred) 4cy @ V0123 4cy @ V0123
109
+ * svwhilelt_b32 WHILELT 3cy @ M0 3cy @ M0
110
+ * svld2_f32 LD2 (SVE) 8cy @ L01+V 8cy @ L01+V
111
+ * svaddv_f32 FADDV 11cy @ V0123 6cy @ V0123
112
112
  *
113
113
  * @section complex_instructions Complex Number Optimizations
114
114
  *
@@ -121,7 +121,7 @@
121
121
  *
122
122
  * @section references References
123
123
  *
124
- * - x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/
124
+ * - x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
125
125
  * - Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics/
126
126
  *
127
127
  */
@@ -293,16 +293,12 @@ NK_PUBLIC void nk_dot_u1_neon(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t
293
293
  /** @copydoc nk_dot_f16 */
294
294
  NK_PUBLIC void nk_dot_f16_neon(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result);
295
295
 
296
- #endif // NK_TARGET_NEON
297
-
298
- #if NK_TARGET_NEONHALF
299
- /** @copydoc nk_dot_f16 */
300
- NK_PUBLIC void nk_dot_f16_neonhalf(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result);
301
296
  /** @copydoc nk_dot_f16c */
302
- NK_PUBLIC void nk_dot_f16c_neonhalf(nk_f16c_t const *a, nk_f16c_t const *b, nk_size_t n, nk_f32c_t *result);
297
+ NK_PUBLIC void nk_dot_f16c_neon(nk_f16c_t const *a, nk_f16c_t const *b, nk_size_t n, nk_f32c_t *result);
303
298
  /** @copydoc nk_vdot_f16c */
304
- NK_PUBLIC void nk_vdot_f16c_neonhalf(nk_f16c_t const *a, nk_f16c_t const *b, nk_size_t n, nk_f32c_t *result);
305
- #endif // NK_TARGET_NEONHALF
299
+ NK_PUBLIC void nk_vdot_f16c_neon(nk_f16c_t const *a, nk_f16c_t const *b, nk_size_t n, nk_f32c_t *result);
300
+
301
+ #endif // NK_TARGET_NEON
306
302
 
307
303
  #if NK_TARGET_NEONFHM
308
304
  /** @copydoc nk_dot_f16 */
@@ -332,6 +328,13 @@ NK_PUBLIC void nk_dot_e2m3_neonsdot(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_s
332
328
  NK_PUBLIC void nk_dot_e3m2_neonsdot(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
333
329
  #endif // NK_TARGET_NEONSDOT
334
330
 
331
+ #if NK_TARGET_SVESDOT
332
+ /** @copydoc nk_dot_i8 */
333
+ NK_PUBLIC void nk_dot_i8_svesdot(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_i32_t *result);
334
+ /** @copydoc nk_dot_u8 */
335
+ NK_PUBLIC void nk_dot_u8_svesdot(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result);
336
+ #endif // NK_TARGET_SVESDOT
337
+
335
338
  #if NK_TARGET_NEONBFDOT
336
339
  /** @copydoc nk_dot_bf16 */
337
340
  NK_PUBLIC void nk_dot_bf16_neonbfdot(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result);
@@ -345,10 +348,16 @@ NK_PUBLIC void nk_dot_bf16c_neonbfdot(nk_bf16c_t const *a, nk_bf16c_t const *b,
345
348
  NK_PUBLIC void nk_vdot_bf16c_neonbfdot(nk_bf16c_t const *a, nk_bf16c_t const *b, nk_size_t n, nk_f32c_t *result);
346
349
  #endif // NK_TARGET_NEONBFDOT
347
350
 
348
- #if NK_TARGET_SVEBFDOT
349
- /** @copydoc nk_dot_bf16 */
350
- NK_PUBLIC void nk_dot_bf16_svebfdot(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result);
351
- #endif // NK_TARGET_SVEBFDOT
351
+ #if NK_TARGET_NEONFP8
352
+ /** @copydoc nk_dot_e4m3 */
353
+ NK_PUBLIC void nk_dot_e4m3_neonfp8(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
354
+ /** @copydoc nk_dot_e5m2 */
355
+ NK_PUBLIC void nk_dot_e5m2_neonfp8(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
356
+ /** @copydoc nk_dot_e2m3 */
357
+ NK_PUBLIC void nk_dot_e2m3_neonfp8(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
358
+ /** @copydoc nk_dot_e3m2 */
359
+ NK_PUBLIC void nk_dot_e3m2_neonfp8(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
360
+ #endif // NK_TARGET_NEONFP8
352
361
 
353
362
  #if NK_TARGET_SVE
354
363
  /** @copydoc nk_dot_f32 */
@@ -374,6 +383,10 @@ NK_PUBLIC void nk_dot_f16c_svehalf(nk_f16c_t const *a, nk_f16c_t const *b, nk_si
374
383
  NK_PUBLIC void nk_vdot_f16c_svehalf(nk_f16c_t const *a, nk_f16c_t const *b, nk_size_t n, nk_f32c_t *result);
375
384
  #endif // NK_TARGET_SVEHALF
376
385
 
386
+ #if NK_TARGET_SVEBFDOT
387
+ /** @copydoc nk_dot_bf16 */
388
+ NK_PUBLIC void nk_dot_bf16_svebfdot(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result);
389
+ #endif // NK_TARGET_SVEBFDOT
377
390
  #if NK_TARGET_HASWELL
378
391
  /** @copydoc nk_dot_f32 */
379
392
  NK_PUBLIC void nk_dot_f32_haswell(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f64_t *result);
@@ -474,6 +487,8 @@ NK_PUBLIC void nk_dot_e2m3_icelake(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_si
474
487
  NK_PUBLIC void nk_dot_e3m2_icelake(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
475
488
  /** @copydoc nk_dot_u1 */
476
489
  NK_PUBLIC void nk_dot_u1_icelake(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n_bits, nk_u32_t *result);
490
+ /** @copydoc nk_dot_e4m3 */
491
+ NK_PUBLIC void nk_dot_e4m3_icelake(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
477
492
  #endif // NK_TARGET_ICELAKE
478
493
 
479
494
  #if NK_TARGET_GENOA
@@ -484,12 +499,19 @@ NK_PUBLIC void nk_dot_bf16c_genoa(nk_bf16c_t const *a, nk_bf16c_t const *b, nk_s
484
499
  /** @copydoc nk_vdot_bf16c */
485
500
  NK_PUBLIC void nk_vdot_bf16c_genoa(nk_bf16c_t const *a, nk_bf16c_t const *b, nk_size_t n, nk_f32c_t *result);
486
501
 
487
- /** @copydoc nk_dot_e4m3 */
488
- NK_PUBLIC void nk_dot_e4m3_genoa(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
489
502
  /** @copydoc nk_dot_e5m2 */
490
503
  NK_PUBLIC void nk_dot_e5m2_genoa(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
491
504
  #endif // NK_TARGET_GENOA
492
505
 
506
+ #if NK_TARGET_DIAMOND
507
+ /** @copydoc nk_dot_f16 */
508
+ NK_PUBLIC void nk_dot_f16_diamond(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result);
509
+ /** @copydoc nk_dot_e4m3 */
510
+ NK_PUBLIC void nk_dot_e4m3_diamond(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
511
+ /** @copydoc nk_dot_e5m2 */
512
+ NK_PUBLIC void nk_dot_e5m2_diamond(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
513
+ #endif // NK_TARGET_DIAMOND
514
+
493
515
  #if NK_TARGET_ALDER
494
516
  /** @copydoc nk_dot_i8 */
495
517
  NK_PUBLIC void nk_dot_i8_alder(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_i32_t *result);
@@ -638,16 +660,18 @@ NK_INTERNAL nk_dtype_t nk_dot_output_dtype(nk_dtype_t dtype) {
638
660
  #include "numkong/dot/serial.h"
639
661
  #include "numkong/dot/neon.h"
640
662
  #include "numkong/dot/neonsdot.h"
641
- #include "numkong/dot/neonhalf.h"
642
663
  #include "numkong/dot/neonfhm.h"
643
664
  #include "numkong/dot/neonbfdot.h"
665
+ #include "numkong/dot/neonfp8.h"
644
666
  #include "numkong/dot/sve.h"
645
667
  #include "numkong/dot/svehalf.h"
646
668
  #include "numkong/dot/svebfdot.h"
669
+ #include "numkong/dot/svesdot.h"
647
670
  #include "numkong/dot/haswell.h"
648
671
  #include "numkong/dot/skylake.h"
649
672
  #include "numkong/dot/icelake.h"
650
673
  #include "numkong/dot/genoa.h"
674
+ #include "numkong/dot/diamond.h"
651
675
  #include "numkong/dot/sapphire.h"
652
676
  #include "numkong/dot/alder.h"
653
677
  #include "numkong/dot/sierra.h"
@@ -655,7 +679,9 @@ NK_INTERNAL nk_dtype_t nk_dot_output_dtype(nk_dtype_t dtype) {
655
679
  #include "numkong/dot/rvvbb.h"
656
680
  #include "numkong/dot/rvvhalf.h"
657
681
  #include "numkong/dot/rvvbf16.h"
682
+ #include "numkong/dot/powervsx.h"
658
683
  #include "numkong/dot/v128relaxed.h"
684
+ #include "numkong/dot/loongsonasx.h"
659
685
 
660
686
  #if defined(__cplusplus)
661
687
  extern "C" {
@@ -666,8 +692,14 @@ extern "C" {
666
692
  NK_PUBLIC void nk_dot_i8(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_i32_t *result) {
667
693
  #if NK_TARGET_V128RELAXED
668
694
  nk_dot_i8_v128relaxed(a, b, n, result);
695
+ #elif NK_TARGET_POWERVSX
696
+ nk_dot_i8_powervsx(a, b, n, result);
697
+ #elif NK_TARGET_LOONGSONASX
698
+ nk_dot_i8_loongsonasx(a, b, n, result);
669
699
  #elif NK_TARGET_RVV
670
700
  nk_dot_i8_rvv(a, b, n, result);
701
+ #elif NK_TARGET_SVESDOT
702
+ nk_dot_i8_svesdot(a, b, n, result);
671
703
  #elif NK_TARGET_NEONSDOT
672
704
  nk_dot_i8_neonsdot(a, b, n, result);
673
705
  #elif NK_TARGET_ICELAKE
@@ -688,8 +720,14 @@ NK_PUBLIC void nk_dot_i8(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_i32
688
720
  NK_PUBLIC void nk_dot_u8(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result) {
689
721
  #if NK_TARGET_V128RELAXED
690
722
  nk_dot_u8_v128relaxed(a, b, n, result);
723
+ #elif NK_TARGET_POWERVSX
724
+ nk_dot_u8_powervsx(a, b, n, result);
725
+ #elif NK_TARGET_LOONGSONASX
726
+ nk_dot_u8_loongsonasx(a, b, n, result);
691
727
  #elif NK_TARGET_RVV
692
728
  nk_dot_u8_rvv(a, b, n, result);
729
+ #elif NK_TARGET_SVESDOT
730
+ nk_dot_u8_svesdot(a, b, n, result);
693
731
  #elif NK_TARGET_NEONSDOT
694
732
  nk_dot_u8_neonsdot(a, b, n, result);
695
733
  #elif NK_TARGET_ICELAKE
@@ -746,6 +784,8 @@ NK_PUBLIC void nk_dot_u1(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n_bit
746
784
  nk_dot_u1_haswell(a, b, n_bits, result);
747
785
  #elif NK_TARGET_V128RELAXED
748
786
  nk_dot_u1_v128relaxed(a, b, n_bits, result);
787
+ #elif NK_TARGET_POWERVSX
788
+ nk_dot_u1_powervsx(a, b, n_bits, result);
749
789
  #elif NK_TARGET_RVVBB
750
790
  nk_dot_u1_rvvbb(a, b, n_bits, result);
751
791
  #elif NK_TARGET_RVV
@@ -760,6 +800,8 @@ NK_PUBLIC void nk_dot_u1(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n_bit
760
800
  NK_PUBLIC void nk_dot_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result) {
761
801
  #if NK_TARGET_V128RELAXED
762
802
  nk_dot_f16_v128relaxed(a, b, n, result);
803
+ #elif NK_TARGET_POWERVSX
804
+ nk_dot_f16_powervsx(a, b, n, result);
763
805
  #elif NK_TARGET_RVVHALF
764
806
  nk_dot_f16_rvvhalf(a, b, n, result);
765
807
  #elif NK_TARGET_RVV
@@ -768,10 +810,10 @@ NK_PUBLIC void nk_dot_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_
768
810
  nk_dot_f16_svehalf(a, b, n, result);
769
811
  #elif NK_TARGET_NEONFHM
770
812
  nk_dot_f16_neonfhm(a, b, n, result);
771
- #elif NK_TARGET_NEONHALF
772
- nk_dot_f16_neonhalf(a, b, n, result);
773
813
  #elif NK_TARGET_NEON
774
814
  nk_dot_f16_neon(a, b, n, result);
815
+ #elif NK_TARGET_DIAMOND
816
+ nk_dot_f16_diamond(a, b, n, result);
775
817
  #elif NK_TARGET_SKYLAKE
776
818
  nk_dot_f16_skylake(a, b, n, result);
777
819
  #elif NK_TARGET_HASWELL
@@ -784,6 +826,10 @@ NK_PUBLIC void nk_dot_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_
784
826
  NK_PUBLIC void nk_dot_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result) {
785
827
  #if NK_TARGET_V128RELAXED
786
828
  nk_dot_bf16_v128relaxed(a, b, n, result);
829
+ #elif NK_TARGET_POWERVSX
830
+ nk_dot_bf16_powervsx(a, b, n, result);
831
+ #elif NK_TARGET_LOONGSONASX
832
+ nk_dot_bf16_loongsonasx(a, b, n, result);
787
833
  #elif NK_TARGET_GENOA
788
834
  nk_dot_bf16_genoa(a, b, n, result);
789
835
  #elif NK_TARGET_RVVBF16
@@ -806,8 +852,12 @@ NK_PUBLIC void nk_dot_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n,
806
852
  }
807
853
 
808
854
  NK_PUBLIC void nk_dot_e4m3(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result) {
809
- #if NK_TARGET_GENOA
810
- nk_dot_e4m3_genoa(a, b, n, result);
855
+ #if NK_TARGET_DIAMOND
856
+ nk_dot_e4m3_diamond(a, b, n, result);
857
+ #elif NK_TARGET_ICELAKE
858
+ nk_dot_e4m3_icelake(a, b, n, result);
859
+ #elif NK_TARGET_NEONFP8
860
+ nk_dot_e4m3_neonfp8(a, b, n, result);
811
861
  #elif NK_TARGET_NEONBFDOT
812
862
  nk_dot_e4m3_neonbfdot(a, b, n, result);
813
863
  #elif NK_TARGET_NEONFHM
@@ -832,8 +882,12 @@ NK_PUBLIC void nk_dot_e4m3(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n,
832
882
  }
833
883
 
834
884
  NK_PUBLIC void nk_dot_e5m2(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result) {
835
- #if NK_TARGET_GENOA
885
+ #if NK_TARGET_DIAMOND
886
+ nk_dot_e5m2_diamond(a, b, n, result);
887
+ #elif NK_TARGET_GENOA
836
888
  nk_dot_e5m2_genoa(a, b, n, result);
889
+ #elif NK_TARGET_NEONFP8
890
+ nk_dot_e5m2_neonfp8(a, b, n, result);
837
891
  #elif NK_TARGET_NEONBFDOT
838
892
  nk_dot_e5m2_neonbfdot(a, b, n, result);
839
893
  #elif NK_TARGET_NEONFHM
@@ -858,7 +912,9 @@ NK_PUBLIC void nk_dot_e5m2(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n,
858
912
  }
859
913
 
860
914
  NK_PUBLIC void nk_dot_e2m3(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result) {
861
- #if NK_TARGET_ICELAKE
915
+ #if NK_TARGET_NEONFP8
916
+ nk_dot_e2m3_neonfp8(a, b, n, result);
917
+ #elif NK_TARGET_ICELAKE
862
918
  nk_dot_e2m3_icelake(a, b, n, result);
863
919
  #elif NK_TARGET_SKYLAKE
864
920
  nk_dot_e2m3_skylake(a, b, n, result);
@@ -882,7 +938,9 @@ NK_PUBLIC void nk_dot_e2m3(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n,
882
938
  }
883
939
 
884
940
  NK_PUBLIC void nk_dot_e3m2(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result) {
885
- #if NK_TARGET_ICELAKE
941
+ #if NK_TARGET_NEONFP8
942
+ nk_dot_e3m2_neonfp8(a, b, n, result);
943
+ #elif NK_TARGET_ICELAKE
886
944
  nk_dot_e3m2_icelake(a, b, n, result);
887
945
  #elif NK_TARGET_NEONSDOT
888
946
  nk_dot_e3m2_neonsdot(a, b, n, result);
@@ -904,6 +962,10 @@ NK_PUBLIC void nk_dot_e3m2(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n,
904
962
  NK_PUBLIC void nk_dot_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f64_t *result) {
905
963
  #if NK_TARGET_V128RELAXED
906
964
  nk_dot_f32_v128relaxed(a, b, n, result);
965
+ #elif NK_TARGET_POWERVSX
966
+ nk_dot_f32_powervsx(a, b, n, result);
967
+ #elif NK_TARGET_LOONGSONASX
968
+ nk_dot_f32_loongsonasx(a, b, n, result);
907
969
  #elif NK_TARGET_RVV
908
970
  nk_dot_f32_rvv(a, b, n, result);
909
971
  #elif NK_TARGET_SVE
@@ -922,6 +984,10 @@ NK_PUBLIC void nk_dot_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_
922
984
  NK_PUBLIC void nk_dot_f64(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *result) {
923
985
  #if NK_TARGET_V128RELAXED
924
986
  nk_dot_f64_v128relaxed(a, b, n, result);
987
+ #elif NK_TARGET_POWERVSX
988
+ nk_dot_f64_powervsx(a, b, n, result);
989
+ #elif NK_TARGET_LOONGSONASX
990
+ nk_dot_f64_loongsonasx(a, b, n, result);
925
991
  #elif NK_TARGET_RVV
926
992
  nk_dot_f64_rvv(a, b, n, result);
927
993
  #elif NK_TARGET_SVE
@@ -942,8 +1008,8 @@ NK_PUBLIC void nk_dot_f16c(nk_f16c_t const *a, nk_f16c_t const *b, nk_size_t n,
942
1008
  nk_dot_f16c_svehalf(a, b, n, result);
943
1009
  #elif NK_TARGET_NEONFHM
944
1010
  nk_dot_f16c_neonfhm(a, b, n, result);
945
- #elif NK_TARGET_NEONHALF
946
- nk_dot_f16c_neonhalf(a, b, n, result);
1011
+ #elif NK_TARGET_NEON
1012
+ nk_dot_f16c_neon(a, b, n, result);
947
1013
  #elif NK_TARGET_HASWELL
948
1014
  nk_dot_f16c_haswell(a, b, n, result);
949
1015
  #else
@@ -1004,8 +1070,8 @@ NK_PUBLIC void nk_vdot_f16c(nk_f16c_t const *a, nk_f16c_t const *b, nk_size_t n,
1004
1070
  nk_vdot_f16c_svehalf(a, b, n, result);
1005
1071
  #elif NK_TARGET_NEONFHM
1006
1072
  nk_vdot_f16c_neonfhm(a, b, n, result);
1007
- #elif NK_TARGET_NEONHALF
1008
- nk_vdot_f16c_neonhalf(a, b, n, result);
1073
+ #elif NK_TARGET_NEON
1074
+ nk_vdot_f16c_neon(a, b, n, result);
1009
1075
  #elif NK_TARGET_HASWELL
1010
1076
  nk_vdot_f16c_haswell(a, b, n, result);
1011
1077
  #else