numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315) hide show
  1. package/README.md +197 -124
  2. package/binding.gyp +34 -484
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
package/binding.gyp CHANGED
@@ -1,5 +1,7 @@
1
1
  {
2
- "variables": {"openssl_fips": ""},
2
+ "variables": {
3
+ "openssl_fips": ""
4
+ },
3
5
  "targets": [
4
6
  {
5
7
  "target_name": "numkong",
@@ -31,8 +33,14 @@
31
33
  "c/dispatch_e2m3.c",
32
34
  "c/dispatch_e3m2.c",
33
35
  ],
34
- "include_dirs": ["include"],
35
- "defines": ["NK_NATIVE_F16=0", "NK_NATIVE_BF16=0", "NK_DYNAMIC_DISPATCH=1"],
36
+ "include_dirs": [
37
+ "include"
38
+ ],
39
+ "defines": [
40
+ "NK_NATIVE_F16=0",
41
+ "NK_NATIVE_BF16=0",
42
+ "NK_DYNAMIC_DISPATCH=1"
43
+ ],
36
44
  "cflags": [
37
45
  "-std=c11",
38
46
  "-O3",
@@ -41,500 +49,42 @@
41
49
  "-Wno-cast-function-type",
42
50
  "-Wno-switch",
43
51
  "-Wno-psabi",
52
+ "-include",
53
+ "<(module_root_dir)/nk_probes.h",
44
54
  ],
55
+ "msvs_settings": {
56
+ "VCCLCompilerTool": {
57
+ "ForcedIncludeFiles": [
58
+ "<(module_root_dir)/nk_probes.h"
59
+ ],
60
+ },
61
+ },
45
62
  "conditions": [
46
63
  [
47
- "OS=='linux'",
64
+ "OS=='mac'",
48
65
  {
49
- "conditions": [
50
- [
51
- "target_arch=='x64'",
52
- {
53
- "defines": [
54
- "NK_TARGET_HASWELL=1",
55
- "NK_TARGET_SKYLAKE=1",
56
- "NK_TARGET_ICELAKE=1",
57
- "NK_TARGET_GENOA=1",
58
- "NK_TARGET_SAPPHIRE=1",
59
- "NK_TARGET_TURIN=1",
60
- "NK_TARGET_ALDER=1",
61
- "NK_TARGET_SIERRA=1",
62
- "NK_TARGET_SAPPHIREAMX=1",
63
- "NK_TARGET_GRANITEAMX=1",
64
- "NK_TARGET_NEON=0",
65
- "NK_TARGET_NEONHALF=0",
66
- "NK_TARGET_NEONSDOT=0",
67
- "NK_TARGET_NEONBFDOT=0",
68
- "NK_TARGET_NEONFHM=0",
69
- "NK_TARGET_SVE=0",
70
- "NK_TARGET_SVEHALF=0",
71
- "NK_TARGET_SVEBFDOT=0",
72
- "NK_TARGET_SVESDOT=0",
73
- "NK_TARGET_SVE2=0",
74
- "NK_TARGET_SVE2P1=0",
75
- "NK_TARGET_SME=0",
76
- "NK_TARGET_SME2=0",
77
- "NK_TARGET_SME2P1=0",
78
- "NK_TARGET_SMEF64=0",
79
- "NK_TARGET_SMEHALF=0",
80
- "NK_TARGET_SMEBF16=0",
81
- "NK_TARGET_SMEBI32=0",
82
- "NK_TARGET_SMELUT2=0",
83
- "NK_TARGET_SMEFA64=0",
84
- "NK_TARGET_RVV=0",
85
- "NK_TARGET_RVVHALF=0",
86
- "NK_TARGET_RVVBF16=0",
87
- "NK_TARGET_RVVBB=0",
88
- "NK_TARGET_V128RELAXED=0",
89
- ]
90
- },
91
- ],
92
- [
93
- "target_arch=='arm64'",
94
- {
95
- "defines": [
96
- "NK_TARGET_HASWELL=0",
97
- "NK_TARGET_SKYLAKE=0",
98
- "NK_TARGET_ICELAKE=0",
99
- "NK_TARGET_GENOA=0",
100
- "NK_TARGET_SAPPHIRE=0",
101
- "NK_TARGET_TURIN=0",
102
- "NK_TARGET_ALDER=0",
103
- "NK_TARGET_SIERRA=0",
104
- "NK_TARGET_SAPPHIREAMX=0",
105
- "NK_TARGET_GRANITEAMX=0",
106
- "NK_TARGET_NEON=1",
107
- "NK_TARGET_NEONHALF=1",
108
- "NK_TARGET_NEONSDOT=1",
109
- "NK_TARGET_NEONBFDOT=1",
110
- "NK_TARGET_NEONFHM=1",
111
- "NK_TARGET_SVE=1",
112
- "NK_TARGET_SVEHALF=1",
113
- "NK_TARGET_SVEBFDOT=1",
114
- "NK_TARGET_SVESDOT=1",
115
- "NK_TARGET_SVE2=1",
116
- "NK_TARGET_SVE2P1=1",
117
- "NK_TARGET_SME=1",
118
- "NK_TARGET_SME2=1",
119
- "NK_TARGET_SME2P1=1",
120
- "NK_TARGET_SMEF64=1",
121
- "NK_TARGET_SMEHALF=1",
122
- "NK_TARGET_SMEBF16=1",
123
- "NK_TARGET_SMEBI32=1",
124
- "NK_TARGET_SMELUT2=1",
125
- "NK_TARGET_SMEFA64=1",
126
- "NK_TARGET_RVV=0",
127
- "NK_TARGET_RVVHALF=0",
128
- "NK_TARGET_RVVBF16=0",
129
- "NK_TARGET_RVVBB=0",
130
- "NK_TARGET_V128RELAXED=0",
131
- ]
132
- },
133
- ],
134
- [
135
- "target_arch=='riscv64'",
136
- {
137
- "defines": [
138
- "NK_TARGET_HASWELL=0",
139
- "NK_TARGET_SKYLAKE=0",
140
- "NK_TARGET_ICELAKE=0",
141
- "NK_TARGET_GENOA=0",
142
- "NK_TARGET_SAPPHIRE=0",
143
- "NK_TARGET_TURIN=0",
144
- "NK_TARGET_ALDER=0",
145
- "NK_TARGET_SIERRA=0",
146
- "NK_TARGET_SAPPHIREAMX=0",
147
- "NK_TARGET_GRANITEAMX=0",
148
- "NK_TARGET_NEON=0",
149
- "NK_TARGET_NEONHALF=0",
150
- "NK_TARGET_NEONSDOT=0",
151
- "NK_TARGET_NEONBFDOT=0",
152
- "NK_TARGET_NEONFHM=0",
153
- "NK_TARGET_SVE=0",
154
- "NK_TARGET_SVEHALF=0",
155
- "NK_TARGET_SVEBFDOT=0",
156
- "NK_TARGET_SVESDOT=0",
157
- "NK_TARGET_SVE2=0",
158
- "NK_TARGET_SVE2P1=0",
159
- "NK_TARGET_SME=0",
160
- "NK_TARGET_SME2=0",
161
- "NK_TARGET_SME2P1=0",
162
- "NK_TARGET_SMEF64=0",
163
- "NK_TARGET_SMEHALF=0",
164
- "NK_TARGET_SMEBF16=0",
165
- "NK_TARGET_SMEBI32=0",
166
- "NK_TARGET_SMELUT2=0",
167
- "NK_TARGET_SMEFA64=0",
168
- "NK_TARGET_RVV=1",
169
- "NK_TARGET_RVVHALF=1",
170
- "NK_TARGET_RVVBF16=1",
171
- "NK_TARGET_RVVBB=1",
172
- "NK_TARGET_V128RELAXED=0",
173
- ]
174
- },
175
- ],
176
- [
177
- "target_arch=='wasm32' or target_arch=='wasm64'",
178
- {
179
- "defines": [
180
- "NK_TARGET_HASWELL=0",
181
- "NK_TARGET_SKYLAKE=0",
182
- "NK_TARGET_ICELAKE=0",
183
- "NK_TARGET_GENOA=0",
184
- "NK_TARGET_SAPPHIRE=0",
185
- "NK_TARGET_TURIN=0",
186
- "NK_TARGET_ALDER=0",
187
- "NK_TARGET_SIERRA=0",
188
- "NK_TARGET_SAPPHIREAMX=0",
189
- "NK_TARGET_GRANITEAMX=0",
190
- "NK_TARGET_NEON=0",
191
- "NK_TARGET_NEONHALF=0",
192
- "NK_TARGET_NEONSDOT=0",
193
- "NK_TARGET_NEONBFDOT=0",
194
- "NK_TARGET_NEONFHM=0",
195
- "NK_TARGET_SVE=0",
196
- "NK_TARGET_SVEHALF=0",
197
- "NK_TARGET_SVEBFDOT=0",
198
- "NK_TARGET_SVESDOT=0",
199
- "NK_TARGET_SVE2=0",
200
- "NK_TARGET_SVE2P1=0",
201
- "NK_TARGET_SME=0",
202
- "NK_TARGET_SME2=0",
203
- "NK_TARGET_SME2P1=0",
204
- "NK_TARGET_SMEF64=0",
205
- "NK_TARGET_SMEHALF=0",
206
- "NK_TARGET_SMEBF16=0",
207
- "NK_TARGET_SMEBI32=0",
208
- "NK_TARGET_SMELUT2=0",
209
- "NK_TARGET_SMEFA64=0",
210
- "NK_TARGET_RVV=0",
211
- "NK_TARGET_RVVHALF=0",
212
- "NK_TARGET_RVVBF16=0",
213
- "NK_TARGET_RVVBB=0",
214
- "NK_TARGET_V128RELAXED=1",
215
- ]
216
- },
217
- ],
218
- ]
219
- },
66
+ "xcode_settings": {
67
+ "MACOSX_DEPLOYMENT_TARGET": "11.0"
68
+ }
69
+ }
220
70
  ],
221
71
  [
222
- "OS=='freebsd'",
72
+ "OS=='win' and target_arch=='arm64'",
223
73
  {
224
- "conditions": [
225
- [
226
- "target_arch=='x64'",
227
- {
228
- "defines": [
229
- "NK_TARGET_HASWELL=1",
230
- "NK_TARGET_SKYLAKE=1",
231
- "NK_TARGET_ICELAKE=1",
232
- "NK_TARGET_GENOA=1",
233
- "NK_TARGET_SAPPHIRE=1",
234
- "NK_TARGET_TURIN=1",
235
- "NK_TARGET_ALDER=1",
236
- "NK_TARGET_SIERRA=1",
237
- "NK_TARGET_SAPPHIREAMX=0",
238
- "NK_TARGET_GRANITEAMX=0",
239
- "NK_TARGET_NEON=0",
240
- "NK_TARGET_NEONHALF=0",
241
- "NK_TARGET_NEONSDOT=0",
242
- "NK_TARGET_NEONBFDOT=0",
243
- "NK_TARGET_NEONFHM=0",
244
- "NK_TARGET_SVE=0",
245
- "NK_TARGET_SVEHALF=0",
246
- "NK_TARGET_SVEBFDOT=0",
247
- "NK_TARGET_SVESDOT=0",
248
- "NK_TARGET_SVE2=0",
249
- "NK_TARGET_SVE2P1=0",
250
- "NK_TARGET_SME=0",
251
- "NK_TARGET_SME2=0",
252
- "NK_TARGET_SME2P1=0",
253
- "NK_TARGET_SMEF64=0",
254
- "NK_TARGET_SMEHALF=0",
255
- "NK_TARGET_SMEBF16=0",
256
- "NK_TARGET_SMEBI32=0",
257
- "NK_TARGET_SMELUT2=0",
258
- "NK_TARGET_SMEFA64=0",
259
- "NK_TARGET_RVV=0",
260
- "NK_TARGET_RVVHALF=0",
261
- "NK_TARGET_RVVBF16=0",
262
- "NK_TARGET_RVVBB=0",
263
- "NK_TARGET_V128RELAXED=0",
264
- ]
265
- },
266
- ],
267
- [
268
- "target_arch=='arm64'",
269
- {
270
- "defines": [
271
- "NK_TARGET_HASWELL=0",
272
- "NK_TARGET_SKYLAKE=0",
273
- "NK_TARGET_ICELAKE=0",
274
- "NK_TARGET_GENOA=0",
275
- "NK_TARGET_SAPPHIRE=0",
276
- "NK_TARGET_TURIN=0",
277
- "NK_TARGET_ALDER=0",
278
- "NK_TARGET_SIERRA=0",
279
- "NK_TARGET_SAPPHIREAMX=0",
280
- "NK_TARGET_GRANITEAMX=0",
281
- "NK_TARGET_NEON=1",
282
- "NK_TARGET_NEONHALF=1",
283
- "NK_TARGET_NEONSDOT=1",
284
- "NK_TARGET_NEONBFDOT=1",
285
- "NK_TARGET_NEONFHM=1",
286
- "NK_TARGET_SVE=1",
287
- "NK_TARGET_SVEHALF=1",
288
- "NK_TARGET_SVEBFDOT=1",
289
- "NK_TARGET_SVESDOT=1",
290
- "NK_TARGET_SVE2=1",
291
- "NK_TARGET_SVE2P1=1",
292
- "NK_TARGET_SME=1",
293
- "NK_TARGET_SME2=1",
294
- "NK_TARGET_SME2P1=1",
295
- "NK_TARGET_SMEF64=1",
296
- "NK_TARGET_SMEHALF=1",
297
- "NK_TARGET_SMEBF16=1",
298
- "NK_TARGET_SMEBI32=1",
299
- "NK_TARGET_SMELUT2=1",
300
- "NK_TARGET_SMEFA64=1",
301
- "NK_TARGET_RVV=0",
302
- "NK_TARGET_RVVHALF=0",
303
- "NK_TARGET_RVVBF16=0",
304
- "NK_TARGET_RVVBB=0",
305
- "NK_TARGET_V128RELAXED=0",
306
- ]
307
- },
308
- ],
309
- [
310
- "target_arch=='riscv64'",
311
- {
312
- "defines": [
313
- "NK_TARGET_HASWELL=0",
314
- "NK_TARGET_SKYLAKE=0",
315
- "NK_TARGET_ICELAKE=0",
316
- "NK_TARGET_GENOA=0",
317
- "NK_TARGET_SAPPHIRE=0",
318
- "NK_TARGET_TURIN=0",
319
- "NK_TARGET_ALDER=0",
320
- "NK_TARGET_SIERRA=0",
321
- "NK_TARGET_SAPPHIREAMX=0",
322
- "NK_TARGET_GRANITEAMX=0",
323
- "NK_TARGET_NEON=0",
324
- "NK_TARGET_NEONHALF=0",
325
- "NK_TARGET_NEONSDOT=0",
326
- "NK_TARGET_NEONBFDOT=0",
327
- "NK_TARGET_NEONFHM=0",
328
- "NK_TARGET_SVE=0",
329
- "NK_TARGET_SVEHALF=0",
330
- "NK_TARGET_SVEBFDOT=0",
331
- "NK_TARGET_SVESDOT=0",
332
- "NK_TARGET_SVE2=0",
333
- "NK_TARGET_SVE2P1=0",
334
- "NK_TARGET_SME=0",
335
- "NK_TARGET_SME2=0",
336
- "NK_TARGET_SME2P1=0",
337
- "NK_TARGET_SMEF64=0",
338
- "NK_TARGET_SMEHALF=0",
339
- "NK_TARGET_SMEBF16=0",
340
- "NK_TARGET_SMEBI32=0",
341
- "NK_TARGET_SMELUT2=0",
342
- "NK_TARGET_SMEFA64=0",
343
- "NK_TARGET_RVV=1",
344
- "NK_TARGET_RVVHALF=1",
345
- "NK_TARGET_RVVBF16=1",
346
- "NK_TARGET_RVVBB=1",
347
- "NK_TARGET_V128RELAXED=0",
348
- ]
349
- },
350
- ],
74
+ "defines": [
75
+ "_ARM64_"
351
76
  ]
352
- },
353
- ],
354
- [
355
- "OS=='mac'",
356
- {
357
- "xcode_settings": {"MACOSX_DEPLOYMENT_TARGET": "11.0"},
358
- "conditions": [
359
- [
360
- "target_arch=='x64'",
361
- {
362
- "defines": [
363
- "NK_TARGET_HASWELL=1",
364
- "NK_TARGET_SKYLAKE=0",
365
- "NK_TARGET_ICELAKE=0",
366
- "NK_TARGET_GENOA=0",
367
- "NK_TARGET_SAPPHIRE=0",
368
- "NK_TARGET_TURIN=0",
369
- "NK_TARGET_ALDER=0",
370
- "NK_TARGET_SIERRA=0",
371
- "NK_TARGET_SAPPHIREAMX=0",
372
- "NK_TARGET_GRANITEAMX=0",
373
- "NK_TARGET_NEON=0",
374
- "NK_TARGET_NEONHALF=0",
375
- "NK_TARGET_NEONSDOT=0",
376
- "NK_TARGET_NEONBFDOT=0",
377
- "NK_TARGET_NEONFHM=0",
378
- "NK_TARGET_SVE=0",
379
- "NK_TARGET_SVEHALF=0",
380
- "NK_TARGET_SVEBFDOT=0",
381
- "NK_TARGET_SVESDOT=0",
382
- "NK_TARGET_SVE2=0",
383
- "NK_TARGET_SVE2P1=0",
384
- "NK_TARGET_SME=0",
385
- "NK_TARGET_SME2=0",
386
- "NK_TARGET_SME2P1=0",
387
- "NK_TARGET_SMEF64=0",
388
- "NK_TARGET_SMEHALF=0",
389
- "NK_TARGET_SMEBF16=0",
390
- "NK_TARGET_SMEBI32=0",
391
- "NK_TARGET_SMELUT2=0",
392
- "NK_TARGET_SMEFA64=0",
393
- "NK_TARGET_RVV=0",
394
- "NK_TARGET_RVVHALF=0",
395
- "NK_TARGET_RVVBF16=0",
396
- "NK_TARGET_RVVBB=0",
397
- "NK_TARGET_V128RELAXED=0",
398
- ]
399
- },
400
- ],
401
- [
402
- "target_arch=='arm64'",
403
- {
404
- "defines": [
405
- "NK_TARGET_HASWELL=0",
406
- "NK_TARGET_SKYLAKE=0",
407
- "NK_TARGET_ICELAKE=0",
408
- "NK_TARGET_GENOA=0",
409
- "NK_TARGET_SAPPHIRE=0",
410
- "NK_TARGET_TURIN=0",
411
- "NK_TARGET_ALDER=0",
412
- "NK_TARGET_SIERRA=0",
413
- "NK_TARGET_SAPPHIREAMX=0",
414
- "NK_TARGET_GRANITEAMX=0",
415
- "NK_TARGET_NEON=1",
416
- "NK_TARGET_NEONHALF=1",
417
- "NK_TARGET_NEONSDOT=1",
418
- "NK_TARGET_NEONBFDOT=1",
419
- "NK_TARGET_NEONFHM=1",
420
- "NK_TARGET_SVE=0",
421
- "NK_TARGET_SVEHALF=0",
422
- "NK_TARGET_SVEBFDOT=0",
423
- "NK_TARGET_SVESDOT=0",
424
- "NK_TARGET_SVE2=0",
425
- "NK_TARGET_SVE2P1=0",
426
- "NK_TARGET_SME=1",
427
- "NK_TARGET_SME2=1",
428
- "NK_TARGET_SME2P1=1",
429
- "NK_TARGET_SMEF64=1",
430
- "NK_TARGET_SMEHALF=1",
431
- "NK_TARGET_SMEBF16=1",
432
- "NK_TARGET_SMEBI32=1",
433
- "NK_TARGET_SMELUT2=1",
434
- "NK_TARGET_SMEFA64=1",
435
- "NK_TARGET_RVV=0",
436
- "NK_TARGET_RVVHALF=0",
437
- "NK_TARGET_RVVBF16=0",
438
- "NK_TARGET_RVVBB=0",
439
- "NK_TARGET_V128RELAXED=0",
440
- ]
441
- },
442
- ],
443
- ],
444
- },
77
+ }
445
78
  ],
446
79
  [
447
- "OS=='win'",
80
+ "OS=='win' and target_arch=='x64'",
448
81
  {
449
- "conditions": [
450
- [
451
- "target_arch=='x64'",
452
- {
453
- "defines": [
454
- "NK_TARGET_HASWELL=1",
455
- "NK_TARGET_SKYLAKE=1",
456
- "NK_TARGET_ICELAKE=1",
457
- "NK_TARGET_GENOA=0",
458
- "NK_TARGET_SAPPHIRE=1",
459
- "NK_TARGET_TURIN=0",
460
- "NK_TARGET_ALDER=0",
461
- "NK_TARGET_SIERRA=0",
462
- "NK_TARGET_SAPPHIREAMX=0",
463
- "NK_TARGET_GRANITEAMX=0",
464
- "NK_TARGET_NEON=0",
465
- "NK_TARGET_NEONHALF=0",
466
- "NK_TARGET_NEONSDOT=0",
467
- "NK_TARGET_NEONBFDOT=0",
468
- "NK_TARGET_NEONFHM=0",
469
- "NK_TARGET_SVE=0",
470
- "NK_TARGET_SVEHALF=0",
471
- "NK_TARGET_SVEBFDOT=0",
472
- "NK_TARGET_SVESDOT=0",
473
- "NK_TARGET_SVE2=0",
474
- "NK_TARGET_SVE2P1=0",
475
- "NK_TARGET_SME=0",
476
- "NK_TARGET_SME2=0",
477
- "NK_TARGET_SME2P1=0",
478
- "NK_TARGET_SMEF64=0",
479
- "NK_TARGET_SMEHALF=0",
480
- "NK_TARGET_SMEBF16=0",
481
- "NK_TARGET_SMEBI32=0",
482
- "NK_TARGET_SMELUT2=0",
483
- "NK_TARGET_SMEFA64=0",
484
- "NK_TARGET_RVV=0",
485
- "NK_TARGET_RVVHALF=0",
486
- "NK_TARGET_RVVBF16=0",
487
- "NK_TARGET_RVVBB=0",
488
- "NK_TARGET_V128RELAXED=0",
489
- ]
490
- },
491
- ],
492
- [
493
- "target_arch=='arm64'",
494
- {
495
- "defines": [
496
- "NK_TARGET_HASWELL=0",
497
- "NK_TARGET_SKYLAKE=0",
498
- "NK_TARGET_ICELAKE=0",
499
- "NK_TARGET_GENOA=0",
500
- "NK_TARGET_SAPPHIRE=0",
501
- "NK_TARGET_TURIN=0",
502
- "NK_TARGET_ALDER=0",
503
- "NK_TARGET_SIERRA=0",
504
- "NK_TARGET_SAPPHIREAMX=0",
505
- "NK_TARGET_GRANITEAMX=0",
506
- "NK_TARGET_NEON=1",
507
- "NK_TARGET_NEONHALF=0",
508
- "NK_TARGET_NEONSDOT=1",
509
- "NK_TARGET_NEONBFDOT=0",
510
- "NK_TARGET_NEONFHM=0",
511
- "NK_TARGET_SVE=0",
512
- "NK_TARGET_SVEHALF=0",
513
- "NK_TARGET_SVEBFDOT=0",
514
- "NK_TARGET_SVESDOT=0",
515
- "NK_TARGET_SVE2=0",
516
- "NK_TARGET_SVE2P1=0",
517
- "NK_TARGET_SME=0",
518
- "NK_TARGET_SME2=0",
519
- "NK_TARGET_SME2P1=0",
520
- "NK_TARGET_SMEF64=0",
521
- "NK_TARGET_SMEHALF=0",
522
- "NK_TARGET_SMEBF16=0",
523
- "NK_TARGET_SMEBI32=0",
524
- "NK_TARGET_SMELUT2=0",
525
- "NK_TARGET_SMEFA64=0",
526
- "NK_TARGET_RVV=0",
527
- "NK_TARGET_RVVHALF=0",
528
- "NK_TARGET_RVVBF16=0",
529
- "NK_TARGET_RVVBB=0",
530
- "NK_TARGET_V128RELAXED=0",
531
- ]
532
- },
533
- ],
82
+ "defines": [
83
+ "_AMD64_"
534
84
  ]
535
- },
85
+ }
536
86
  ],
537
87
  ],
538
88
  }
539
89
  ],
540
- }
90
+ }
package/c/dispatch_bf16.c CHANGED
@@ -44,6 +44,10 @@ void nk_dispatch_bf16_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_pun
44
44
  return;
45
45
  case nk_kernel_maxsim_pack_k: *m = (m_t)&nk_maxsim_pack_bf16_v128relaxed, *c = nk_cap_v128relaxed_k; return;
46
46
  case nk_kernel_maxsim_packed_k: *m = (m_t)&nk_maxsim_packed_bf16_v128relaxed, *c = nk_cap_v128relaxed_k; return;
47
+ case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_bf16_v128relaxed, *c = nk_cap_v128relaxed_k; return;
48
+ case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_bf16_v128relaxed, *c = nk_cap_v128relaxed_k; return;
49
+ case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_bf16_v128relaxed, *c = nk_cap_v128relaxed_k; return;
50
+ case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_bf16_v128relaxed, *c = nk_cap_v128relaxed_k; return;
47
51
  default: break;
48
52
  }
49
53
  #endif
@@ -94,7 +98,6 @@ void nk_dispatch_bf16_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_pun
94
98
  case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_bf16_neonbfdot, *c = nk_cap_neonbfdot_k; return;
95
99
  case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_bf16_neonbfdot, *c = nk_cap_neonbfdot_k; return;
96
100
  case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_bf16_neonbfdot, *c = nk_cap_neonbfdot_k; return;
97
- case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_bf16_neonbfdot, *c = nk_cap_neonbfdot_k; return;
98
101
  case nk_kernel_dots_packed_size_k:
99
102
  *m = (m_t)&nk_dots_packed_size_bf16_neonbfdot, *c = nk_cap_neonbfdot_k;
100
103
  return;
@@ -213,6 +216,9 @@ void nk_dispatch_bf16_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_pun
213
216
  return;
214
217
  case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_bf16_skylake, *c = nk_cap_skylake_k; return;
215
218
  case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_bf16_skylake, *c = nk_cap_skylake_k; return;
219
+ case nk_kernel_rmsd_k: *m = (m_t)&nk_rmsd_bf16_skylake, *c = nk_cap_skylake_k; return;
220
+ case nk_kernel_kabsch_k: *m = (m_t)&nk_kabsch_bf16_skylake, *c = nk_cap_skylake_k; return;
221
+ case nk_kernel_umeyama_k: *m = (m_t)&nk_umeyama_bf16_skylake, *c = nk_cap_skylake_k; return;
216
222
  default: break;
217
223
  }
218
224
  #endif
@@ -309,6 +315,58 @@ void nk_dispatch_bf16_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_pun
309
315
  case nk_kernel_umeyama_k: *m = (m_t)&nk_umeyama_bf16_rvv, *c = nk_cap_rvv_k; return;
310
316
  default: break;
311
317
  }
318
+ #endif
319
+ #if NK_TARGET_POWERVSX
320
+ if (v & nk_cap_powervsx_k) switch (k) {
321
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_bf16_powervsx, *c = nk_cap_powervsx_k; return;
322
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_bf16_powervsx, *c = nk_cap_powervsx_k; return;
323
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_bf16_powervsx, *c = nk_cap_powervsx_k; return;
324
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_bf16_powervsx, *c = nk_cap_powervsx_k; return;
325
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_bf16_powervsx, *c = nk_cap_powervsx_k; return;
326
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_bf16_powervsx, *c = nk_cap_powervsx_k; return;
327
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_bf16_powervsx, *c = nk_cap_powervsx_k; return;
328
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_bf16_powervsx, *c = nk_cap_powervsx_k; return;
329
+ case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_bf16_powervsx, *c = nk_cap_powervsx_k; return;
330
+ case nk_kernel_angulars_symmetric_k:
331
+ *m = (m_t)&nk_angulars_symmetric_bf16_powervsx, *c = nk_cap_powervsx_k;
332
+ return;
333
+ case nk_kernel_euclideans_packed_k:
334
+ *m = (m_t)&nk_euclideans_packed_bf16_powervsx, *c = nk_cap_powervsx_k;
335
+ return;
336
+ case nk_kernel_euclideans_symmetric_k:
337
+ *m = (m_t)&nk_euclideans_symmetric_bf16_powervsx, *c = nk_cap_powervsx_k;
338
+ return;
339
+ default: break;
340
+ }
341
+ #endif
342
+ #if NK_TARGET_LOONGSONASX
343
+ if (v & nk_cap_loongsonasx_k) switch (k) {
344
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_bf16_loongsonasx, *c = nk_cap_loongsonasx_k; return;
345
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_bf16_loongsonasx, *c = nk_cap_loongsonasx_k; return;
346
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_bf16_loongsonasx, *c = nk_cap_loongsonasx_k; return;
347
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_bf16_loongsonasx, *c = nk_cap_loongsonasx_k; return;
348
+ case nk_kernel_dots_packed_size_k:
349
+ *m = (m_t)&nk_dots_packed_size_bf16_loongsonasx, *c = nk_cap_loongsonasx_k;
350
+ return;
351
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_bf16_loongsonasx, *c = nk_cap_loongsonasx_k; return;
352
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_bf16_loongsonasx, *c = nk_cap_loongsonasx_k; return;
353
+ case nk_kernel_dots_symmetric_k:
354
+ *m = (m_t)&nk_dots_symmetric_bf16_loongsonasx, *c = nk_cap_loongsonasx_k;
355
+ return;
356
+ case nk_kernel_angulars_packed_k:
357
+ *m = (m_t)&nk_angulars_packed_bf16_loongsonasx, *c = nk_cap_loongsonasx_k;
358
+ return;
359
+ case nk_kernel_angulars_symmetric_k:
360
+ *m = (m_t)&nk_angulars_symmetric_bf16_loongsonasx, *c = nk_cap_loongsonasx_k;
361
+ return;
362
+ case nk_kernel_euclideans_packed_k:
363
+ *m = (m_t)&nk_euclideans_packed_bf16_loongsonasx, *c = nk_cap_loongsonasx_k;
364
+ return;
365
+ case nk_kernel_euclideans_symmetric_k:
366
+ *m = (m_t)&nk_euclideans_symmetric_bf16_loongsonasx, *c = nk_cap_loongsonasx_k;
367
+ return;
368
+ default: break;
369
+ }
312
370
  #endif
313
371
  if (v & nk_cap_serial_k) switch (k) {
314
372
  case nk_kernel_dot_k: *m = (m_t)&nk_dot_bf16_serial, *c = nk_cap_serial_k; return;