whispercpp 1.3.1 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (797)
  1. checksums.yaml +4 -4
  2. data/.gitignore +4 -3
  3. data/README.md +92 -31
  4. data/Rakefile +26 -7
  5. data/ext/.gitignore +5 -7
  6. data/ext/dependencies.rb +61 -0
  7. data/ext/extconf.rb +21 -198
  8. data/ext/options.rb +221 -0
  9. data/ext/ruby_whisper.c +159 -0
  10. data/ext/ruby_whisper.h +17 -2
  11. data/ext/ruby_whisper_context.c +641 -0
  12. data/ext/ruby_whisper_error.c +52 -0
  13. data/ext/ruby_whisper_model.c +232 -0
  14. data/ext/ruby_whisper_params.c +1301 -0
  15. data/ext/ruby_whisper_segment.c +143 -0
  16. data/ext/ruby_whisper_transcribe.cpp +87 -0
  17. data/ext/ruby_whisper_vad_params.c +288 -0
  18. data/ext/sources/.dockerignore +3 -0
  19. data/ext/sources/.github/workflows/bindings-ruby.yml +21 -0
  20. data/ext/sources/CMakeGraphVizOptions.cmake +8 -0
  21. data/ext/sources/CMakeLists.txt +251 -0
  22. data/ext/sources/bindings/javascript/CMakeLists.txt +41 -0
  23. data/ext/sources/bindings/javascript/emscripten.cpp +93 -0
  24. data/ext/sources/bindings/javascript/libwhisper.worker.js +1 -0
  25. data/ext/sources/bindings/javascript/package-tmpl.json +26 -0
  26. data/ext/sources/bindings/javascript/package.json +26 -0
  27. data/ext/sources/bindings/javascript/whisper.js +19 -0
  28. data/ext/sources/build-xcframework.sh +547 -0
  29. data/ext/sources/ci/run.sh +336 -0
  30. data/ext/sources/close-issue.yml +28 -0
  31. data/ext/sources/cmake/DefaultTargetOptions.cmake +16 -0
  32. data/ext/sources/cmake/FindFFmpeg.cmake +163 -0
  33. data/ext/sources/cmake/build-info.cmake +60 -0
  34. data/ext/sources/cmake/git-vars.cmake +22 -0
  35. data/ext/sources/cmake/whisper-config.cmake.in +65 -0
  36. data/ext/sources/cmake/whisper.pc.in +10 -0
  37. data/ext/sources/examples/CMakeLists.txt +124 -0
  38. data/ext/sources/examples/addon.node/CMakeLists.txt +31 -0
  39. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +37 -0
  40. data/ext/sources/examples/addon.node/addon.cpp +438 -0
  41. data/ext/sources/examples/addon.node/index.js +54 -0
  42. data/ext/sources/examples/addon.node/package.json +16 -0
  43. data/ext/sources/examples/bench/CMakeLists.txt +8 -0
  44. data/ext/sources/examples/bench/bench.cpp +175 -0
  45. data/ext/sources/examples/bench.wasm/CMakeLists.txt +49 -0
  46. data/ext/sources/examples/bench.wasm/emscripten.cpp +87 -0
  47. data/ext/sources/examples/bench.wasm/index-tmpl.html +284 -0
  48. data/ext/sources/examples/cli/CMakeLists.txt +8 -0
  49. data/ext/sources/examples/cli/cli.cpp +1294 -0
  50. data/ext/sources/examples/coi-serviceworker.js +146 -0
  51. data/ext/sources/examples/command/CMakeLists.txt +10 -0
  52. data/ext/sources/examples/command/command.cpp +776 -0
  53. data/ext/sources/examples/command/commands.txt +9 -0
  54. data/ext/sources/examples/command.wasm/CMakeLists.txt +50 -0
  55. data/ext/sources/examples/command.wasm/emscripten.cpp +327 -0
  56. data/ext/sources/examples/command.wasm/index-tmpl.html +414 -0
  57. data/ext/sources/examples/common-ggml.cpp +238 -0
  58. data/ext/sources/examples/common-ggml.h +18 -0
  59. data/ext/sources/examples/common-sdl.cpp +227 -0
  60. data/ext/sources/examples/common-sdl.h +49 -0
  61. data/ext/sources/examples/common-whisper.cpp +168 -0
  62. data/ext/sources/examples/common-whisper.h +24 -0
  63. data/ext/sources/examples/common.cpp +675 -0
  64. data/ext/sources/examples/common.h +322 -0
  65. data/ext/sources/examples/deprecation-warning/CMakeLists.txt +6 -0
  66. data/ext/sources/examples/deprecation-warning/deprecation-warning.cpp +38 -0
  67. data/ext/sources/examples/ffmpeg-transcode.cpp +368 -0
  68. data/ext/sources/examples/generate-karaoke.sh +57 -0
  69. data/ext/sources/examples/grammar-parser.cpp +423 -0
  70. data/ext/sources/examples/grammar-parser.h +29 -0
  71. data/ext/sources/examples/helpers.js +191 -0
  72. data/ext/sources/examples/json.hpp +24596 -0
  73. data/ext/sources/examples/livestream.sh +112 -0
  74. data/ext/sources/examples/lsp/CMakeLists.txt +9 -0
  75. data/ext/sources/examples/lsp/lsp.cpp +467 -0
  76. data/ext/sources/examples/lsp/whisper.vim +362 -0
  77. data/ext/sources/examples/miniaudio.h +93468 -0
  78. data/ext/sources/examples/python/test_whisper_processor.py +7 -0
  79. data/ext/sources/examples/python/whisper_processor.py +54 -0
  80. data/ext/sources/examples/quantize/CMakeLists.txt +6 -0
  81. data/ext/sources/examples/quantize/quantize.cpp +223 -0
  82. data/ext/sources/examples/server/CMakeLists.txt +12 -0
  83. data/ext/sources/examples/server/bench.js +29 -0
  84. data/ext/sources/examples/server/httplib.h +10497 -0
  85. data/ext/sources/examples/server/server.cpp +1091 -0
  86. data/ext/sources/examples/server.py +115 -0
  87. data/ext/sources/examples/stb_vorbis.c +5584 -0
  88. data/ext/sources/examples/stream/CMakeLists.txt +10 -0
  89. data/ext/sources/examples/stream/stream.cpp +429 -0
  90. data/ext/sources/examples/stream.wasm/CMakeLists.txt +49 -0
  91. data/ext/sources/examples/stream.wasm/emscripten.cpp +216 -0
  92. data/ext/sources/examples/stream.wasm/index-tmpl.html +414 -0
  93. data/ext/sources/examples/sycl/CMakeLists.txt +9 -0
  94. data/ext/sources/examples/sycl/build.sh +22 -0
  95. data/ext/sources/examples/sycl/ls-sycl-device.cpp +11 -0
  96. data/ext/sources/examples/sycl/run-whisper.sh +17 -0
  97. data/ext/sources/examples/talk-llama/CMakeLists.txt +40 -0
  98. data/ext/sources/examples/talk-llama/eleven-labs.py +80 -0
  99. data/ext/sources/examples/talk-llama/llama-adapter.cpp +388 -0
  100. data/ext/sources/examples/talk-llama/llama-adapter.h +76 -0
  101. data/ext/sources/examples/talk-llama/llama-arch.cpp +1746 -0
  102. data/ext/sources/examples/talk-llama/llama-arch.h +437 -0
  103. data/ext/sources/examples/talk-llama/llama-batch.cpp +374 -0
  104. data/ext/sources/examples/talk-llama/llama-batch.h +89 -0
  105. data/ext/sources/examples/talk-llama/llama-chat.cpp +663 -0
  106. data/ext/sources/examples/talk-llama/llama-chat.h +58 -0
  107. data/ext/sources/examples/talk-llama/llama-context.cpp +2676 -0
  108. data/ext/sources/examples/talk-llama/llama-context.h +276 -0
  109. data/ext/sources/examples/talk-llama/llama-cparams.cpp +5 -0
  110. data/ext/sources/examples/talk-llama/llama-cparams.h +41 -0
  111. data/ext/sources/examples/talk-llama/llama-grammar.cpp +1229 -0
  112. data/ext/sources/examples/talk-llama/llama-grammar.h +173 -0
  113. data/ext/sources/examples/talk-llama/llama-graph.cpp +1618 -0
  114. data/ext/sources/examples/talk-llama/llama-graph.h +640 -0
  115. data/ext/sources/examples/talk-llama/llama-hparams.cpp +95 -0
  116. data/ext/sources/examples/talk-llama/llama-hparams.h +190 -0
  117. data/ext/sources/examples/talk-llama/llama-impl.cpp +167 -0
  118. data/ext/sources/examples/talk-llama/llama-impl.h +61 -0
  119. data/ext/sources/examples/talk-llama/llama-io.cpp +15 -0
  120. data/ext/sources/examples/talk-llama/llama-io.h +35 -0
  121. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2739 -0
  122. data/ext/sources/examples/talk-llama/llama-kv-cache.h +502 -0
  123. data/ext/sources/examples/talk-llama/llama-kv-cells.h +379 -0
  124. data/ext/sources/examples/talk-llama/llama-memory.cpp +1 -0
  125. data/ext/sources/examples/talk-llama/llama-memory.h +32 -0
  126. data/ext/sources/examples/talk-llama/llama-mmap.cpp +600 -0
  127. data/ext/sources/examples/talk-llama/llama-mmap.h +68 -0
  128. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +1138 -0
  129. data/ext/sources/examples/talk-llama/llama-model-loader.h +169 -0
  130. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +281 -0
  131. data/ext/sources/examples/talk-llama/llama-model-saver.h +37 -0
  132. data/ext/sources/examples/talk-llama/llama-model.cpp +13814 -0
  133. data/ext/sources/examples/talk-llama/llama-model.h +425 -0
  134. data/ext/sources/examples/talk-llama/llama-quant.cpp +966 -0
  135. data/ext/sources/examples/talk-llama/llama-quant.h +1 -0
  136. data/ext/sources/examples/talk-llama/llama-sampling.cpp +2575 -0
  137. data/ext/sources/examples/talk-llama/llama-sampling.h +32 -0
  138. data/ext/sources/examples/talk-llama/llama-vocab.cpp +3340 -0
  139. data/ext/sources/examples/talk-llama/llama-vocab.h +131 -0
  140. data/ext/sources/examples/talk-llama/llama.cpp +354 -0
  141. data/ext/sources/examples/talk-llama/llama.h +1377 -0
  142. data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +23 -0
  143. data/ext/sources/examples/talk-llama/speak +40 -0
  144. data/ext/sources/examples/talk-llama/speak.bat +1 -0
  145. data/ext/sources/examples/talk-llama/speak.ps1 +14 -0
  146. data/ext/sources/examples/talk-llama/talk-llama.cpp +808 -0
  147. data/ext/sources/examples/talk-llama/unicode-data.cpp +7034 -0
  148. data/ext/sources/examples/talk-llama/unicode-data.h +20 -0
  149. data/ext/sources/examples/talk-llama/unicode.cpp +849 -0
  150. data/ext/sources/examples/talk-llama/unicode.h +66 -0
  151. data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +8 -0
  152. data/ext/sources/examples/vad-speech-segments/speech.cpp +143 -0
  153. data/ext/sources/examples/wchess/CMakeLists.txt +10 -0
  154. data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +19 -0
  155. data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +803 -0
  156. data/ext/sources/examples/wchess/libwchess/Chessboard.h +33 -0
  157. data/ext/sources/examples/wchess/libwchess/WChess.cpp +193 -0
  158. data/ext/sources/examples/wchess/libwchess/WChess.h +63 -0
  159. data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +117 -0
  160. data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +8 -0
  161. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +249 -0
  162. data/ext/sources/examples/whisper.wasm/CMakeLists.txt +50 -0
  163. data/ext/sources/examples/whisper.wasm/emscripten.cpp +118 -0
  164. data/ext/sources/examples/whisper.wasm/index-tmpl.html +658 -0
  165. data/ext/sources/ggml/CMakeLists.txt +390 -0
  166. data/ext/sources/ggml/cmake/BuildTypes.cmake +54 -0
  167. data/ext/sources/ggml/cmake/GitVars.cmake +22 -0
  168. data/ext/sources/ggml/cmake/common.cmake +26 -0
  169. data/ext/sources/ggml/cmake/ggml-config.cmake.in +152 -0
  170. data/ext/{ggml → sources/ggml}/include/ggml-alloc.h +1 -1
  171. data/ext/{ggml → sources/ggml}/include/ggml-backend.h +9 -7
  172. data/ext/{ggml → sources/ggml}/include/ggml-cpp.h +2 -1
  173. data/ext/{ggml → sources/ggml}/include/ggml-cpu.h +9 -1
  174. data/ext/{ggml → sources/ggml}/include/ggml-metal.h +1 -1
  175. data/ext/{ggml → sources/ggml}/include/ggml-opt.h +49 -28
  176. data/ext/{ggml → sources/ggml}/include/ggml-rpc.h +6 -1
  177. data/ext/{ggml → sources/ggml}/include/ggml-vulkan.h +0 -2
  178. data/ext/{ggml → sources/ggml}/include/ggml.h +182 -265
  179. data/ext/sources/ggml/include/gguf.h +202 -0
  180. data/ext/sources/ggml/src/CMakeLists.txt +346 -0
  181. data/ext/{ggml → sources/ggml}/src/ggml-alloc.c +34 -29
  182. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  183. data/ext/{ggml → sources/ggml}/src/ggml-backend-impl.h +1 -2
  184. data/ext/{ggml → sources/ggml}/src/ggml-backend-reg.cpp +87 -53
  185. data/ext/{ggml → sources/ggml}/src/ggml-backend.cpp +26 -14
  186. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  187. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +74 -0
  188. data/ext/sources/ggml/src/ggml-cann/Doxyfile +2579 -0
  189. data/ext/{ggml → sources/ggml}/src/ggml-cann/acl_tensor.cpp +10 -4
  190. data/ext/{ggml → sources/ggml}/src/ggml-cann/acl_tensor.h +5 -5
  191. data/ext/{ggml → sources/ggml}/src/ggml-cann/aclnn_ops.cpp +1272 -1506
  192. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +1125 -0
  193. data/ext/{ggml → sources/ggml}/src/ggml-cann/common.h +135 -1
  194. data/ext/{ggml → sources/ggml}/src/ggml-cann/ggml-cann.cpp +564 -146
  195. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +30 -0
  196. data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/dup.cpp +3 -5
  197. data/ext/{ggml → sources/ggml}/src/ggml-common.h +12 -8
  198. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +504 -0
  199. data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/amx.cpp +2 -1
  200. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  201. data/ext/sources/ggml/src/ggml-cpu/binary-ops.h +16 -0
  202. data/ext/sources/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  203. data/ext/sources/ggml/src/ggml-cpu/common.h +72 -0
  204. data/ext/{ggml → sources/ggml}/src/ggml-cpu/cpu-feats-x86.cpp +5 -1
  205. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +6431 -0
  206. data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-impl.h +163 -41
  207. data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-quants.c +4029 -1117
  208. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +3510 -0
  209. data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu.cpp +67 -18
  210. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +337 -0
  211. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +95 -0
  212. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +482 -0
  213. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  214. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3544 -0
  215. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
  216. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +8903 -0
  217. data/ext/sources/ggml/src/ggml-cpu/ops.h +110 -0
  218. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  219. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  220. data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +28 -0
  221. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +252 -0
  222. data/ext/sources/ggml/src/ggml-cpu/vec.h +818 -0
  223. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +184 -0
  224. data/ext/sources/ggml/src/ggml-cuda/acc.cu +61 -0
  225. data/ext/sources/ggml/src/ggml-cuda/acc.cuh +5 -0
  226. data/ext/sources/ggml/src/ggml-cuda/arange.cu +34 -0
  227. data/ext/sources/ggml/src/ggml-cuda/arange.cuh +5 -0
  228. data/ext/sources/ggml/src/ggml-cuda/argmax.cu +91 -0
  229. data/ext/sources/ggml/src/ggml-cuda/argmax.cuh +3 -0
  230. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +104 -0
  231. data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +3 -0
  232. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +363 -0
  233. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +9 -0
  234. data/ext/sources/ggml/src/ggml-cuda/clamp.cu +45 -0
  235. data/ext/sources/ggml/src/ggml-cuda/clamp.cuh +5 -0
  236. data/ext/sources/ggml/src/ggml-cuda/common.cuh +828 -0
  237. data/ext/sources/ggml/src/ggml-cuda/concat.cu +221 -0
  238. data/ext/sources/ggml/src/ggml-cuda/concat.cuh +5 -0
  239. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +89 -0
  240. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cuh +5 -0
  241. data/ext/sources/ggml/src/ggml-cuda/convert.cu +730 -0
  242. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +26 -0
  243. data/ext/sources/ggml/src/ggml-cuda/count-equal.cu +64 -0
  244. data/ext/sources/ggml/src/ggml-cuda/count-equal.cuh +5 -0
  245. data/ext/sources/ggml/src/ggml-cuda/cp-async.cuh +57 -0
  246. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +705 -0
  247. data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +11 -0
  248. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +189 -0
  249. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cuh +7 -0
  250. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +103 -0
  251. data/ext/sources/ggml/src/ggml-cuda/diagmask.cu +40 -0
  252. data/ext/sources/ggml/src/ggml-cuda/diagmask.cuh +5 -0
  253. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +881 -0
  254. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +1471 -0
  255. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +357 -0
  256. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +3 -0
  257. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +365 -0
  258. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +3 -0
  259. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +482 -0
  260. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +472 -0
  261. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +634 -0
  262. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +3 -0
  263. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +346 -0
  264. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +3 -0
  265. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +275 -0
  266. data/ext/sources/ggml/src/ggml-cuda/getrows.cuh +15 -0
  267. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +3505 -0
  268. data/ext/sources/ggml/src/ggml-cuda/gla.cu +93 -0
  269. data/ext/sources/ggml/src/ggml-cuda/gla.cuh +3 -0
  270. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +103 -0
  271. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +5 -0
  272. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +396 -0
  273. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +324 -0
  274. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +3217 -0
  275. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +336 -0
  276. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +12 -0
  277. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +595 -0
  278. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +12 -0
  279. data/ext/sources/ggml/src/ggml-cuda/norm.cu +458 -0
  280. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +11 -0
  281. data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cu +78 -0
  282. data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cuh +5 -0
  283. data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +68 -0
  284. data/ext/sources/ggml/src/ggml-cuda/out-prod.cuh +3 -0
  285. data/ext/sources/ggml/src/ggml-cuda/pad.cu +49 -0
  286. data/ext/sources/ggml/src/ggml-cuda/pad.cuh +5 -0
  287. data/ext/sources/ggml/src/ggml-cuda/pool2d.cu +94 -0
  288. data/ext/sources/ggml/src/ggml-cuda/pool2d.cuh +5 -0
  289. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +190 -0
  290. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +27 -0
  291. data/ext/sources/ggml/src/ggml-cuda/rope.cu +456 -0
  292. data/ext/sources/ggml/src/ggml-cuda/rope.cuh +7 -0
  293. data/ext/sources/ggml/src/ggml-cuda/scale.cu +31 -0
  294. data/ext/sources/ggml/src/ggml-cuda/scale.cuh +5 -0
  295. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +283 -0
  296. data/ext/sources/ggml/src/ggml-cuda/softmax.cuh +7 -0
  297. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +148 -0
  298. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +3 -0
  299. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +153 -0
  300. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cuh +3 -0
  301. data/ext/sources/ggml/src/ggml-cuda/sum.cu +45 -0
  302. data/ext/sources/ggml/src/ggml-cuda/sum.cuh +5 -0
  303. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +39 -0
  304. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +5 -0
  305. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +5 -0
  306. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +10 -0
  307. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +10 -0
  308. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +10 -0
  309. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +10 -0
  310. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +5 -0
  311. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +10 -0
  312. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +10 -0
  313. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +10 -0
  314. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +10 -0
  315. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +5 -0
  316. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +10 -0
  317. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +10 -0
  318. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +10 -0
  319. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +10 -0
  320. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +10 -0
  321. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +10 -0
  322. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +10 -0
  323. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +10 -0
  324. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  325. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  326. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  327. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  328. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  329. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  330. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  331. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  332. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  333. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  334. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  335. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  336. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  337. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  338. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  339. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  340. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  341. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  342. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  343. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  344. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  345. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  346. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  347. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  348. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  349. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  350. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  351. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  352. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  353. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  354. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  355. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  356. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  357. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  358. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  359. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  360. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  361. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  362. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  363. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  364. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  365. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  366. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  367. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  368. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  369. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  370. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  371. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  372. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  373. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  374. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  375. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  376. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  377. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  378. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  379. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  380. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  381. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  382. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  383. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  384. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  385. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  386. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  387. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  388. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  389. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  390. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  391. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  392. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  393. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  394. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  395. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  396. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  397. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  398. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  399. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  400. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  401. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  402. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  403. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  404. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  405. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  406. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  407. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  408. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  409. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  410. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +78 -0
  411. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +5 -0
  412. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +5 -0
  413. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +5 -0
  414. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +5 -0
  415. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +5 -0
  416. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +5 -0
  417. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +5 -0
  418. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +5 -0
  419. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  420. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  421. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  422. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  423. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  424. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  425. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  426. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  427. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  428. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  429. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +47 -0
  430. data/ext/sources/ggml/src/ggml-cuda/tsembd.cuh +5 -0
  431. data/ext/sources/ggml/src/ggml-cuda/unary.cu +289 -0
  432. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +59 -0
  433. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +51 -0
  434. data/ext/sources/ggml/src/ggml-cuda/upscale.cuh +5 -0
  435. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +1135 -0
  436. data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/cuda.h +1 -0
  437. data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/hip.h +57 -0
  438. data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/musa.h +7 -1
  439. data/ext/sources/ggml/src/ggml-cuda/wkv.cu +199 -0
  440. data/ext/sources/ggml/src/ggml-cuda/wkv.cuh +7 -0
  441. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +131 -0
  442. data/ext/{ggml → sources/ggml}/src/ggml-impl.h +64 -19
  443. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  444. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +112 -0
  445. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +58 -0
  446. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +25 -0
  447. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +52 -0
  448. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +52 -0
  449. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +52 -0
  450. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +52 -0
  451. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +30 -0
  452. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +22 -0
  453. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +17 -0
  454. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +31 -0
  455. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +31 -0
  456. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +38 -0
  457. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +39 -0
  458. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +44 -0
  459. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +52 -0
  460. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +69 -0
  461. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +51 -0
  462. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +33 -0
  463. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +35 -0
  464. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +140 -0
  465. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +106 -0
  466. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +73 -0
  467. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +52 -0
  468. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +28 -0
  469. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +84 -0
  470. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +21 -0
  471. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +53 -0
  472. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +52 -0
  473. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +52 -0
  474. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +52 -0
  475. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +52 -0
  476. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +19 -0
  477. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +23 -0
  478. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +22 -0
  479. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +72 -0
  480. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +71 -0
  481. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +120 -0
  482. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +622 -0
  483. data/ext/{ggml → sources/ggml}/src/ggml-metal/ggml-metal.m +2178 -1064
  484. data/ext/{ggml → sources/ggml}/src/ggml-metal/ggml-metal.metal +1575 -1218
  485. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +113 -0
  486. data/ext/sources/ggml/src/ggml-musa/mudnn.cu +112 -0
  487. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +12 -0
  488. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +96 -0
  489. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +5124 -0
  490. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +83 -0
  491. data/ext/sources/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  492. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
  493. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +118 -0
  494. data/ext/sources/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  495. data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  496. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +62 -0
  497. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
  498. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
  499. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +163 -0
  500. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  501. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  502. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +79 -0
  503. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
  504. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  505. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  506. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  507. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  508. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  509. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  510. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  511. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  512. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  513. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  514. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k.cl +190 -0
  515. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +81 -0
  516. data/ext/sources/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  517. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +96 -0
  518. data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +721 -0
  519. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +16 -0
  520. data/ext/sources/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  521. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +87 -0
  522. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +87 -0
  523. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +86 -0
  524. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +86 -0
  525. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +84 -0
  526. data/ext/{ggml → sources/ggml}/src/ggml-opt.cpp +373 -190
  527. data/ext/{ggml → sources/ggml}/src/ggml-quants.c +114 -120
  528. data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  529. data/ext/{ggml → sources/ggml}/src/ggml-rpc/ggml-rpc.cpp +480 -73
  530. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +189 -0
  531. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +37 -0
  532. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +345 -0
  533. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  534. data/ext/{ggml → sources/ggml}/src/ggml-sycl/common.cpp +20 -32
  535. data/ext/sources/ggml/src/ggml-sycl/common.hpp +589 -0
  536. data/ext/{ggml → sources/ggml}/src/ggml-sycl/concat.cpp +32 -33
  537. data/ext/sources/ggml/src/ggml-sycl/concat.hpp +20 -0
  538. data/ext/{ggml → sources/ggml}/src/ggml-sycl/conv.cpp +4 -2
  539. data/ext/sources/ggml/src/ggml-sycl/conv.hpp +20 -0
  540. data/ext/{ggml → sources/ggml}/src/ggml-sycl/convert.cpp +104 -28
  541. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +34 -0
  542. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +700 -0
  543. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +11 -0
  544. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +791 -0
  545. data/ext/{ggml → sources/ggml}/src/ggml-sycl/dmmv.cpp +156 -17
  546. data/ext/sources/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  547. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +2957 -0
  548. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1511 -0
  549. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +75 -0
  550. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +99 -0
  551. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +309 -0
  552. data/ext/sources/ggml/src/ggml-sycl/getrows.hpp +20 -0
  553. data/ext/{ggml → sources/ggml}/src/ggml-sycl/ggml-sycl.cpp +1004 -1240
  554. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +106 -0
  555. data/ext/sources/ggml/src/ggml-sycl/gla.hpp +8 -0
  556. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +136 -0
  557. data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +21 -0
  558. data/ext/{ggml → sources/ggml}/src/ggml-sycl/mmq.cpp +0 -1
  559. data/ext/sources/ggml/src/ggml-sycl/mmq.hpp +33 -0
  560. data/ext/{ggml → sources/ggml}/src/ggml-sycl/mmvq.cpp +261 -166
  561. data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  562. data/ext/{ggml → sources/ggml}/src/ggml-sycl/norm.cpp +204 -81
  563. data/ext/sources/ggml/src/ggml-sycl/norm.hpp +26 -0
  564. data/ext/{ggml → sources/ggml}/src/ggml-sycl/outprod.cpp +8 -17
  565. data/ext/sources/ggml/src/ggml-sycl/outprod.hpp +10 -0
  566. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +74 -0
  567. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +83 -0
  568. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +361 -0
  569. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +20 -0
  570. data/ext/{ggml → sources/ggml}/src/ggml-sycl/softmax.cpp +35 -25
  571. data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +20 -0
  572. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  573. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  574. data/ext/{ggml → sources/ggml}/src/ggml-sycl/tsembd.cpp +3 -3
  575. data/ext/sources/ggml/src/ggml-sycl/tsembd.hpp +20 -0
  576. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +1215 -0
  577. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +293 -0
  578. data/ext/sources/ggml/src/ggml-sycl/wkv.hpp +10 -0
  579. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +196 -0
  580. data/ext/sources/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in +15 -0
  581. data/ext/{ggml → sources/ggml}/src/ggml-vulkan/ggml-vulkan.cpp +3130 -1087
  582. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +39 -0
  583. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +29 -0
  584. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +29 -0
  585. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +51 -0
  586. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +69 -0
  587. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +17 -0
  588. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +41 -0
  589. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +49 -0
  590. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +105 -0
  591. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +23 -0
  592. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +51 -0
  593. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +242 -0
  594. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +17 -0
  595. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +31 -0
  596. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +20 -0
  597. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +462 -0
  598. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +699 -0
  599. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.comp +13 -0
  600. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +42 -0
  601. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +35 -0
  602. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +44 -0
  603. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +43 -0
  604. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +48 -0
  605. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +39 -0
  606. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +49 -0
  607. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +32 -0
  608. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +34 -0
  609. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +34 -0
  610. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +42 -0
  611. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +30 -0
  612. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +32 -0
  613. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +68 -0
  614. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +34 -0
  615. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +35 -0
  616. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +70 -0
  617. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +33 -0
  618. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +31 -0
  619. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +34 -0
  620. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +27 -0
  621. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +337 -0
  622. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
  623. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
  624. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +267 -0
  625. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +59 -0
  626. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +25 -0
  627. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +23 -0
  628. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +64 -0
  629. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.comp +9 -0
  630. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp +76 -0
  631. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +33 -0
  632. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +41 -0
  633. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +66 -0
  634. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +100 -0
  635. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +41 -0
  636. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +22 -0
  637. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +27 -0
  638. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +48 -0
  639. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +169 -0
  640. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +118 -0
  641. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +82 -0
  642. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +79 -0
  643. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +90 -0
  644. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +87 -0
  645. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +87 -0
  646. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +90 -0
  647. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +88 -0
  648. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +118 -0
  649. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +154 -0
  650. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +130 -0
  651. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +132 -0
  652. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +136 -0
  653. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +167 -0
  654. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +130 -0
  655. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +868 -0
  656. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +441 -0
  657. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +442 -0
  658. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +99 -0
  659. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +44 -0
  660. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +42 -0
  661. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +28 -0
  662. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +74 -0
  663. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +77 -0
  664. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +21 -0
  665. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +26 -0
  666. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +37 -0
  667. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +52 -0
  668. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +55 -0
  669. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +58 -0
  670. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +60 -0
  671. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +43 -0
  672. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +43 -0
  673. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +47 -0
  674. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +24 -0
  675. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +20 -0
  676. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +22 -0
  677. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +26 -0
  678. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +17 -0
  679. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +173 -0
  680. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +50 -0
  681. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +17 -0
  682. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +29 -0
  683. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +37 -0
  684. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +20 -0
  685. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_bfloat16_support.comp +7 -0
  686. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp +7 -0
  687. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat_support.comp +7 -0
  688. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_integer_dot_support.comp +7 -0
  689. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +41 -0
  690. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +1373 -0
  691. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -0
  692. data/ext/{ggml → sources/ggml}/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +193 -35
  693. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp +87 -0
  694. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp +91 -0
  695. data/ext/{ggml → sources/ggml}/src/ggml.c +676 -1820
  696. data/ext/sources/ggml/src/gguf.cpp +1330 -0
  697. data/ext/{include → sources/include}/whisper.h +68 -2
  698. data/ext/sources/src/CMakeLists.txt +143 -0
  699. data/ext/{src → sources/src}/coreml/whisper-decoder-impl.h +27 -15
  700. data/ext/{src → sources/src}/coreml/whisper-decoder-impl.m +35 -10
  701. data/ext/{src → sources/src}/coreml/whisper-encoder-impl.h +21 -9
  702. data/ext/{src → sources/src}/coreml/whisper-encoder-impl.m +28 -3
  703. data/ext/sources/src/coreml/whisper-encoder.mm +73 -0
  704. data/ext/sources/src/whisper-arch.h +197 -0
  705. data/ext/{src → sources/src}/whisper.cpp +1905 -374
  706. data/ext/sources/tests/CMakeLists.txt +105 -0
  707. data/ext/sources/tests/earnings21/eval.mk +58 -0
  708. data/ext/sources/tests/earnings21/eval.py +68 -0
  709. data/ext/sources/tests/earnings21/normalizers/__init__.py +2 -0
  710. data/ext/sources/tests/earnings21/normalizers/basic.py +80 -0
  711. data/ext/sources/tests/earnings21/normalizers/english.json +1741 -0
  712. data/ext/sources/tests/earnings21/normalizers/english.py +550 -0
  713. data/ext/sources/tests/earnings21/requirements.txt +6 -0
  714. data/ext/sources/tests/en-0-ref.txt +1 -0
  715. data/ext/sources/tests/en-1-ref.txt +1 -0
  716. data/ext/sources/tests/en-2-ref.txt +1 -0
  717. data/ext/sources/tests/es-0-ref.txt +1 -0
  718. data/ext/sources/tests/librispeech/eval.mk +39 -0
  719. data/ext/sources/tests/librispeech/eval.py +47 -0
  720. data/ext/sources/tests/librispeech/normalizers/__init__.py +2 -0
  721. data/ext/sources/tests/librispeech/normalizers/basic.py +80 -0
  722. data/ext/sources/tests/librispeech/normalizers/english.json +1741 -0
  723. data/ext/sources/tests/librispeech/normalizers/english.py +550 -0
  724. data/ext/sources/tests/librispeech/requirements.txt +6 -0
  725. data/ext/sources/tests/run-tests.sh +130 -0
  726. data/ext/sources/tests/test-c.c +3 -0
  727. data/ext/sources/tests/test-vad-full.cpp +54 -0
  728. data/ext/sources/tests/test-vad.cpp +83 -0
  729. data/ext/sources/tests/test-whisper.js +58 -0
  730. data/extsources.rb +33 -5
  731. data/lib/whisper/model/uri.rb +149 -128
  732. data/sig/whisper.rbs +480 -0
  733. data/tests/helper.rb +28 -0
  734. data/tests/test_callback.rb +45 -3
  735. data/tests/test_error.rb +2 -2
  736. data/tests/test_model.rb +38 -0
  737. data/tests/test_package.rb +18 -3
  738. data/tests/test_params.rb +145 -8
  739. data/tests/test_segment.rb +10 -19
  740. data/tests/test_vad.rb +19 -0
  741. data/tests/test_vad_params.rb +103 -0
  742. data/tests/test_whisper.rb +37 -37
  743. data/whispercpp.gemspec +5 -4
  744. metadata +766 -111
  745. data/ext/cpu.mk +0 -9
  746. data/ext/examples/dr_wav.h +0 -8815
  747. data/ext/ggml/src/ggml-cann/aclnn_ops.h +0 -592
  748. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -4262
  749. data/ext/ggml/src/ggml-cpu/ggml-cpu.c +0 -14123
  750. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +0 -1884
  751. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +0 -14
  752. data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +0 -288
  753. data/ext/ggml/src/ggml-sycl/element_wise.cpp +0 -1030
  754. data/ext/ggml/src/ggml-sycl/im2col.cpp +0 -126
  755. data/ext/ggml/src/ggml-sycl/rope.cpp +0 -276
  756. data/ext/ggml/src/ggml-sycl/wkv6.cpp +0 -141
  757. data/ext/metal-embed.mk +0 -17
  758. data/ext/metal.mk +0 -6
  759. data/ext/ruby_whisper.cpp +0 -1909
  760. data/ext/scripts/get-flags.mk +0 -38
  761. data/lib/whisper.rb +0 -2
  762. /data/ext/{ggml → sources/ggml}/include/ggml-blas.h +0 -0
  763. /data/ext/{ggml → sources/ggml}/include/ggml-cann.h +0 -0
  764. /data/ext/{ggml → sources/ggml}/include/ggml-cuda.h +0 -0
  765. /data/ext/{ggml → sources/ggml}/include/ggml-kompute.h +0 -0
  766. /data/ext/{ggml → sources/ggml}/include/ggml-opencl.h +0 -0
  767. /data/ext/{ggml → sources/ggml}/include/ggml-sycl.h +0 -0
  768. /data/ext/{ggml → sources/ggml}/src/ggml-amx/common.h +0 -0
  769. /data/ext/{ggml → sources/ggml}/src/ggml-amx/ggml-amx.cpp +0 -0
  770. /data/ext/{ggml → sources/ggml}/src/ggml-amx/mmq.cpp +0 -0
  771. /data/ext/{ggml → sources/ggml}/src/ggml-amx/mmq.h +0 -0
  772. /data/ext/{ggml → sources/ggml}/src/ggml-blas/ggml-blas.cpp +0 -0
  773. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/ascendc_kernels.h +0 -0
  774. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_f16.cpp +0 -0
  775. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_f32.cpp +0 -0
  776. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -0
  777. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -0
  778. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -0
  779. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -0
  780. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -0
  781. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/amx.h +0 -0
  782. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/common.h +0 -0
  783. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/mmq.cpp +0 -0
  784. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/mmq.h +0 -0
  785. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-aarch64.h +0 -0
  786. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-hbm.cpp +0 -0
  787. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-hbm.h +0 -0
  788. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-quants.h +0 -0
  789. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-traits.cpp +0 -0
  790. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-traits.h +0 -0
  791. /data/ext/{ggml → sources/ggml}/src/ggml-kompute/ggml-kompute.cpp +0 -0
  792. /data/ext/{ggml → sources/ggml}/src/ggml-quants.h +0 -0
  793. /data/ext/{ggml → sources/ggml}/src/ggml-threading.cpp +0 -0
  794. /data/ext/{ggml → sources/ggml}/src/ggml-threading.h +0 -0
  795. /data/ext/{src → sources/src}/coreml/whisper-encoder.h +0 -0
  796. /data/ext/{src → sources/src}/openvino/whisper-openvino-encoder.cpp +0 -0
  797. /data/ext/{src → sources/src}/openvino/whisper-openvino-encoder.h +0 -0
@@ -0,0 +1,3217 @@
1
+ #pragma once
2
+
3
+ #include "common.cuh"
4
+ #include "vecdotq.cuh"
5
+ #include "mma.cuh"
6
+
7
+ #include <climits>
8
+ #include <cstdint>
9
+
10
+ using namespace ggml_cuda_mma;
11
+
12
+ #define MMQ_DP4A_MAX_BATCH_SIZE 64 // Max. batch size to use for dp4a MMQ kernels when FP16 tensor cores are available.
13
+ #define MMQ_ITER_K 256 // NOTE(review): not referenced in the visible part of this file; presumably the number of k values consumed per kernel iteration - confirm.
14
+ #define MMQ_NWARPS 8 // NOTE(review): presumably the warp count the nwarps template parameters are instantiated with - confirm.
15
+
16
+ typedef void (*load_tiles_mmq_t)(const char * __restrict__ x, int * x_tile, const int kbx0, const int i_max, const int stride); // Signature shared by the load_tiles_* functions.
17
+ typedef void (*vec_dot_mmq_t)(const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00); // Signature shared by the vec_dot_*_dp4a / vec_dot_*_mma functions.
18
+ typedef void (*mmq_write_back_t)(const float * __restrict__ sum, const int32_t * __restrict__ get_rows_to_sorted,
19
+ float * __restrict__ dst, const int stride, const int i_max, const int j_max); // NOTE(review): no matching function is visible in this part of the file.
20
+
21
// Selects which member of the scale/partial-sum union at the start of block_q8_1_mmq is populated
// (d4, ds4 or d2s6). The numeric values are the ones the compiler would assign implicitly.
enum mmq_q8_1_ds_layout {
    MMQ_Q8_1_DS_LAYOUT_D4   = 0, // float scales only; read as floats by vec_dot_q8_0_q8_1_mma
    MMQ_Q8_1_DS_LAYOUT_DS4  = 1, // half2 pairs; the low half is read as the scale
    MMQ_Q8_1_DS_LAYOUT_D2S6 = 2,
};
26
+
27
+ struct block_q8_1_mmq {
28
+ // The y float data is converted to a data layout that can simply be copied to shared memory as a contiguous block.
29
+ // The y float data is first grouped as blocks of 128 values.
30
+ // These blocks are then treated as individual data values and transposed.
31
+ //
32
+ // To avoid shared memory bank conflicts each block is padded with 16 bytes.
33
+ // This padding is also used to store block scales/partial sums.
34
+ // The scales multiplied with the quantized data are equal to the unquantized values.
35
+ // The partial sums are obtained by summing up a subgroup of the contained values (prior to quantization)
36
+ // and are only needed for performance reasons.
37
+ //
38
+ // The exact data stored depends on the x data type.
39
+ union {
40
+ float d4[4]; // 1 32 bit scale per 32 values, stored as d0,d1,d2,d3
41
+ half2 ds4[4]; // 1 16 bit scale + 1 16 bit partial sum per 32 values, stored as d0,s0,d1,s1,d2,s2,d3,s3
42
+ half d2s6[8]; // 1 16 bit scale per 64 values + 1 16 bit partial sum per 16 values for the first 96 values,
43
+ // stored as d0,d1,s0,s1,s2,s3,s4,s5 (2 scales + 6 partial sums = 8 halfs)
44
+ };
45
+ int8_t qs[4*QK8_1]; // 128 values quantized to 8 bit each
46
+ };
47
+ static_assert(sizeof(block_q8_1_mmq) == 4*QK8_1 + 4*sizeof(half2), "Unexpected block_q8_1_mmq size"); // quantized bytes + the 16 byte scale/sum union, no hidden padding
48
+ static_assert(sizeof(block_q8_1_mmq) == 4*sizeof(block_q8_1), "Unexpected block_q8_1_mmq size"); // same footprint as 4 regular q8_1 blocks
49
+
50
// Maps the data type of the x matrix to the scale/partial-sum layout that the q8_1 y blocks
// must be stored in. Aborts for types that have no entry here.
static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) {
    switch (type_x) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_IQ1_S:
            return MMQ_Q8_1_DS_LAYOUT_DS4;

        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ2_S:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ4_NL:
            return MMQ_Q8_1_DS_LAYOUT_D4;

        case GGML_TYPE_Q2_K:
            return MMQ_Q8_1_DS_LAYOUT_D2S6;

        default:
            GGML_ABORT("fatal error");
            break;
    }
}
85
+
86
// Section sizes of an x tile for the dp4a code path, in member order {qs, dm, sc}.
// qs is used as the offset (in 32 bit elements) at which the scale data starts (x_qs + txs.qs).
// NOTE(review): dm and sc are presumably the sizes of the scale/min and sub-scale sections;
// their consumers are not visible in this part of the file - confirm.
struct tile_x_sizes {
    int qs, dm, sc;
};
91
+
92
// Host-side upper bound for the mmq_x tile width for a device with compute capability cc.
//   - 128 if the new MMA instructions are available,
//   - otherwise, on NVIDIA builds whose highest compiled arch is >= Volta:
//     128 with GGML_CUDA_FORCE_MMQ, else MMQ_DP4A_MAX_BATCH_SIZE,
//   - 64 in all remaining cases.
static int get_mmq_x_max_host(const int cc) {
    if (new_mma_available(cc)) {
        return 128;
    }

    const bool volta_or_newer =
        GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
    if (!volta_or_newer) {
        return 64;
    }

#ifdef GGML_CUDA_FORCE_MMQ
    return 128;
#else
    return MMQ_DP4A_MAX_BATCH_SIZE;
#endif // GGML_CUDA_FORCE_MMQ
}
101
+
102
// Device-side (compile-time) upper bound for the mmq_x tile width.
// The branches are tried in order: new MMA -> 128, HIP on AMD -> 128,
// Volta or newer -> 128 with GGML_CUDA_FORCE_MMQ else MMQ_DP4A_MAX_BATCH_SIZE, anything else -> 64.
static constexpr __device__ int get_mmq_x_max_device() {
#if defined(NEW_MMA_AVAILABLE)
    return 128;
#elif defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
    return 128;
#elif __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#if defined(GGML_CUDA_FORCE_MMQ)
    return 128;
#else
    return MMQ_DP4A_MAX_BATCH_SIZE;
#endif // defined(GGML_CUDA_FORCE_MMQ)
#else
    return 64;
#endif // defined(NEW_MMA_AVAILABLE)
}
125
+
126
// Host-side mmq_y tile height for a device with compute capability cc:
// AMD -> 64 on RDNA1 and 128 otherwise; NVIDIA builds with a compiled arch >= Volta -> 128; else 64.
static int get_mmq_y_host(const int cc) {
    if (GGML_CUDA_CC_IS_AMD(cc)) {
        return GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128;
    }
    if (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) {
        return 128;
    }
    return 64;
}
130
+
131
// Device-side (compile-time) mmq_y tile height; mirrors the decision made in get_mmq_y_host
// using preprocessor symbols instead of a runtime compute capability.
static constexpr __device__ int get_mmq_y_device() {
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(RDNA1)
    return 64;
#elif defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
    return 128;
#elif __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
    return 128;
#else
    return 64;
#endif
}
146
+
147
+ #define MMQ_DP4A_TXS_Q4_0 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_0 + mmq_y/QI4_0, 0} // Each macro expands to tile_x_sizes{qs, dm, sc} and needs mmq_y in scope at the expansion site.
148
+ #define MMQ_DP4A_TXS_Q4_1 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_1 + mmq_y/QI4_1, 0}
149
+ #define MMQ_DP4A_TXS_Q8_0 tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE*2/QI8_0 + mmq_y/(QI8_0/2), 0}
150
+ #define MMQ_DP4A_TXS_Q8_0_16 tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE*4/QI8_0 + mmq_y/(QI8_0/4), 0}
151
+ #define MMQ_DP4A_TXS_Q8_1 tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE*2/QI8_1 + mmq_y/(QI8_1/2), 0}
152
+ #define MMQ_DP4A_TXS_Q2_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE + mmq_y, 0}
153
+ #define MMQ_DP4A_TXS_Q3_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y, mmq_y*WARP_SIZE/8 + mmq_y/8}
154
+ #define MMQ_DP4A_TXS_Q4_K tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_K, mmq_y*WARP_SIZE/8 + mmq_y/8}
155
+ #define MMQ_DP4A_TXS_Q5_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI5_K + mmq_y/QI5_K, mmq_y*WARP_SIZE/8 + mmq_y/8}
156
+ #define MMQ_DP4A_TXS_Q6_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI6_K + mmq_y/QI6_K, mmq_y*WARP_SIZE/8 + mmq_y/8}
157
+
158
// Returns the dp4a x tile section sizes for the given x data type and tile height mmq_y.
// Types that are converted to a q8_0/q8_1-like tile on load share the corresponding entry.
// Unknown types yield an all-zero tile_x_sizes.
static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml_type type, int mmq_y) {
    switch (type) {
        case GGML_TYPE_Q4_0:
            return MMQ_DP4A_TXS_Q4_0;
        case GGML_TYPE_Q4_1:
            return MMQ_DP4A_TXS_Q4_1;
        case GGML_TYPE_Q5_1:
            return MMQ_DP4A_TXS_Q8_1;
        case GGML_TYPE_Q2_K:
            return MMQ_DP4A_TXS_Q2_K;
        case GGML_TYPE_Q3_K:
            return MMQ_DP4A_TXS_Q3_K;
        case GGML_TYPE_Q4_K:
            return MMQ_DP4A_TXS_Q4_K;
        case GGML_TYPE_Q5_K:
            return MMQ_DP4A_TXS_Q5_K;
        case GGML_TYPE_Q6_K:
            return MMQ_DP4A_TXS_Q6_K;

        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ2_S:
            return MMQ_DP4A_TXS_Q8_0_16;

        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ1_S:
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ4_NL:
            return MMQ_DP4A_TXS_Q8_0;

        default:
            return tile_x_sizes{0, 0, 0};
    }
}
181
+
182
+ #define MMQ_MMA_TILE_X_K_Q8_0 (2*WARP_SIZE + 2*WARP_SIZE/QI8_0 + 4) // Row stride (in 32 bit elements) of the x tile on the MMA path: quantized values, then scales, then padding.
183
+ #define MMQ_MMA_TILE_X_K_Q8_1 (2*WARP_SIZE + 2*WARP_SIZE/QI8_0 + 4)
184
+ #define MMQ_MMA_TILE_X_K_Q2_K (2*WARP_SIZE + WARP_SIZE + 4)
185
+ #define MMQ_MMA_TILE_X_K_Q3_K (2*WARP_SIZE + WARP_SIZE/2 + 4)
186
+ #define MMQ_MMA_TILE_X_K_Q6_K (2*WARP_SIZE + WARP_SIZE/QI6_K + WARP_SIZE/8 + 7)
187
+
188
+ static_assert(MMQ_MMA_TILE_X_K_Q8_0 % 8 == 4, "Wrong padding."); // Every MMA row stride must be 4 mod 8. NOTE(review): presumably to avoid shared memory bank conflicts - confirm.
189
+ static_assert(MMQ_MMA_TILE_X_K_Q8_1 % 8 == 4, "Wrong padding.");
190
+ static_assert(MMQ_MMA_TILE_X_K_Q2_K % 8 == 4, "Wrong padding.");
191
+ static_assert(MMQ_MMA_TILE_X_K_Q3_K % 8 == 4, "Wrong padding.");
192
+ static_assert(MMQ_MMA_TILE_X_K_Q6_K % 8 == 4, "Wrong padding.");
193
+
194
// Returns the row stride of the x tile on the MMA code path for the given x data type,
// or 0 for types without an entry. Several types share the stride of the tile format
// they are converted to on load.
static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
    switch (type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ1_S:
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ4_NL:
            return MMQ_MMA_TILE_X_K_Q8_0;

        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
            return MMQ_MMA_TILE_X_K_Q8_1;

        case GGML_TYPE_Q2_K:
            return MMQ_MMA_TILE_X_K_Q2_K;

        case GGML_TYPE_Q3_K:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ2_S:
            return MMQ_MMA_TILE_X_K_Q3_K;

        case GGML_TYPE_Q6_K:
            return MMQ_MMA_TILE_X_K_Q6_K;

        default:
            return 0;
    }
}
217
+
218
+ #define MMQ_TILE_Y_K (WARP_SIZE + WARP_SIZE/QI8_1) // Stride (in 32 bit elements) between consecutive columns j of the y tile, as used in y_qs[j*MMQ_TILE_Y_K + ...].
219
+
220
// Host-side tile granularity: 16 when the new MMA instructions are available and mmq_x >= 48, else 8.
static int mmq_get_granularity_host(const int mmq_x, const int cc) {
    if (new_mma_available(cc) && mmq_x >= 48) {
        return 16;
    }
    return 8;
}
223
+
224
// Device-side (compile-time) counterpart of mmq_get_granularity_host.
#if defined(NEW_MMA_AVAILABLE)
static constexpr __device__ int mmq_get_granularity_device(const int mmq_x) {
    return mmq_x < 48 ? 8 : 16;
}
#else
// Without the new MMA instructions the granularity does not depend on mmq_x.
static constexpr __device__ int mmq_get_granularity_device(const int /* mmq_x */) {
    return 8;
}
#endif // defined(NEW_MMA_AVAILABLE)
233
+
234
+ // ------------------------------------------------------------
235
+
236
// Loads one tile of q4_0 blocks from x into x_tile.
//   x       raw q4_0 data, indexed as block_q4_0[kbx0 + i*stride + block]
//   x_tile  destination tile; quantized values first, followed by the float block scales
//   kbx0    index of the first block of the tile within a row
//   i_max   last valid row; rows beyond it are clamped to it when need_check is set
//   stride  number of blocks between consecutive rows of x
// On the MMA path the 4 bit values are unpacked to 8 bit and re-centered by subtracting 8;
// on the dp4a path the packed ints are stored unchanged.
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {

    int * x_qs = (int *) x_tile;
#ifdef NEW_MMA_AVAILABLE
    float * x_df = (float *) (x_qs + 2*WARP_SIZE);
#else
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_0, mmq_y);
    float * x_df = (float *) (x_qs + txs.qs);
#endif // NEW_MMA_AVAILABLE

    const block_q4_0 * x_blocks = (const block_q4_0 *) x + kbx0;

    // Part 1: quantized values. Each thread handles int kqsx of block kbx in its row.
    const int kbx  = threadIdx.x / QI4_0;
    const int kqsx = threadIdx.x % QI4_0;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        const int i_raw = i0 + threadIdx.y;
        const int i     = need_check ? min(i_raw, i_max) : i_raw;

        const block_q4_0 * bxi = x_blocks + i*stride + kbx;
        const int packed = get_int_b2(bxi->qs, kqsx);

#ifdef NEW_MMA_AVAILABLE
        int * dst = x_qs + i*MMQ_MMA_TILE_X_K_Q8_0 + kbx*(2*QI4_0) + kqsx;
        dst[0]     = __vsubss4((packed >> 0) & 0x0F0F0F0F, 0x08080808); // low nibbles
        dst[QI4_0] = __vsubss4((packed >> 4) & 0x0F0F0F0F, 0x08080808); // high nibbles
#else
        x_qs[i*(WARP_SIZE + 1) + threadIdx.x] = packed;
#endif // NEW_MMA_AVAILABLE
    }

    // Part 2: block scales. The threads of a warp are spread over QI4_0 rows.
    const int blocks_per_row = WARP_SIZE / QI4_0;
    const int kbxd = threadIdx.x % blocks_per_row;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
        const int i_raw = i0 + threadIdx.y * QI4_0 + threadIdx.x / blocks_per_row;
        const int i     = need_check ? min(i_raw, i_max) : i_raw;

        const block_q4_0 * bxi = x_blocks + i*stride + kbxd;

#ifdef NEW_MMA_AVAILABLE
        x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kbxd] = bxi->d;
#else
        x_df[i*(WARP_SIZE/QI4_0) + i/QI4_0 + kbxd] = bxi->d;
#endif // NEW_MMA_AVAILABLE
    }
}
290
+
291
// dp4a dot product of a q4_0 x tile (as written by load_tiles_q4_0) with a q8_1 y tile.
// Partial results are accumulated into sum; k00 is the k offset of this call within the x tile.
// The first 4 ints of every y column hold the half2 scale/sum pairs, the quantized values follow.
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q4_0_q8_1_dp4a(
    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {

    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_0, mmq_y);
    const int   * x_qs = (const int   *) x;
    const float * x_df = (const float *) x_qs + txs.qs;
    const int   * y_qs = (const int   *) y + 4;
    const half2 * y_ds = (const half2 *) y;

// #pragma unroll
    for (int k01 = 0; k01 < WARP_SIZE; k01 += QR4_0*VDR_Q4_0_Q8_1_MMQ) {
        const int k0 = k00 + k01;

        // Position of the matching y values; depends only on k01.
        const int kyqs = QI8_1 * ((k01/2) / (QI8_1/2)) + (k01/2) % (QI8_1/2);

#pragma unroll
        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
            const int j = j0 + threadIdx.y;

#pragma unroll
            for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
                const int i = i0 + threadIdx.x;

                const int * y_src = y_qs + j*MMQ_TILE_Y_K + kyqs;

                int u[2*VDR_Q4_0_Q8_1_MMQ];

#pragma unroll
                for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
                    u[2*l+0] = y_src[l];
                    u[2*l+1] = y_src[l + QI4_0];
                }

                sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
                    (&x_qs[i*(WARP_SIZE + 1) + k0/QR4_0], u,
                     x_df[i*(WARP_SIZE/QI4_0) + i/QI4_0 + k0/(QR4_0*QI4_0)], y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]);
            }
        }
    }
}
330
+
331
// Loads one tile of q4_1 blocks from x into x_tile.
// Same structure as the q4_0 loader, but each block contributes a half2 dm value instead of a float
// scale, and on the MMA path the nibbles are only unpacked (no offset is subtracted).
//   kbx0    index of the first block of the tile within a row
//   i_max   last valid row; rows beyond it are clamped to it when need_check is set
//   stride  number of blocks between consecutive rows of x
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {

    int * x_qs = (int *) x_tile;
#ifdef NEW_MMA_AVAILABLE
    half2 * x_dm = (half2 *) (x_qs + 2*WARP_SIZE);
#else
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_1, mmq_y);
    half2 * x_dm = (half2 *) (x_qs + txs.qs);
#endif // NEW_MMA_AVAILABLE

    const block_q4_1 * x_blocks = (const block_q4_1 *) x + kbx0;

    // Part 1: quantized values.
    const int kbx  = threadIdx.x / QI4_1;
    const int kqsx = threadIdx.x % QI4_1;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        const int i_raw = i0 + threadIdx.y;
        const int i     = need_check ? min(i_raw, i_max) : i_raw;

        const block_q4_1 * bxi = x_blocks + i*stride + kbx;
        const int packed = get_int_b4(bxi->qs, kqsx);

#ifdef NEW_MMA_AVAILABLE
        int * dst = x_qs + i*MMQ_MMA_TILE_X_K_Q8_1 + kbx*(2*QI4_1) + kqsx;
        dst[0]     = (packed >> 0) & 0x0F0F0F0F; // low nibbles
        dst[QI4_1] = (packed >> 4) & 0x0F0F0F0F; // high nibbles
#else
        x_qs[i*(WARP_SIZE + 1) + threadIdx.x] = packed;
#endif // NEW_MMA_AVAILABLE
    }

    // Part 2: per-block dm values.
    const int blocks_per_row = WARP_SIZE / QI4_1;
    const int kbxd = threadIdx.x % blocks_per_row;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
        const int i_raw = i0 + threadIdx.y * QI4_1 + threadIdx.x / blocks_per_row;
        const int i     = need_check ? min(i_raw, i_max) : i_raw;

        const block_q4_1 * bxi = x_blocks + i*stride + kbxd;

#ifdef NEW_MMA_AVAILABLE
        x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + kbxd] = bxi->dm;
#else
        x_dm[i*(WARP_SIZE/QI4_1) + i/QI4_1 + kbxd] = bxi->dm;
#endif // NEW_MMA_AVAILABLE
    }
}
385
+
386
// dp4a dot product of a q4_1 x tile (as written by load_tiles_q4_1) with a q8_1 y tile.
// Partial results are accumulated into sum; k00 is the k offset of this call within the x tile.
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q4_1_q8_1_dp4a(
    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {

    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_1, mmq_y);
    const int   * x_qs = (const int   *) x;
    const half2 * x_dm = (const half2 *) x_qs + txs.qs;
    const int   * y_qs = (const int   *) y + 4; // skip the 4 ints of scale/sum data at the start of a column
    const half2 * y_ds = (const half2 *) y;

// #pragma unroll
    for (int k01 = 0; k01 < WARP_SIZE; k01 += QR4_1*VDR_Q4_1_Q8_1_MMQ) {
        const int k0 = k00 + k01;

        // Position of the matching y values; depends only on k01.
        const int kyqs = QI8_1 * ((k01/2) / (QI8_1/2)) + (k01/2) % (QI8_1/2);

#pragma unroll
        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
            const int j = j0 + threadIdx.y;

#pragma unroll
            for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
                const int i = i0 + threadIdx.x;

                const int * y_src = y_qs + j*MMQ_TILE_Y_K + kyqs;

                int u[2*VDR_Q4_1_Q8_1_MMQ];

#pragma unroll
                for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
                    u[2*l+0] = y_src[l];
                    u[2*l+1] = y_src[l + QI4_1];
                }

                sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
                    (&x_qs[i*(WARP_SIZE + 1) + k0/QR4_1], u,
                     x_dm[i*(WARP_SIZE/QI4_1) + i/QI4_1 + k0/(QR4_1*QI4_1)], y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]);
            }
        }
    }
}
425
+
426
// Loads one tile of q5_0 blocks from x into x_tile, expanding the 5 bit values to 8 bit.
// For every int of packed low nibbles the matching high bits are taken from qh, merged in as
// bit 4 of each byte, and 16 is subtracted. The result uses the q8_0 tile layout on both paths.
//   kbx0    index of the first block of the tile within a row
//   i_max   last valid row; rows beyond it are clamped to it when need_check is set
//   stride  number of blocks between consecutive rows of x
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {

    int * x_qs = (int *) x_tile;
#ifdef NEW_MMA_AVAILABLE
    float * x_df = (float *) (x_qs + WARP_SIZE*2);
#else
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_0, mmq_y);
    float * x_df = (float *) (x_qs + txs.qs);
#endif // NEW_MMA_AVAILABLE

    const block_q5_0 * x_blocks = (const block_q5_0 *) x + kbx0;

    // Part 1: quantized values.
    const int kbx  = threadIdx.x / QI5_0;
    const int kqsx = threadIdx.x % QI5_0;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        const int i_raw = i0 + threadIdx.y;
        const int i     = need_check ? min(i_raw, i_max) : i_raw;

        const block_q5_0 * bxi = x_blocks + i*stride + kbx;

        const int ql = get_int_b2(bxi->qs, kqsx);
        const int qh = get_int_b2(bxi->qh, 0) >> (4 * kqsx); // high bits belonging to this int

        int v_lo = (ql >> 0) & 0x0F0F0F0F;
        v_lo |= (qh <<  4) & 0x00000010; // 0 -> 4
        v_lo |= (qh << 11) & 0x00001000; // 1 -> 12
        v_lo |= (qh << 18) & 0x00100000; // 2 -> 20
        v_lo |= (qh << 25) & 0x10000000; // 3 -> 28
        v_lo  = __vsubss4(v_lo, 0x10101010); // subtract 16

        int v_hi = (ql >> 4) & 0x0F0F0F0F;
        v_hi |= (qh >> 12) & 0x00000010; // 16 -> 4
        v_hi |= (qh >>  5) & 0x00001000; // 17 -> 12
        v_hi |= (qh <<  2) & 0x00100000; // 18 -> 20
        v_hi |= (qh <<  9) & 0x10000000; // 19 -> 28
        v_hi  = __vsubss4(v_hi, 0x10101010); // subtract 16

#ifdef NEW_MMA_AVAILABLE
        int * dst = x_qs + i*MMQ_MMA_TILE_X_K_Q8_0 + kbx*(2*QI5_0) + kqsx;
#else
        int * dst = x_qs + i*(2*WARP_SIZE + 1) + kbx*(2*QI5_0) + kqsx;
#endif // NEW_MMA_AVAILABLE
        dst[0]     = v_lo;
        dst[QI5_0] = v_hi;
    }

    // Part 2: block scales.
    const int blocks_per_row = WARP_SIZE / QI5_0;
    const int kbxd = threadIdx.x % blocks_per_row;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
        const int i_raw = i0 + threadIdx.y * QI5_0 + threadIdx.x / blocks_per_row;
        const int i     = need_check ? min(i_raw, i_max) : i_raw;

        const block_q5_0 * bxi = x_blocks + i*stride + kbxd;

#ifdef NEW_MMA_AVAILABLE
        x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kbxd] = bxi->d;
#else
        x_df[i*(WARP_SIZE/QI5_0) + i/QI5_0 + kbxd] = bxi->d;
#endif // NEW_MMA_AVAILABLE
    }
}
497
+
498
// Loads one tile of q5_1 blocks from x into x_tile, expanding the 5 bit values to 8 bit.
// Identical to the q5_0 loader except that no offset is subtracted from the values and the
// per-block half2 dm value is stored instead of a float scale. Uses the q8_1 tile layout.
//   kbx0    index of the first block of the tile within a row
//   i_max   last valid row; rows beyond it are clamped to it when need_check is set
//   stride  number of blocks between consecutive rows of x
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {

    int * x_qs = (int *) x_tile;
#ifdef NEW_MMA_AVAILABLE
    half2 * x_dm = (half2 *) (x_qs + 2*WARP_SIZE);
#else
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_1, mmq_y);
    half2 * x_dm = (half2 *) (x_qs + txs.qs);
#endif // NEW_MMA_AVAILABLE

    const block_q5_1 * x_blocks = (const block_q5_1 *) x + kbx0;

    // Part 1: quantized values.
    const int kbx  = threadIdx.x / QI5_1;
    const int kqsx = threadIdx.x % QI5_1;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        const int i_raw = i0 + threadIdx.y;
        const int i     = need_check ? min(i_raw, i_max) : i_raw;

        const block_q5_1 * bxi = x_blocks + i*stride + kbx;

        const int ql = get_int_b4(bxi->qs, kqsx);
        const int qh = get_int_b4(bxi->qh, 0) >> (4 * kqsx); // high bits belonging to this int

        int v_lo = (ql >> 0) & 0x0F0F0F0F;
        v_lo |= (qh <<  4) & 0x00000010; // 0 -> 4
        v_lo |= (qh << 11) & 0x00001000; // 1 -> 12
        v_lo |= (qh << 18) & 0x00100000; // 2 -> 20
        v_lo |= (qh << 25) & 0x10000000; // 3 -> 28

        int v_hi = (ql >> 4) & 0x0F0F0F0F;
        v_hi |= (qh >> 12) & 0x00000010; // 16 -> 4
        v_hi |= (qh >>  5) & 0x00001000; // 17 -> 12
        v_hi |= (qh <<  2) & 0x00100000; // 18 -> 20
        v_hi |= (qh <<  9) & 0x10000000; // 19 -> 28

#ifdef NEW_MMA_AVAILABLE
        int * dst = x_qs + i*MMQ_MMA_TILE_X_K_Q8_1 + kbx*(2*QI5_1) + kqsx;
#else
        int * dst = x_qs + i*(2*WARP_SIZE + 1) + kbx*(2*QI5_1) + kqsx;
#endif // NEW_MMA_AVAILABLE
        dst[0]     = v_lo;
        dst[QI5_1] = v_hi;
    }

    // Part 2: per-block dm values.
    const int blocks_per_row = WARP_SIZE / QI5_1;
    const int kbxd = threadIdx.x % blocks_per_row;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
        const int i_raw = i0 + threadIdx.y * QI5_1 + threadIdx.x / blocks_per_row;
        const int i     = need_check ? min(i_raw, i_max) : i_raw;

        const block_q5_1 * bxi = x_blocks + i*stride + kbxd;

#ifdef NEW_MMA_AVAILABLE
        x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + kbxd] = bxi->dm;
#else
        x_dm[i*(WARP_SIZE/QI5_1) + i/QI5_1 + kbxd] = bxi->dm;
#endif // NEW_MMA_AVAILABLE
    }
}
567
+
568
// Loads one tile of q8_0 blocks from x into x_tile.
// Every thread copies two ints per row: one from block kbx and one from the block
// WARP_SIZE/QI8_0 positions further, so a row of the tile holds 2*WARP_SIZE ints.
//   kbx0    index of the first block of the tile within a row
//   i_max   last valid row; rows beyond it are clamped to it when need_check is set
//   stride  number of blocks between consecutive rows of x
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {

    int * x_qs = (int *) x_tile;
#ifdef NEW_MMA_AVAILABLE
    float * x_df = (float *) (x_tile + 2*WARP_SIZE);
#else
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q8_0, mmq_y);
    float * x_df = (float *) (x_qs + txs.qs);
#endif // NEW_MMA_AVAILABLE

    const block_q8_0 * x_blocks = (const block_q8_0 *) x + kbx0;

    // Part 1: quantized values.
    const int kbx  = threadIdx.x / QI8_0;
    const int kqsx = threadIdx.x % QI8_0;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        const int i_raw = i0 + threadIdx.y;
        const int i     = need_check ? min(i_raw, i_max) : i_raw;

        const block_q8_0 * bxi = x_blocks + i*stride + kbx;

#ifdef NEW_MMA_AVAILABLE
        int * dst = x_qs + i*MMQ_MMA_TILE_X_K_Q8_0 + threadIdx.x;
#else
        int * dst = x_qs + i*(2*WARP_SIZE + 1) + threadIdx.x;
#endif // NEW_MMA_AVAILABLE
        dst[0]         = get_int_b2(bxi[0].qs,               kqsx);
        dst[WARP_SIZE] = get_int_b2(bxi[WARP_SIZE/QI8_0].qs, kqsx);
    }

    // Part 2: block scales. The threads of a warp are spread over QI8_0/2 rows.
    const int blocks_per_row = 2*WARP_SIZE / QI8_0;
    const int kbxd = threadIdx.x % blocks_per_row;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0/2) {
        const int i_raw = i0 + threadIdx.y * (QI8_0/2) + threadIdx.x / blocks_per_row;
        const int i     = need_check ? min(i_raw, i_max) : i_raw;

        const block_q8_0 * bxi = x_blocks + i*stride + kbxd;

#ifdef NEW_MMA_AVAILABLE
        x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kbxd] = bxi->d;
#else
        x_df[i*(2*WARP_SIZE/QI8_0) + i/(QI8_0/2) + kbxd] = bxi->d;
#endif // NEW_MMA_AVAILABLE
    }
}
622
+
623
// dp4a dot product of an x tile in q8_0 layout with a q8_1 y tile whose scales are stored as floats.
// Partial results are accumulated into sum; k00 is the k offset of this call within the x tile.
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q8_0_q8_1_dp4a(
    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {

    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q8_0, mmq_y);
    const int   * x_qs = (const int   *) x;
    const float * x_df = (const float *) x_qs + txs.qs;
    const int   * y_qs = (const int   *) y + 4; // skip the 4 floats of scale data at the start of a column
    const float * y_df = (const float *) y;

// #pragma unroll
    for (int k01 = 0; k01 < WARP_SIZE; k01 += VDR_Q8_0_Q8_1_MMQ) {
        const int k0 = k00 + k01;

#pragma unroll
        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
            const int j = j0 + threadIdx.y;

#pragma unroll
            for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
                const int i = i0 + threadIdx.x;

                const int * x_vals = &x_qs[i*(2*WARP_SIZE + 1) + k0];
                const int * y_vals = &y_qs[j*MMQ_TILE_Y_K + k0 % WARP_SIZE];

                sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_0_q8_1_impl<float, VDR_Q8_0_Q8_1_MMQ>
                    (x_vals, y_vals,
                     x_df[i*(2*WARP_SIZE/QI8_0) + i/(QI8_0/2) + k0/QI8_0],
                     y_df[j*MMQ_TILE_Y_K + (k0/QI8_1) % (WARP_SIZE/QI8_1)]);
            }
        }
    }
}
652
+
653
// Tensor core (mma) dot product of an x tile in q8_0 MMA layout with a q8_1 y tile.
// Each warp first loads its A tiles and the matching x scales for the whole k range of this call,
// then walks over the y columns, multiplies tile by tile and scales the integer results.
// ds_layout selects whether the y scales are read as floats (D4) or as the low half of a half2.
// k00 is the k offset of this call within the x tile.
template <int mmq_x, int mmq_y, int nwarps, mmq_q8_1_ds_layout ds_layout>
static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {

    typedef tile<16, 8, int> tile_A;
    typedef tile< 8, 8, int> tile_B;
    typedef tile<16, 8, int> tile_C;

    constexpr int granularity   = mmq_get_granularity_device(mmq_x);
    constexpr int rows_per_warp = 2 * granularity;
    constexpr int ntx           = rows_per_warp/tile_C::I; // Number of x minitiles per warp.

    // Warps that share the same x rows start at different groups of y columns.
    const int * y_warp = y + (threadIdx.y % ntx) * (tile_B::I*MMQ_TILE_Y_K);

    const int   * x_qs = (const int   *) x;
    const float * x_df = (const float *) x_qs + 2*WARP_SIZE;
    const int   * y_qs = (const int   *) y_warp + 4;
    const float * y_df = (const float *) y_warp;
    const half2 * y_ds = (const half2 *) y_warp;

    tile_A A[ntx][WARP_SIZE/QI8_0];
    float  dA[ntx][tile_C::ne/2][WARP_SIZE/QI8_0];

    const int i0 = (threadIdx.y/ntx)*rows_per_warp; // first x row handled by this warp

#pragma unroll
    for (int n = 0; n < ntx; ++n) {
        const int row0 = i0 + n*tile_A::I;

#pragma unroll
        for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_0) {
            const int k0 = k00 + k01;

            load_ldmatrix(A[n][k01/QI8_0], x_qs + row0*MMQ_MMA_TILE_X_K_Q8_0 + k0, MMQ_MMA_TILE_X_K_Q8_0);
        }

#pragma unroll
        for (int l = 0; l < tile_C::ne/2; ++l) {
            const int i = row0 + tile_C::get_i(2*l);

#pragma unroll
            for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_0) {
                const int k0 = k00 + k01;

                dA[n][l][k01/QI8_0] = x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + k0/QI8_0];
            }
        }
    }

#pragma unroll
    for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
#pragma unroll
        for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_0) {
            tile_B B;
            float  dB[tile_C::ne/2];

            load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K); // faster than load_ldmatrix

#pragma unroll
            for (int l = 0; l < tile_C::ne/2; ++l) {
                const int j = j0 + tile_C::get_j(l);

                if (ds_layout == MMQ_Q8_1_DS_LAYOUT_D4) {
                    dB[l] = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
                } else {
                    dB[l] = __low2float(y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]);
                }
            }

#pragma unroll
            for (int n = 0; n < ntx; ++n) {
                tile_C C;
                mma(C, A[n][k01/QI8_0], B);

                float * acc = sum + (j0/tile_C::J + n)*tile_C::ne;

#pragma unroll
                for (int l = 0; l < tile_C::ne; ++l) {
                    acc[l] += C.x[l]*dA[n][l/2][k01/QI8_0]*dB[l%2];
                }
            }
        }
    }
}
733
+
734
// dp4a dot product of an x tile in q8_1 layout (8 bit values plus half2 dm per block) with a q8_1 y tile.
// The tile sizes and index formulas are those of GGML_TYPE_Q5_1.
// Partial results are accumulated into sum; k00 is the k offset of this call within the x tile.
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a(
    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {

    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_1, mmq_y);
    const int   * x_qs = (const int   *) x;
    const half2 * x_dm = (const half2 *) x_qs + txs.qs;
    const int   * y_qs = (const int   *) y + 4; // skip the 4 ints of scale/sum data at the start of a column
    const half2 * y_ds = (const half2 *) y;

// #pragma unroll
    for (int k01 = 0; k01 < WARP_SIZE; k01 += VDR_Q8_0_Q8_1_MMQ) {
        const int k0 = k00 + k01;

#pragma unroll
        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
            const int j = j0 + threadIdx.y;

#pragma unroll
            for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
                const int i = i0 + threadIdx.x;

                const int * x_vals = &x_qs[i*(2*WARP_SIZE + 1) + k0];
                const int * y_vals = &y_qs[j*MMQ_TILE_Y_K + k01];

                sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
                    (x_vals, y_vals,
                     x_dm[i*(WARP_SIZE/QI5_1) + i/QI5_1 + k0/QI8_1], y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]);
            }
        }
    }
}
763
+
764
// Tensor core (mma) dot product of an x tile in q8_1 MMA layout with a q8_1 y tile.
// Both sides carry a half2 per block; after conversion to float2 the .x components scale the
// integer product and the product of the .y components is added on top.
// k00 is the k offset of this call within the x tile.
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma(
    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {

    typedef tile<16, 8, int> tile_A;
    typedef tile< 8, 8, int> tile_B;
    typedef tile<16, 8, int> tile_C;

    constexpr int granularity   = mmq_get_granularity_device(mmq_x);
    constexpr int rows_per_warp = 2 * granularity;
    constexpr int ntx           = rows_per_warp/tile_C::I; // Number of x minitiles per warp.

    // Warps that share the same x rows start at different groups of y columns.
    // tile_B is 8x8, so tile_B::J here equals the tile_B::I used by the q8_0 variant.
    const int * y_warp = y + (threadIdx.y % ntx) * (tile_B::J*MMQ_TILE_Y_K);

    const int   * x_qs = (const int   *) x;
    const half2 * x_dm = (const half2 *) x_qs + 2*WARP_SIZE;
    const int   * y_qs = (const int   *) y_warp + 4;
    const half2 * y_dm = (const half2 *) y_warp;

    tile_A A[ntx][WARP_SIZE/QI8_1];
    float2 dmA[ntx][tile_C::ne/2][WARP_SIZE/QI8_1];

    const int i0 = (threadIdx.y/ntx)*rows_per_warp; // first x row handled by this warp

#pragma unroll
    for (int n = 0; n < ntx; ++n) {
        const int row0 = i0 + n*tile_A::I;

#pragma unroll
        for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_1) {
            const int k0 = k00 + k01;

            load_ldmatrix(A[n][k01/QI8_1], x_qs + row0*MMQ_MMA_TILE_X_K_Q8_1 + k0, MMQ_MMA_TILE_X_K_Q8_1);
        }

#pragma unroll
        for (int l = 0; l < tile_C::ne/2; ++l) {
            const int i = row0 + tile_C::get_i(2*l);

#pragma unroll
            for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_1) {
                const int k0 = k00 + k01;

                dmA[n][l][k01/QI8_1] = __half22float2(x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + k0/QI8_1]);
            }
        }
    }

#pragma unroll
    for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
#pragma unroll
        for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_1) {
            tile_B B;
            float2 dsB[tile_C::ne/2];

            load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K); // faster than load_ldmatrix

#pragma unroll
            for (int l = 0; l < tile_C::ne/2; ++l) {
                const int j = j0 + tile_C::get_j(l);

                dsB[l] = __half22float2(y_dm[j*MMQ_TILE_Y_K + k01/QI8_1]);
            }

#pragma unroll
            for (int n = 0; n < ntx; ++n) {
                tile_C C;
                mma(C, A[n][k01/QI8_1], B);

                float * acc = sum + (j0/tile_C::J + n)*tile_C::ne;

#pragma unroll
                for (int l = 0; l < tile_C::ne; ++l) {
                    // Two separate accumulations, in this order, to keep the rounding unchanged.
                    acc[l] += dmA[n][l/2][k01/QI8_1].x*dsB[l%2].x*C.x[l];
                    acc[l] += dmA[n][l/2][k01/QI8_1].y*dsB[l%2].y;
                }
            }
        }
    }
}
840
+
841
// Tile dot product for the q8_0_16 x-tile layout against a q8_1 y tile (dp4a code path).
//
// x:   x tile; read as ints, with float scales starting MMQ_DP4A_TXS_Q8_0_16.qs ints into the buffer.
// y:   y tile with a row stride of MMQ_TILE_Y_K; the start of each row is read as floats,
//      the ints from offset 4 onwards are handed to the dot product as quantized values.
// sum: per-thread accumulators, indexed by (j0/nwarps, i0/WARP_SIZE).
// k00: offset added to the x-tile k index; the y tile is indexed with the local offset only.
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_dp4a(
    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {

    constexpr tile_x_sizes tile_sizes = MMQ_DP4A_TXS_Q8_0_16;

    const int   * xq = x;
    const float * xd = ((const float *) xq) + tile_sizes.qs;
    const int   * yq = y + 4;
    const float * yd = (const float *) y;

// #pragma unroll
    for (int k_loc = 0; k_loc < WARP_SIZE; k_loc += QI8_0) {
        const int k_abs = k00 + k_loc;

#pragma unroll
        for (int col0 = 0; col0 < mmq_x; col0 += nwarps) {
            const int col = col0 + threadIdx.y;

#pragma unroll
            for (int row0 = 0; row0 < mmq_y; row0 += WARP_SIZE) {
                const int row = row0 + threadIdx.x;

                // Operands of the per-thread dot product for this (row, col, k) position.
                const int   * x_vals   = &xq[row*(2*WARP_SIZE + 1) + k_abs];
                const int   * y_vals   = &yq[col*MMQ_TILE_Y_K + k_loc];
                const float * x_scales = &xd[row*(2*WARP_SIZE*2/QI8_0) + row/(QI8_0/4) + k_abs/(QI8_0/2)];

                sum[col0/nwarps*mmq_y/WARP_SIZE + row0/WARP_SIZE] += vec_dot_q8_0_16_q8_1_impl<QI8_0>(
                    x_vals, y_vals, x_scales, yd[col*MMQ_TILE_Y_K + k_loc/QI8_1]);
            }
        }
    }
}
872
+
873
// Tile dot product for the q8_0_16 x-tile layout against a q8_1 y tile using the tile/mma helpers.
// Expands to NO_DEVICE_CODE when NEW_MMA_AVAILABLE is not defined.
//
// x:   x tile; ints with a row stride of MMQ_MMA_TILE_X_K_Q3_K, float scales start 2*WARP_SIZE ints in.
// y:   y tile with a row stride of MMQ_TILE_Y_K; floats at the row start, ints from offset 4.
// sum: per-thread accumulators, tile_C::ne values per (j0/tile_C::J + n) minitile.
// k00: offset added to the x-tile k index.
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
#ifdef NEW_MMA_AVAILABLE

    using tile_A   = tile<16, 4, int>;
    using tile_A_8 = tile<16, 8, int>;
    using tile_B   = tile< 8, 4, int>;
    using tile_C   = tile<16, 8, int>;

    constexpr int granularity   = mmq_get_granularity_device(mmq_x);
    constexpr int rows_per_warp = 2 * granularity;
    constexpr int ntx           = rows_per_warp/tile_C::I; // Number of x minitiles per warp.

    // Shift the y tile pointer by a per-warp row offset selected by threadIdx.y % ntx.
    y += (threadIdx.y % ntx) * (tile_B::I*MMQ_TILE_Y_K);

    const int   * xq = x;
    const float * xd = ((const float *) xq) + WARP_SIZE*2;
    const int   * yq = y + 4;
    const float * yd = (const float *) y;

    const int row_base = (threadIdx.y / ntx) * (ntx*tile_A::I);

    tile_A A[ntx][8];
    float  dA[ntx][tile_C::ne/2][8];

    // Stage 1: load the x values and their scales for all minitiles of this warp.
#pragma unroll
    for (int n = 0; n < ntx; ++n) {
#pragma unroll
        for (int kk = 0; kk < WARP_SIZE; kk += 8) {
            const int k_abs = k00 + kk;

            // Two adjacent tile_A entries are filled at once through a tile_A_8 view.
            load_ldmatrix(((tile_A_8 *) A[n])[kk/8], xq + (row_base + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q3_K + k_abs, MMQ_MMA_TILE_X_K_Q3_K);
        }

#pragma unroll
        for (int l = 0; l < tile_C::ne/2; ++l) {
            const int row = row_base + n*tile_C::I + tile_C::get_i(2*l);

#pragma unroll
            for (int kk = 0; kk < WARP_SIZE; kk += 4) {
                const int k_abs = k00 + kk;

                dA[n][l][kk/4] = xd[row*MMQ_MMA_TILE_X_K_Q3_K + k_abs/4];
            }
        }
    }

    // Stage 2: multiply against the y tile and accumulate the scaled products.
#pragma unroll
    for (int col0 = 0; col0 < mmq_x; col0 += ntx*tile_C::J) {
#pragma unroll
        for (int kk = 0; kk < WARP_SIZE; kk += QR3_K*VDR_Q3_K_Q8_1_MMQ) {
            tile_B B[2];
            float  dB[tile_C::ne/2];

            // Here load_generic is faster than load_ldmatrix.
            load_generic(B[0], yq + col0*MMQ_TILE_Y_K + (kk + 0),         MMQ_TILE_Y_K);
            load_generic(B[1], yq + col0*MMQ_TILE_Y_K + (kk + tile_B::J), MMQ_TILE_Y_K);

#pragma unroll
            for (int l = 0; l < tile_C::ne/2; ++l) {
                const int col = col0 + tile_C::get_j(l);

                dB[l] = yd[col*MMQ_TILE_Y_K + kk/QI8_1];
            }

#pragma unroll
            for (int n = 0; n < ntx; ++n) {
                tile_C C[2];
                mma(C[0], A[n][kk/4 + 0], B[0]);
                mma(C[1], A[n][kk/4 + 1], B[1]);

#pragma unroll
                for (int l = 0; l < tile_C::ne; ++l) {
                    sum[(col0/tile_C::J + n)*tile_C::ne + l] += dB[l%2]*(C[0].x[l]*dA[n][l/2][kk/4 + 0] + C[1].x[l]*dA[n][l/2][kk/4 + 1]);
                }
            }
        }
    }
#else
    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum); GGML_UNUSED(k00);
    NO_DEVICE_CODE;
#endif // NEW_MMA_AVAILABLE
}
957
+
958
// Loads rows of block_q2_K data from x into the x tile.
//
// For every row handled by this thread the packed 2-bit values are split into QR2_K ints
// (each masked with 0x03030303) and written to the integer part of the tile; bxi->dm is
// multiplied component-wise by the low and high nibble of one scale byte and written to the
// half2 part of the tile.
//
// x:       source data, indexed as block_q2_K with offset kbx0 + i*stride.
// x_tile:  destination tile; the half2 part starts after the integer part.
// i_max:   upper clamp for the row index when need_check is set.
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {

    // The row strides of both tile parts depend on the code path, pick them once up front.
#ifdef NEW_MMA_AVAILABLE
    constexpr int qs_row_stride = MMQ_MMA_TILE_X_K_Q2_K;
    constexpr int dm_row_stride = MMQ_MMA_TILE_X_K_Q2_K;

    int   * tile_qs = x_tile;
    half2 * tile_dm = (half2 *) (tile_qs + 2*WARP_SIZE);
#else
    constexpr tile_x_sizes tile_sizes = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q2_K, mmq_y);
    constexpr int qs_row_stride = 2*WARP_SIZE + 1;
    constexpr int dm_row_stride = WARP_SIZE + 1;

    int   * tile_qs = x_tile;
    half2 * tile_dm = (half2 *) (tile_qs + tile_sizes.qs);
#endif // NEW_MMA_AVAILABLE

    const int kqsx = threadIdx.x % QI2_K;

#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps * WARP_SIZE/QI2_K) {
        int row = row0 + threadIdx.y*(WARP_SIZE/QI2_K) + threadIdx.x/QI2_K;

        if (need_check) {
            row = min(row, i_max);
        }

        const block_q2_K * blk = (const block_q2_K *) x + kbx0 + row*stride;

        const int packed = get_int_b2(blk->qs, kqsx);

#pragma unroll
        for (int l = 0; l < QR2_K; ++l) {
            const int k        = (kqsx/8)*32 + l*8 + kqsx % 8;
            const int unpacked = (packed >> (2*l)) & 0x03030303;

            tile_qs[row*qs_row_stride + k] = unpacked;
        }

        const int sc_m = blk->scales[kqsx];
#ifdef FAST_FP16_AVAILABLE
        const half2 dm_scaled = __hmul2(blk->dm, make_half2(sc_m & 0x0F, sc_m >> 4));
#else
        const float2 dm_f      = __half22float2(blk->dm);
        const half2  dm_scaled = make_half2(dm_f.x*(sc_m & 0x0F), dm_f.y*(sc_m >> 4));
#endif // FAST_FP16_AVAILABLE

        tile_dm[row*dm_row_stride + kqsx] = dm_scaled;
    }
}
1012
+
1013
// Tile dot product of a q2_K x tile against a q8_1 y tile (dp4a code path).
//
// x:   x tile; ints with a row stride of 2*WARP_SIZE + 1, the half2 part starts txs.qs
//      half2 elements after the tile start and uses a row stride of WARP_SIZE + 1.
// y:   y tile with a row stride of MMQ_TILE_Y_K; half2 values at the row start, ints from offset 4.
// sum: per-thread accumulators, indexed by (j0/nwarps, i0/WARP_SIZE).
// k00: offset added to the x-tile k index.
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a(
    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {

    constexpr tile_x_sizes tile_sizes = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q2_K, mmq_y);

    const int   * xq  = x;
    const half2 * xdm = ((const half2 *) xq) + tile_sizes.qs;
    const int   * yq  = y + 4;
    const half2 * yds = (const half2 *) y;

    // Convert the first half2 of each y row handled by this warp to float2 once.
    float2 y_scale[mmq_x/nwarps];
#pragma unroll
    for (int col0 = 0; col0 < mmq_x; col0 += nwarps) {
        const int col = col0 + threadIdx.y;

        y_scale[col0/nwarps] = __half22float2(yds[col*MMQ_TILE_Y_K]);
    }

    // First half of the k range, evaluated with ns == 2.
#pragma unroll
    for (int k_loc = 0; k_loc < WARP_SIZE/2; k_loc += QR2_K*VDR_Q2_K_Q8_1_MMQ) {
        const int k_abs = k00 + k_loc;

#pragma unroll
        for (int col0 = 0; col0 < mmq_x; col0 += nwarps) {
            const int col = col0 + threadIdx.y;

#pragma unroll
            for (int row0 = 0; row0 < mmq_y; row0 += WARP_SIZE) {
                const int row = row0 + threadIdx.x;

                constexpr int ns = 2;
                sum[col0/nwarps*mmq_y/WARP_SIZE + row0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq<ns>(
                    &xq[row*(2*WARP_SIZE + 1) + k_abs],
                    &yq[col*MMQ_TILE_Y_K + k_loc],
                    &xdm[row*(WARP_SIZE + 1) + k_abs/4],
                    k_loc < WARP_SIZE/2 ? y_scale[col0/nwarps].x : y_scale[col0/nwarps].y,
                    &yds[col*MMQ_TILE_Y_K + (1 + k_loc/QI8_1)]);
            }
        }
    }

    // Some compilers fail to unroll the loop over k01 if there is a conditional statement for ns in the inner loop.
    // As a workaround 2 separate loops are used instead.
    // Second half of the k range, evaluated with ns == 1.
#pragma unroll
    for (int k_loc = WARP_SIZE/2; k_loc < WARP_SIZE; k_loc += QR2_K*VDR_Q2_K_Q8_1_MMQ) {
        const int k_abs = k00 + k_loc;

#pragma unroll
        for (int col0 = 0; col0 < mmq_x; col0 += nwarps) {
            const int col = col0 + threadIdx.y;

#pragma unroll
            for (int row0 = 0; row0 < mmq_y; row0 += WARP_SIZE) {
                const int row = row0 + threadIdx.x;

                constexpr int ns = 1;
                sum[col0/nwarps*mmq_y/WARP_SIZE + row0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq<ns>(
                    &xq[row*(2*WARP_SIZE + 1) + k_abs],
                    &yq[col*MMQ_TILE_Y_K + k_loc],
                    &xdm[row*(WARP_SIZE + 1) + k_abs/4],
                    k_loc < WARP_SIZE/2 ? y_scale[col0/nwarps].x : y_scale[col0/nwarps].y,
                    &yds[col*MMQ_TILE_Y_K + (1 + k_loc/QI8_1)]);
            }
        }
    }
}
1075
+
1076
// Tile dot product of a q2_K x tile against a q8_1 y tile using the tile/mma helpers.
// Expands to NO_DEVICE_CODE when NEW_MMA_AVAILABLE is not defined.
//
// x:   x tile; ints with a row stride of MMQ_MMA_TILE_X_K_Q2_K, half2 (d, m) pairs start
//      2*WARP_SIZE half2 elements after the tile start.
// y:   y tile with a row stride of MMQ_TILE_Y_K; half2 values at the row start, ints from offset 4.
// sum: per-thread accumulators, tile_C::ne values per (j0/tile_C::J + n) minitile.
// k00: offset added to the x-tile k index.
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
#ifdef NEW_MMA_AVAILABLE

    using tile_A   = tile<16, 4, int>;
    using tile_A_8 = tile<16, 8, int>;
    using tile_B   = tile< 8, 4, int>;
    using tile_C   = tile<16, 8, int>;

    constexpr int granularity   = mmq_get_granularity_device(mmq_x);
    constexpr int rows_per_warp = 2 * granularity;
    constexpr int ntx           = rows_per_warp/tile_C::I; // Number of x minitiles per warp.

    // Shift the y tile pointer by a per-warp row offset selected by threadIdx.y % ntx.
    y += (threadIdx.y % ntx) * (tile_B::I*MMQ_TILE_Y_K);

    const int   * xq  = x;
    const half2 * xdm = ((const half2 *) xq) + WARP_SIZE*2;
    const int   * yq  = y + 4;
    const half2 * yds = (const half2 *) y;

    const int row_base = (threadIdx.y / ntx) * (ntx*tile_A::I);

    tile_A A[ntx][8];
    float  dA[ntx][tile_C::ne/2][8];
    float  mA[ntx][tile_C::ne/2][8];

    // Stage 1: load the x values and split the half2 pairs into dA (x component) and mA (y component).
#pragma unroll
    for (int n = 0; n < ntx; ++n) {
#pragma unroll
        for (int kk = 0; kk < WARP_SIZE; kk += QI8_1) {
            const int k_abs = k00 + kk;

            // Two adjacent tile_A entries are filled at once through a tile_A_8 view.
            load_ldmatrix(((tile_A_8 *) A[n])[kk/QI8_1], xq + (row_base + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q2_K + k_abs, MMQ_MMA_TILE_X_K_Q2_K);
        }

#pragma unroll
        for (int l = 0; l < tile_C::ne/2; ++l) {
            const int row = row_base + n*tile_C::I + tile_C::get_i(2*l);

#pragma unroll
            for (int kk = 0; kk < WARP_SIZE; kk += QI8_1/2) {
                const int k_abs = k00 + kk;

                const float2 dm = __half22float2(xdm[row*MMQ_MMA_TILE_X_K_Q2_K + k_abs/(QI8_1/2)]);

                dA[n][l][kk/(QI8_1/2)] = dm.x;
                mA[n][l][kk/(QI8_1/2)] = dm.y;
            }
        }
    }

#pragma unroll
    for (int col0 = 0; col0 < mmq_x; col0 += ntx*tile_C::J) {
        float2 dB[tile_C::ne/2];

#pragma unroll
        for (int l = 0; l < tile_C::ne/2; ++l) {
            const int col = col0 + tile_C::get_j(l);

            dB[l] = __half22float2(yds[col*MMQ_TILE_Y_K]);
        }

        // Stage 2: products of x and y values, scaled by dA and dB.
#pragma unroll
        for (int kk = 0; kk < WARP_SIZE; kk += QI8_1) {
            // For the last quarter of the k range the mA term is computed with an mma against all-ones.
            const bool last_quarter = kk >= WARP_SIZE * 3/4;

            tile_B B[2];

            // Here load_generic is faster than load_ldmatrix.
            load_generic(B[0], yq + col0*MMQ_TILE_Y_K + (kk + 0),         MMQ_TILE_Y_K);
            load_generic(B[1], yq + col0*MMQ_TILE_Y_K + (kk + tile_B::J), MMQ_TILE_Y_K);

            tile_C Cm[2];
            if (last_quarter) {
                tile_A ones;
                ones.x[0] = 0x01010101;
                ones.x[1] = 0x01010101;
                mma(Cm[0], ones, B[0]);
                mma(Cm[1], ones, B[1]);
            }

#pragma unroll
            for (int n = 0; n < ntx; ++n) {
                tile_C Cd[2];

                mma(Cd[0], A[n][kk/4 + 0], B[0]);
                mma(Cd[1], A[n][kk/4 + 1], B[1]);

#pragma unroll
                for (int l = 0; l < tile_C::ne; ++l) {
                    float tmp = Cd[0].x[l]*dA[n][l/2][kk/4 + 0] + Cd[1].x[l]*dA[n][l/2][kk/4 + 1];
                    if (last_quarter) {
                        tmp -= Cm[0].x[l]*mA[n][l/2][kk/4 + 0] + Cm[1].x[l]*mA[n][l/2][kk/4 + 1];
                    }
                    sum[(col0/tile_C::J + n)*tile_C::ne + l] += tmp*(kk < WARP_SIZE/2 ? dB[l%2].x : dB[l%2].y);
                }
            }
        }

        // Stage 3: for the first three quarters of the k range the mA term is subtracted
        // using the half2 values stored after the first half2 of each y row.
#pragma unroll
        for (int kk = 0; kk < WARP_SIZE * 3/4; kk += QI8_1) {
            float2 sB[tile_C::ne/2];

#pragma unroll
            for (int l = 0; l < tile_C::ne/2; ++l) {
                const int col = col0 + tile_C::get_j(l);

                sB[l] = __half22float2(yds[col*MMQ_TILE_Y_K + (1 + kk/QI8_1)]);
            }

#pragma unroll
            for (int n = 0; n < ntx; ++n) {
#pragma unroll
                for (int l = 0; l < tile_C::ne; ++l) {
                    sum[(col0/tile_C::J + n)*tile_C::ne + l] -= mA[n][l/2][kk/4 + 0]*sB[l%2].x;
                    sum[(col0/tile_C::J + n)*tile_C::ne + l] -= mA[n][l/2][kk/4 + 1]*sB[l%2].y;
                }
            }
        }
    }
#else
    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum); GGML_UNUSED(k00);
    NO_DEVICE_CODE;
#endif // NEW_MMA_AVAILABLE
}
1203
+
1204
// Loads rows of block_q3_K data from x into the x tile.
//
// Pass 1 combines the 2 low bits (bxi->qs) and the high bit (bxi->hmask) of each value,
// subtracts 4 per byte with __vsubss4 and stores the result in the integer part of the tile.
// Pass 2 assembles the scales from a low 4 bit and a high 2 bit part and subtracts 32 per byte.
// With NEW_MMA_AVAILABLE the scales are multiplied by bxi->d and stored as floats, otherwise
// the packed scales are stored as ints and pass 3 stores bxi->d per row.
//
// x:       source data, indexed as block_q3_K with offset kbx0 + i*stride.
// x_tile:  destination tile.
// i_max:   upper clamp for the row index when need_check is set.
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {

#ifdef NEW_MMA_AVAILABLE
    constexpr int qs_row_stride = MMQ_MMA_TILE_X_K_Q3_K;

    int   * tile_qs = x_tile;
    float * tile_df = (float *) (tile_qs + WARP_SIZE*2);
#else
    constexpr tile_x_sizes tile_sizes = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q3_K, mmq_y);
    constexpr int qs_row_stride = 2*WARP_SIZE + 1;

    int   * tile_qs = x_tile;
    float * tile_df = (float *) (tile_qs + tile_sizes.qs);
    int   * tile_sc = (int   *) (tile_df + tile_sizes.dm);
#endif // NEW_MMA_AVAILABLE

    const int kqsx = threadIdx.x % QI3_K;

    // Pass 1: quantized values.
#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps * WARP_SIZE/QI3_K) {
        int row = row0 + threadIdx.y * (WARP_SIZE/QI3_K) + threadIdx.x / QI3_K;

        if (need_check) {
            row = min(row, i_max);
        }

        const block_q3_K * blk = (const block_q3_K *) x + kbx0 + row*stride;

        const int low_bits  = get_int_b2(blk->qs,    kqsx);
        const int high_bits = get_int_b2(blk->hmask, kqsx % (QI3_K/2)) >> (4 * (kqsx / (QI3_K/2)));

#pragma unroll
        for (int l = 0; l < QR3_K; ++l) {
            const int k = (kqsx/8)*32 + l*8 + kqsx % 8;

            const int low_k  = (low_bits >> (2*l))        & 0x03030303;
            const int high_k = ((high_bits >> l) << 2)    & 0x04040404;

            tile_qs[row*qs_row_stride + k] = __vsubss4(low_k | high_k, 0x04040404);
        }
    }

    // Pass 2: scales.
#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps*8) {
        int row = row0 + threadIdx.y*8 + threadIdx.x/(WARP_SIZE/8);

        if (need_check) {
            row = min(row, i_max);
        }

        const block_q3_K * blk = (const block_q3_K *) x + kbx0 + row*stride;

        const int ksc = threadIdx.x % (WARP_SIZE/8);

        // Lower 4 bits of the scales.
        const int ksc_low   = ksc % (QI3_K/8);
        const int shift_low = 4 * (ksc / (QI3_K/8));
        const int sc_low    = (get_int_b2(blk->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;

        // Upper 2 bits of the scales, moved up to bits 4 and 5 of each byte.
        const int ksc_high   = QI3_K/8;
        const int shift_high = 2 * ksc;
        const int sc_high    = ((get_int_b2(blk->scales, ksc_high) >> shift_high) << 4) & 0x30303030;

        const int sc = __vsubss4(sc_low | sc_high, 0x20202020);

#ifdef NEW_MMA_AVAILABLE
        const int8_t * sc_bytes = (const int8_t *) &sc;
        const float    d        = blk->d;

#pragma unroll
        for (int l = 0; l < int(sizeof(int)); ++l) {
            tile_df[row*MMQ_MMA_TILE_X_K_Q3_K + sizeof(int)*ksc + l] = d*sc_bytes[l];
        }
#else
        tile_sc[row*(WARP_SIZE/8) + row/8 + threadIdx.x % (WARP_SIZE/8)] = sc;
#endif // NEW_MMA_AVAILABLE
    }

#ifndef NEW_MMA_AVAILABLE
    // Pass 3: one d value per row.
#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps*WARP_SIZE) {
        int row = (row0 + threadIdx.y*WARP_SIZE + threadIdx.x) % mmq_y;

        if (need_check) {
            row = min(row, i_max);
        }

        const block_q3_K * blk = (const block_q3_K *) x + kbx0 + row*stride;

        tile_df[row] = blk->d;
    }
#endif // NEW_MMA_AVAILABLE
}
1299
+
1300
// Tile dot product of a q3_K x tile against a q8_1 y tile (dp4a code path).
//
// x:   x tile; ints with a row stride of 2*WARP_SIZE + 1, followed by floats at offset txs.qs
//      and packed int8 scales at a further offset of txs.dm.
// y:   y tile with a row stride of MMQ_TILE_Y_K; floats at the row start, ints from offset 4.
// sum: per-thread accumulators, indexed by (j0/nwarps, i0/WARP_SIZE).
// k00: offset added to the x-tile k index.
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q3_K_q8_1_dp4a(
    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {

    constexpr tile_x_sizes tile_sizes = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q3_K, mmq_y);

    const int   * xq  = x;
    const float * xd  = ((const float *) xq) + tile_sizes.qs;
    const int   * xsc = ((const int   *) xd) + tile_sizes.dm;
    const int   * yq  = y + 4;
    const float * yd  = (const float *) y;

// #pragma unroll
    for (int k_loc = 0; k_loc < WARP_SIZE; k_loc += QR3_K*VDR_Q3_K_Q8_1_MMQ) {
        const int k_abs = k00 + k_loc;

#pragma unroll
        for (int col0 = 0; col0 < mmq_x; col0 += nwarps) {
            const int col = col0 + threadIdx.y;

#pragma unroll
            for (int row0 = 0; row0 < mmq_y; row0 += WARP_SIZE) {
                const int row = row0 + threadIdx.x;

                // Byte-wise view of this row's packed scales, advanced by k_abs/4 bytes.
                const int8_t * row_scales = ((const int8_t *) (xsc + row*(WARP_SIZE/8) + row/8)) + k_abs/4;

                sum[col0/nwarps*mmq_y/WARP_SIZE + row0/WARP_SIZE] += vec_dot_q3_K_q8_1_impl_mmq(
                    &xq[row*(2*WARP_SIZE + 1) + k_abs],
                    &yq[col*MMQ_TILE_Y_K + k_loc],
                    row_scales,
                    xd[row],
                    yd[col*MMQ_TILE_Y_K + k_loc/QI8_1]);
            }
        }
    }
}
1332
+
1333
// Unpacks four 6-bit values from the packed scales and returns them as the 4 bytes of an int.
//
// Result by ksc:
//   - ksc == 0: sc0, sc1, sc2, sc3
//   - ksc == 1: sc4, sc5, sc6, sc7
//   - ksc == 2:  m0,  m1,  m2,  m3
//   - ksc == 3:  m4,  m5,  m6,  m7
static __device__ __forceinline__ int unpack_scales_q45_K(const int * scales, const int ksc) {
    // Lower 4 bits: source int and the nibble to take from each of its bytes.
    const int low_index = (ksc % 2) + (ksc != 0 ? 1 : 0);
    const int low_shift = 4 * (ksc & (ksc/2));
    const int low_bits  = (scales[low_index] >> low_shift) & 0x0F0F0F0F;

    // Upper 2 bits: end up in bits 4 and 5 of each byte.
    const int high_index = ksc/2;
    const int high_shift = 2 * (ksc % 2);
    const int high_bits  = (scales[high_index] >> high_shift) & 0x30303030;

    return low_bits | high_bits;
}
1342
+
1343
// Loads rows of block_q4_K data from x into the x tile.
//
// With NEW_MMA_AVAILABLE the low and high nibbles of bxi->qs are stored as separate ints and
// bxi->dm * (1, -1) is multiplied with every unpacked (scale, min) byte pair and stored as half2.
// Otherwise the packed nibbles, bxi->dm per row and the unpacked scale ints are stored as-is.
//
// x:       source data, indexed as block_q4_K with offset kbx0 + i*stride.
// x_tile:  destination tile.
// i_max:   upper clamp for the row index when need_check is set.
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {

#ifdef NEW_MMA_AVAILABLE
    int   * tile_qs = x_tile;
    half2 * tile_dm = (half2 *) (tile_qs + 2*WARP_SIZE);
#else
    constexpr tile_x_sizes tile_sizes = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_K, mmq_y);
    int   * tile_qs = x_tile;
    half2 * tile_dm = (half2 *) (tile_qs + tile_sizes.qs);
    int   * tile_sc = (int   *) (tile_dm + tile_sizes.dm);
#endif // NEW_MMA_AVAILABLE

    // Quantized values: one int of packed nibbles per thread and row.
#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps) {
        int row = row0 + threadIdx.y;

        if (need_check) {
            row = min(row, i_max);
        }

        const block_q4_K * blk    = (const block_q4_K *) x + kbx0 + row*stride;
        const int          packed = get_int_b4(blk->qs, threadIdx.x);

#ifdef NEW_MMA_AVAILABLE
        tile_qs[row*MMQ_MMA_TILE_X_K_Q8_1 + 16*(threadIdx.x/8) + threadIdx.x % 8 + 0] = (packed >> 0) & 0x0F0F0F0F;
        tile_qs[row*MMQ_MMA_TILE_X_K_Q8_1 + 16*(threadIdx.x/8) + threadIdx.x % 8 + 8] = (packed >> 4) & 0x0F0F0F0F;
#else
        tile_qs[row*(WARP_SIZE + 1) + threadIdx.x] = packed;
#endif // NEW_MMA_AVAILABLE
    }

#ifdef NEW_MMA_AVAILABLE

    // Scales and mins, pre-multiplied with dm.
#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps*16) {
        int row = (row0 + threadIdx.y*16 + threadIdx.x/(WARP_SIZE/16)) % mmq_y;

        if (need_check) {
            row = min(row, i_max);
        }

        const block_q4_K * blk = (const block_q4_K *) x + kbx0 + row*stride;

        const int * packed_scales = (const int *) blk->scales;
        const int   ksc           = threadIdx.x % (WARP_SIZE/16);

        const int sc32 = unpack_scales_q45_K(packed_scales, ksc + 0);
        const int  m32 = unpack_scales_q45_K(packed_scales, ksc + 2);

        const uint8_t * sc_bytes = (const uint8_t *) &sc32;
        const uint8_t *  m_bytes = (const uint8_t *)  &m32;

        // The second component is negated before it is combined with the mins.
        const half2 dm = blk->dm * make_half2(1.0f, -1.0f);

#pragma unroll
        for (int l = 0; l < int(sizeof(int)); ++l) {
            tile_dm[row*MMQ_MMA_TILE_X_K_Q8_1 + sizeof(int)*ksc + l] = dm*make_half2(sc_bytes[l], m_bytes[l]);
        }
    }

#else

    // One dm value per row.
#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps*QI4_K) {
        int row = (row0 + threadIdx.y*QI4_K + threadIdx.x) % mmq_y;

        if (need_check) {
            row = min(row, i_max);
        }

        const block_q4_K * blk = (const block_q4_K *) x + kbx0 + row*stride;

        tile_dm[row] = blk->dm;
    }

    // Unpacked scales and mins as ints.
#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps * 8) {
        int row = (row0 + threadIdx.y * 8 + threadIdx.x / (WARP_SIZE/8)) % mmq_y;

        if (need_check) {
            row = min(row, i_max);
        }

        const block_q4_K * blk = (const block_q4_K *) x + kbx0 + row*stride + (threadIdx.x % (WARP_SIZE/8)) / (QI4_K/8);

        const int * packed_scales = (const int *) blk->scales;

        const int ksc      = threadIdx.x % (WARP_SIZE/8);
        const int unpacked = unpack_scales_q45_K(packed_scales, ksc);

        tile_sc[row*(WARP_SIZE/8) + row/8 + ksc] = unpacked;
    }
#endif // NEW_MMA_AVAILABLE
}
1438
+
1439
// Tile dot product of a q4_K x tile against a q8_1 y tile (dp4a code path).
//
// x:   x tile; ints with a row stride of WARP_SIZE + 1, one half2 per row at offset txs.qs
//      and unpacked scale ints at a further offset of txs.dm.
// y:   y tile with a row stride of MMQ_TILE_Y_K; half2 values at the row start, ints from offset 4.
// sum: per-thread accumulators, indexed by (j0/nwarps, i0/WARP_SIZE).
// k00: offset added to the x-tile k index.
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q4_K_q8_1_dp4a(
    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {

    constexpr tile_x_sizes tile_sizes = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_K, mmq_y);

    const int   * xq  = x;
    const half2 * xdm = ((const half2 *) xq)  + tile_sizes.qs;
    const int   * xsc = ((const int   *) xdm) + tile_sizes.dm;
    const int   * yq  = y + 4;
    const half2 * yds = (const half2 *) y;

// #pragma unroll
    for (int k_loc = 0; k_loc < WARP_SIZE; k_loc += QR4_K*VDR_Q4_K_Q8_1_MMQ) {
        const int k_abs = k00 + k_loc;

#pragma unroll
        for (int col0 = 0; col0 < mmq_x; col0 += nwarps) {
            const int col = col0 + threadIdx.y;

#pragma unroll
            for (int row0 = 0; row0 < mmq_y; row0 += WARP_SIZE) {
                const int row = row0 + threadIdx.x;

                // Byte-wise view of the scales; the second pointer passed on is 8 bytes further on.
                const uint8_t * sc_bytes = (const uint8_t *) &xsc[row * (WARP_SIZE/8) + row/8 + k_abs/32] + 2*(k_loc/16);

                sum[col0/nwarps*mmq_y/WARP_SIZE + row0/WARP_SIZE] += vec_dot_q4_K_q8_1_impl_mmq(
                    &xq[row*(WARP_SIZE + 1) + k_abs/2],
                    &yq[col*MMQ_TILE_Y_K + k_loc],
                    sc_bytes,
                    sc_bytes+8,
                    xdm[row],
                    &yds[col*MMQ_TILE_Y_K + k_loc/QI8_1]);
            }
        }
    }
}
1471
+
1472
// Loads rows of block_q5_K data from x into the x tile.
//
// The low 4 bits (bxi->qs) and the fifth bit (bxi->qh) of each value are combined and stored
// in the integer part of the tile. With NEW_MMA_AVAILABLE bxi->dm * (1, -1) is multiplied with
// every unpacked (scale, min) byte pair and stored as half2; otherwise bxi->dm per row and the
// unpacked scale ints are stored as-is.
//
// x:       source data, indexed as block_q5_K with offset kbx0 + i*stride.
// x_tile:  destination tile.
// i_max:   upper clamp for the row index when need_check is set.
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {

#ifdef NEW_MMA_AVAILABLE
    constexpr int qs_row_stride = MMQ_MMA_TILE_X_K_Q8_1;

    int   * tile_qs = x_tile;
    half2 * tile_dm = (half2 *) (tile_qs + WARP_SIZE*2);
#else
    constexpr tile_x_sizes tile_sizes = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_K, mmq_y);
    constexpr int qs_row_stride = 2*WARP_SIZE + 1;

    int   * tile_qs = x_tile;
    half2 * tile_dm = (half2 *) (tile_qs + tile_sizes.qs);
    int   * tile_sc = (int   *) (tile_dm + tile_sizes.dm);
#endif // NEW_MMA_AVAILABLE

    // Quantized values.
#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps) {
        int row = row0 + threadIdx.y;

        if (need_check) {
            row = min(row, i_max);
        }

        const block_q5_K * blk = (const block_q5_K *) x + kbx0 + row*stride;
        const int          ky  = QR5_K*threadIdx.x;

        // Low 4 bits, split into the lower and upper nibbles.
        const int ql  = get_int_b4(blk->qs, threadIdx.x);
        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
        const int ql1 = (ql >> 4) & 0x0F0F0F0F;

        // Fifth bit, moved to bit 4 of each byte.
        const int qh  = get_int_b4(blk->qh, threadIdx.x % (QI5_K/4));
        const int qh0 = ((qh >> (2 * (threadIdx.x / (QI5_K/4)) + 0)) << 4) & 0x10101010;
        const int qh1 = ((qh >> (2 * (threadIdx.x / (QI5_K/4)) + 1)) << 4) & 0x10101010;

        const int kq0 = ky - ky % (QI5_K/2) + threadIdx.x % (QI5_K/4) + 0;
        const int kq1 = ky - ky % (QI5_K/2) + threadIdx.x % (QI5_K/4) + QI5_K/4;

        tile_qs[row*qs_row_stride + kq0] = ql0 | qh0;
        tile_qs[row*qs_row_stride + kq1] = ql1 | qh1;
    }

#ifdef NEW_MMA_AVAILABLE

    // Scales and mins, pre-multiplied with dm.
#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps*16) {
        int row = (row0 + threadIdx.y*16 + threadIdx.x/(WARP_SIZE/16)) % mmq_y;

        if (need_check) {
            row = min(row, i_max);
        }

        const block_q5_K * blk = (const block_q5_K *) x + kbx0 + row*stride;

        const int * packed_scales = (const int *) blk->scales;
        const int   ksc           = threadIdx.x % (WARP_SIZE/16);

        const int sc32 = unpack_scales_q45_K(packed_scales, ksc + 0);
        const int  m32 = unpack_scales_q45_K(packed_scales, ksc + 2);

        const uint8_t * sc_bytes = (const uint8_t *) &sc32;
        const uint8_t *  m_bytes = (const uint8_t *)  &m32;

        // The second component is negated before it is combined with the mins.
        const half2 dm = blk->dm * make_half2(1.0f, -1.0f);

#pragma unroll
        for (int l = 0; l < int(sizeof(int)); ++l) {
            tile_dm[row*MMQ_MMA_TILE_X_K_Q8_1 + sizeof(int)*ksc + l] = dm*make_half2(sc_bytes[l], m_bytes[l]);
        }
    }

#else

    // One dm value per row.
#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps*QI5_K) {
        int row = (row0 + threadIdx.y*QI5_K + threadIdx.x) % mmq_y;

        if (need_check) {
            row = min(row, i_max);
        }

        const block_q5_K * blk = (const block_q5_K *) x + kbx0 + row*stride;

        tile_dm[row] = blk->dm;
    }

    // Unpacked scales and mins as ints.
#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps*8) {
        int row = (row0 + threadIdx.y*8 + threadIdx.x/(WARP_SIZE/8)) % mmq_y;

        if (need_check) {
            row = min(row, i_max);
        }

        const block_q5_K * blk = (const block_q5_K *) x + kbx0 + row*stride;

        const int * packed_scales = (const int *) blk->scales;

        const int ksc      = threadIdx.x % (WARP_SIZE/8);
        const int unpacked = unpack_scales_q45_K(packed_scales, ksc);

        tile_sc[row*(WARP_SIZE/8) + row/8 + ksc] = unpacked;
    }
#endif // NEW_MMA_AVAILABLE
}
1579
+
1580
// Tile dot product of a q5_K x tile against a q8_1 y tile (dp4a code path).
//
// x:   x tile; ints with a row stride of QR5_K*WARP_SIZE + 1, one half2 per row at offset txs.qs
//      and unpacked scale ints at a further offset of txs.dm.
// y:   y tile with a row stride of MMQ_TILE_Y_K; half2 values at the row start, ints from offset 4.
// sum: per-thread accumulators, indexed by (j0/nwarps, i0/WARP_SIZE).
// k00: offset added to the x-tile k index; the scale int is selected with k00/32.
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q5_K_q8_1_dp4a(
    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {

    constexpr tile_x_sizes tile_sizes = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_K, mmq_y);

    const int   * xq  = x;
    const half2 * xdm = ((const half2 *) xq)  + tile_sizes.qs;
    const int   * xsc = ((const int   *) xdm) + tile_sizes.dm;
    const int   * yq  = y + 4;
    const half2 * yds = (const half2 *) y;

// #pragma unroll
    for (int k_loc = 0; k_loc < WARP_SIZE; k_loc += QR5_K*VDR_Q5_K_Q8_1_MMQ) {
        const int k_abs = k00 + k_loc;

#pragma unroll
        for (int col0 = 0; col0 < mmq_x; col0 += nwarps) {
            const int col = col0 + threadIdx.y;

#pragma unroll
            for (int row0 = 0; row0 < mmq_y; row0 += WARP_SIZE) {
                const int row = row0 + threadIdx.x;

                // Byte-wise view of the scales; the second pointer passed on is 8 bytes further on.
                const uint8_t * sc_bytes = ((const uint8_t *) &xsc[row * (WARP_SIZE/8) + row/8 + k00/32]) + 2*(k_loc/16);

                sum[col0/nwarps*mmq_y/WARP_SIZE + row0/WARP_SIZE] += vec_dot_q5_K_q8_1_impl_mmq(
                    &xq[row*(QR5_K*WARP_SIZE + 1) + k_abs],
                    &yq[col*MMQ_TILE_Y_K + k_loc],
                    sc_bytes,
                    sc_bytes+8,
                    xdm[row],
                    &yds[col*MMQ_TILE_Y_K + k_loc/QI8_1]);
            }
        }
    }
}
1612
+
1613
// Loads rows of block_q6_K data from x into the x tile.
//
// Pass 1 combines the low 4 bits (bxi->ql) and the upper 2 bits (bxi->qh) of each value,
// subtracts 32 per byte with __vsubss4 and stores the result in the integer part of the tile.
// Pass 2 stores bxi->d and pass 3 stores the packed scales read with get_int_b2.
//
// x:       source data, indexed as block_q6_K with offset kbx0 + i*stride.
// x_tile:  destination tile.
// i_max:   upper clamp for the row index when need_check is set.
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {

#ifdef NEW_MMA_AVAILABLE
    constexpr int qs_row_stride = MMQ_MMA_TILE_X_K_Q6_K;

    int   * tile_qs = x_tile;
    float * tile_df = (float *) (tile_qs + WARP_SIZE*2);
    int   * tile_sc = (int   *) (tile_df + WARP_SIZE/QI6_K);
#else
    constexpr tile_x_sizes tile_sizes = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q6_K, mmq_y);
    constexpr int qs_row_stride = 2*WARP_SIZE + 1;

    int   * tile_qs = x_tile;
    float * tile_df = (float *) (tile_qs + tile_sizes.qs);
    int   * tile_sc = (int   *) (tile_df + tile_sizes.dm);
#endif // NEW_MMA_AVAILABLE

    // Pass 1: quantized values.
#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps) {
        int row = row0 + threadIdx.y;

        if (need_check) {
            row = min(row, i_max);
        }

        const block_q6_K * blk = (const block_q6_K *) x + kbx0 + row*stride;

        // Low 4 bits, split into the lower and upper nibbles.
        const int ql  = get_int_b2(blk->ql, threadIdx.x);
        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
        const int ql1 = (ql >> 4) & 0x0F0F0F0F;

        // Upper 2 bits, placed at bits 4 and 5 of each byte.
        const int qh  = get_int_b2(blk->qh, (QI6_K/4) * (threadIdx.x / (QI6_K/2)) + threadIdx.x % (QI6_K/4));
        const int qh0 = ((qh >> ((threadIdx.x & 0x08) >> 2)) << 4) & 0x30303030;
        const int qh1 =  (qh >> ((threadIdx.x & 0x08) >> 2))       & 0x30303030;

        const int kq0 = 2*threadIdx.x - threadIdx.x % (QI6_K/2) + 0;
        const int kq1 = 2*threadIdx.x - threadIdx.x % (QI6_K/2) + QI6_K/2;

        tile_qs[row*qs_row_stride + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
        tile_qs[row*qs_row_stride + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
    }

    const int blocks_per_row = WARP_SIZE / QI6_K;           // == 1 if QK_K == 256
    const int block_in_row   = threadIdx.x % blocks_per_row; // == 0 if QK_K == 256

    // Pass 2: d values.
#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps * QI6_K) {
        int row = (row0 + threadIdx.y * QI6_K + threadIdx.x / blocks_per_row) % mmq_y;

        if (need_check) {
            row = min(row, i_max);
        }

        const block_q6_K * blk = (const block_q6_K *) x + kbx0 + row*stride + block_in_row;

#ifdef NEW_MMA_AVAILABLE
        tile_df[row*MMQ_MMA_TILE_X_K_Q6_K           + block_in_row] = blk->d;
#else
        tile_df[row*(WARP_SIZE/QI6_K) + row/QI6_K   + block_in_row] = blk->d;
#endif // NEW_MMA_AVAILABLE
    }

    // Pass 3: packed scales.
#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps * 8) {
        int row = (row0 + threadIdx.y * 8 + threadIdx.x / (WARP_SIZE/8)) % mmq_y;

        if (need_check) {
            row = min(row, i_max);
        }

        const block_q6_K * blk = (const block_q6_K *) x + kbx0 + row*stride + (threadIdx.x % (WARP_SIZE/8)) / 4;

        const int packed_scales = get_int_b2(blk->scales, threadIdx.x % (QI6_K/8));

#ifdef NEW_MMA_AVAILABLE
        tile_sc[row*MMQ_MMA_TILE_X_K_Q6_K + threadIdx.x % (WARP_SIZE/8)] = packed_scales;
#else
        tile_sc[row*(WARP_SIZE/8) + row/8 + threadIdx.x % (WARP_SIZE/8)] = packed_scales;
#endif // NEW_MMA_AVAILABLE
    }
}
1694
+
1695
// Tile dot product of a q6_K x tile against a q8_1 y tile (dp4a code path).
//
// x:   x tile; ints with a row stride of QR6_K*WARP_SIZE + 1, floats at offset txs.qs and
//      packed int8 scales at a further offset of txs.dm.
// y:   y tile with a row stride of MMQ_TILE_Y_K; floats at the row start, ints from offset 4.
// sum: per-thread accumulators, indexed by (j0/nwarps, i0/WARP_SIZE).
// k00: offset added to the x-tile k index.
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a(
    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {

    constexpr tile_x_sizes tile_sizes = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q6_K, mmq_y);

    const int   * xq  = x;
    const float * xd  = ((const float *) xq) + tile_sizes.qs;
    const int   * xsc = ((const int   *) xd) + tile_sizes.dm;
    const int   * yq  = y + 4;
    const float * yd  = (const float *) y;

// #pragma unroll
    for (int k_loc = 0; k_loc < WARP_SIZE; k_loc += QR6_K*VDR_Q6_K_Q8_1_MMQ) {
        const int k_abs = k00 + k_loc;

#pragma unroll
        for (int col0 = 0; col0 < mmq_x; col0 += nwarps) {
            const int col = col0 + threadIdx.y;

#pragma unroll
            for (int row0 = 0; row0 < mmq_y; row0 += WARP_SIZE) {
                const int row = row0 + threadIdx.x;

                // Byte-wise view of the scale int selected by k_abs/16.
                const int8_t * sc_bytes = ((const int8_t *) &xsc[row * (WARP_SIZE/8) + row/8 + k_abs/16]);

                sum[col0/nwarps*mmq_y/WARP_SIZE + row0/WARP_SIZE] += vec_dot_q6_K_q8_1_impl_mmq(
                    &xq[row*(QR6_K*WARP_SIZE + 1) + k_abs],
                    &yq[col*MMQ_TILE_Y_K + k_loc],
                    sc_bytes,
                    xd[row*(WARP_SIZE/QI6_K) + row/QI6_K],
                    &yd[col*MMQ_TILE_Y_K + k_loc/QI8_1]);
            }
        }
    }
}
1727
+
1728
// Q6_K x Q8_1 tile dot product built on the tile<>/load_ldmatrix/load_generic/mma helpers.
// The real body is only compiled when NEW_MMA_AVAILABLE is defined; otherwise the function is NO_DEVICE_CODE.
//   x   - x tile with a row stride of MMQ_MMA_TILE_X_K_Q6_K ints: int quants at the row start, float scale
//         WARP_SIZE*2 elements in, packed int8 scales WARP_SIZE/QI6_K elements after that
//   y   - y tile: rows of MMQ_TILE_Y_K ints; floats are read from the row start, int quants from 4 ints in
//   sum - per-thread accumulators, indexed (j0/tile_C::J + n)*tile_C::ne + l
//   k00 - k offset added to the x tile column index (the y tile is indexed without it)
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
#ifdef NEW_MMA_AVAILABLE

    typedef tile<16, 4, int> tile_A;
    typedef tile< 8, 4, int> tile_B;
    typedef tile<16, 8, int> tile_C;

    constexpr int granularity   = mmq_get_granularity_device(mmq_x);
    constexpr int rows_per_warp = 2 * granularity;
    constexpr int ntx           = rows_per_warp/tile_C::I; // Number of x minitiles per warp.

    // Number of int8 scales packed into one int. Kept as a signed int so that the unpack loop
    // below compares int with int instead of int with size_t.
    constexpr int sc_per_int = (int) sizeof(int);

    // Shift the y tile by whole groups of tile_B::I rows depending on threadIdx.y % ntx.
    y += (threadIdx.y % ntx) * (tile_B::I*MMQ_TILE_Y_K);

    const int   * x_qs = (const int   *) x;
    const float * x_df = (const float *) x_qs + WARP_SIZE*2;
    const int   * x_sc = (const int   *) x_df + WARP_SIZE/QI6_K;
    const int   * y_qs = (const int   *) y + 4;
    const float * y_df = (const float *) y;

    // First x tile row handled by this warp.
    const int i0 = (threadIdx.y / ntx) * (ntx*tile_A::I);

    tile_A   A[ntx][8];                 // x quants
    int    scA[ntx][tile_C::ne/2][8];   // unpacked int8 scales
    float   dA[ntx][tile_C::ne/2];      // float scale per row

    // Stage 1: load all x-side operands once; they are reused for every j0 below.
#pragma unroll
    for (int n = 0; n < ntx; ++n) {
#pragma unroll
        for (int k01 = 0; k01 < WARP_SIZE; k01 += 8) {
            const int k0 = k00 + k01;

            load_ldmatrix(A[n][k01/4 + 0], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q6_K + (k0 + 0),         MMQ_MMA_TILE_X_K_Q6_K);
            load_ldmatrix(A[n][k01/4 + 1], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q6_K + (k0 + tile_A::J), MMQ_MMA_TILE_X_K_Q6_K);
        }

#pragma unroll
        for (int k01 = 0; k01 < WARP_SIZE; k01 += 16) {
            const int k0 = k00 + k01;

#pragma unroll
            for (int l = 0; l < tile_C::ne/2; ++l) {
                const int i = i0 + n*tile_C::I + tile_C::get_i(2*l);

                const int      sc_packed = x_sc[i*MMQ_MMA_TILE_X_K_Q6_K + k0/16];
                const int8_t * sc        = (const int8_t *) &sc_packed;

                // Widen each packed int8 scale to int.
#pragma unroll
                for (int ksc = 0; ksc < sc_per_int; ++ksc) {
                    scA[n][l][k01/4 + ksc] = sc[ksc];
                }
            }
        }

#pragma unroll
        for (int l = 0; l < tile_C::ne/2; ++l) {
            const int i = i0 + n*tile_C::I + tile_C::get_i(2*l);

            dA[n][l] = x_df[i*MMQ_MMA_TILE_X_K_Q6_K];
        }
    }

    // Stage 2: for every group of y columns, accumulate scaled integer products in tmp, then apply dA.
#pragma unroll
    for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
        float tmp[ntx][tile_C::ne] = {{0.0f}};

#pragma unroll
        for (int k01 = 0; k01 < WARP_SIZE; k01 += 8) {
            tile_B B[2];
            float dB[tile_C::ne/2];

            // Here load_generic is faster than load_ldmatrix.
            load_generic(B[0], y_qs + j0*MMQ_TILE_Y_K + 0         + k01, MMQ_TILE_Y_K);
            load_generic(B[1], y_qs + j0*MMQ_TILE_Y_K + tile_B::J + k01, MMQ_TILE_Y_K);

#pragma unroll
            for (int l = 0; l < tile_C::ne/2; ++l) {
                const int j = j0 + tile_C::get_j(l);

                dB[l] = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
            }

#pragma unroll
            for (int n = 0; n < ntx; ++n) {
                tile_C C[2];
                mma(C[0], A[n][k01/4 + 0], B[0]);
                mma(C[1], A[n][k01/4 + 1], B[1]);

                // The integer sum of scale-weighted products is formed first, then multiplied by the y scale.
#pragma unroll
                for (int l = 0; l < tile_C::ne; ++l) {
                    tmp[n][l] += (C[0].x[l]*scA[n][l/2][k01/4 + 0] + C[1].x[l]*scA[n][l/2][k01/4 + 1])*dB[l%2];
                }
            }
        }

#pragma unroll
        for (int n = 0; n < ntx; ++n) {
#pragma unroll
            for (int l = 0; l < tile_C::ne; ++l) {
                sum[(j0/tile_C::J + n)*tile_C::ne + l] += tmp[n][l]*dA[n][l/2];
            }
        }
    }
#else
    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum); GGML_UNUSED(k00);
    NO_DEVICE_CODE;
#endif // NEW_MMA_AVAILABLE
}
1837
+
1838
// Stages IQ4_NL blocks into the x tile: packed 4-bit data is expanded through get_int_from_table_16
// into int quants, and each block's half scale d is stored as float.
//   x      - source data, reinterpreted as block_iq4_nl
//   x_tile - destination tile: int quants first, float scales after them
//   kbx0   - block offset applied to every row; row i starts at block kbx0 + i*stride
//   i_max  - largest row index to read; rows are clamped to it when need_check is set
//   stride - distance between consecutive rows, in blocks
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq4_nl(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {

    int * x_qs = x_tile;
#ifdef NEW_MMA_AVAILABLE
    float * x_df = (float *) (x_qs + WARP_SIZE*2);
#else
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ4_NL, mmq_y);
    float * x_df = (float *) (x_qs + txs.qs);
#endif // NEW_MMA_AVAILABLE

    const block_iq4_nl * blocks = (const block_iq4_nl *) x + kbx0;

    // ---- quants: every thread expands one packed int into two output ints ----
    const int kbx  = threadIdx.x / QI4_NL;
    const int kqsx = threadIdx.x % QI4_NL;
    // Destination column; the two expanded ints land at k0 and k0 + 4.
    const int k0   = 8 * (threadIdx.x / 4) + threadIdx.x % 4;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        const int i_raw = i0 + threadIdx.y;
        const int i     = need_check ? min(i_raw, i_max) : i_raw;

        const block_iq4_nl * bxi = blocks + i*stride + kbx;

        const int2 v = get_int_from_table_16(get_int_b2(bxi->qs, kqsx));

#ifdef NEW_MMA_AVAILABLE
        int * dst = x_qs + i*MMQ_MMA_TILE_X_K_Q8_0 + k0;
#else
        int * dst = x_qs + i*(2*WARP_SIZE + 1)     + k0;
#endif // NEW_MMA_AVAILABLE
        dst[0] = v.x;
        dst[4] = v.y;
    }

    // ---- scales: one float per block ----
    const int blocks_per_tile_x_row = WARP_SIZE / QI4_NL;
    const int kbxd = threadIdx.x % blocks_per_tile_x_row;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_NL) {
        const int i_raw = i0 + threadIdx.y * QI4_NL + threadIdx.x / blocks_per_tile_x_row;
        const int i     = need_check ? min(i_raw, i_max) : i_raw;

        const block_iq4_nl * bxi = blocks + i*stride + kbxd;

#ifdef NEW_MMA_AVAILABLE
        const int d_idx = i*MMQ_MMA_TILE_X_K_Q8_0 + kbxd;
#else
        const int d_idx = i*(WARP_SIZE/4) + i/4   + kbxd;
#endif // NEW_MMA_AVAILABLE
        x_df[d_idx] = __half2float(bxi->d);
    }
}
1895
+
1896
// Stages IQ2_XXS blocks into the x tile: grid entries are looked up in iq2xxs_grid, signs from
// ksigns_iq2xs are applied per byte, and one float scale per loaded group is derived from d and
// the top 4 bits of aux32.
//   x      - source data, reinterpreted as block_iq2_xxs
//   x_tile - destination tile: int quants first, float scales after them
//   kbx0   - block offset applied to every row; row i reads block kbx0 + i*stride
//   i_max  - largest row index to read; rows are clamped to it when need_check is set
//   stride - distance between consecutive rows, in blocks
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_xxs(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {

    int * x_qs = x_tile;
#ifdef NEW_MMA_AVAILABLE
    float * x_df = (float *) (x_qs + WARP_SIZE*2);
#else
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ2_XXS, mmq_y);
    float * x_df = (float *) (x_qs + txs.qs);
#endif // NEW_MMA_AVAILABLE

    const int kqsx = threadIdx.x % (QI2_XXS/2);

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/(QI2_XXS/2)) {
        const int i_raw = i0 + threadIdx.y*(2*WARP_SIZE/QI2_XXS) + threadIdx.x/(QI2_XXS/2);
        const int i     = need_check ? min(i_raw, i_max) : i_raw;

        const block_iq2_xxs * bxi = (const block_iq2_xxs *) x + kbx0 + i*stride;

        // First int: four 8-bit grid indices. Second int: packed 7-bit sign selectors plus a 4-bit scale on top.
        const int       grid_idx_packed = get_int_b2(bxi->qs, 2*kqsx+0);
        const uint8_t * grid_idx        = (const uint8_t *) &grid_idx_packed;
        const uint32_t  aux32           = get_int_b2(bxi->qs, 2*kqsx+1);

#ifdef NEW_MMA_AVAILABLE
        int   * qs_out = x_qs + i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx;
        float * d_out  = x_df + i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx;
#else
        int   * qs_out = x_qs + i*(2*WARP_SIZE + 1) + 8*kqsx;
        float * d_out  = x_df + i*(WARP_SIZE/4) + i/4 + kqsx;
#endif // NEW_MMA_AVAILABLE

#pragma unroll
        for (int l = 0; l < QR2_XXS; ++l) {
            const int * grid = (const int *) (iq2xxs_grid + grid_idx[l]);
            const int   sgn  = ksigns_iq2xs[(aux32 >> (7*l)) & 0x7F];

            // Spread four sign bits into four separate bytes, then turn each nonzero byte into a full-byte mask.
            const int mask_lo = __vcmpne4(((sgn & 0x03) << 7) | ((sgn & 0x0C) << 21), 0x00000000);
            const int mask_hi = __vcmpne4(((sgn & 0x30) << 3) | ((sgn & 0xC0) << 17), 0x00000000);

            // Per byte: (g ^ m) - m.
            qs_out[2*l + 0] = __vsub4(grid[0] ^ mask_lo, mask_lo);
            qs_out[2*l + 1] = __vsub4(grid[1] ^ mask_hi, mask_hi);
        }

        const int   ls = aux32 >> 28;
        const float d  = bxi->d;
        *d_out = (ls*d + d/2)/4;
    }
}
1953
+
1954
// Stages IQ2_XS blocks into the x tile: each 16-bit code selects an iq2xs_grid entry (low 9 bits)
// and a ksigns64 entry (high 7 bits); two float scales per loaded group are derived from d and the
// two nibbles of one scales byte.
//   x      - source data, reinterpreted as block_iq2_xs
//   x_tile - destination tile: int quants first, float scales after them
//   kbx0   - block offset applied to every row; row i reads block kbx0 + i*stride
//   i_max  - largest row index to read; rows are clamped to it when need_check is set
//   stride - distance between consecutive rows, in blocks
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_xs(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {

    int * x_qs = x_tile;
#ifdef NEW_MMA_AVAILABLE
    float * x_df = (float *) (x_qs + WARP_SIZE*2);
#else
    constexpr tile_x_sizes txs = MMQ_DP4A_TXS_Q8_0_16;
    float * x_df = (float *) (x_qs + txs.qs);
#endif // NEW_MMA_AVAILABLE

    const int kqsx = threadIdx.x % (QI2_XS/2);

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/(QI2_XS/2)) {
        const int i_raw = i0 + threadIdx.y*(2*WARP_SIZE/QI2_XS) + threadIdx.x/(QI2_XS/2);
        const int i     = need_check ? min(i_raw, i_max) : i_raw;

        const block_iq2_xs * bxi = (const block_iq2_xs *) x + kbx0 + i*stride;

        // Two ints viewed as four 16-bit codes.
        const int2 codes_packed = make_int2(get_int_b2(bxi->qs, 2*kqsx+0), get_int_b2(bxi->qs, 2*kqsx+1));
        const uint16_t * codes  = (const uint16_t *) &codes_packed;

#ifdef NEW_MMA_AVAILABLE
        int   * qs_out = x_qs + i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx;
        float * d_out  = x_df + i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx;
#else
        int   * qs_out = x_qs + i*(2*WARP_SIZE + 1) + 8*kqsx;
        float * d_out  = x_df + i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx;
#endif // NEW_MMA_AVAILABLE

#pragma unroll
        for (int l = 0; l < QR2_XS; ++l) {
            const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (codes[l] & 0x000001FF));
            const uint32_t * sgn  = (const uint32_t *)(ksigns64   + (codes[l] >> 9));

            // Per byte: (g ^ s) - s.
            const int lo = __vsub4(grid[0] ^ sgn[0], sgn[0]);
            const int hi = __vsub4(grid[1] ^ sgn[1], sgn[1]);

            qs_out[2*l + 0] = lo;
            qs_out[2*l + 1] = hi;
        }

        const int   ls = bxi->scales[kqsx];
        const float d  = bxi->d;
        d_out[0] = ((ls & 0x0F)*d + d/2)/4;
        d_out[1] = ((ls >>   4)*d + d/2)/4;
    }
}
2009
+
2010
// Stages IQ2_S blocks into the x tile: grid indices are built from a qs byte plus two bits of qh,
// looked up in iq2s_grid, and signed per byte using sign bytes read from qs at offset QK_K/32;
// two float scales per loaded group are derived from d and the nibbles of one scales byte.
//   x      - source data, reinterpreted as block_iq2_s
//   x_tile - destination tile: int quants first, float scales after them
//   kbx0   - block offset applied to every row; row i reads block kbx0 + i*stride
//   i_max  - largest row index to read; rows are clamped to it when need_check is set
//   stride - distance between consecutive rows, in blocks
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_s(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {

    int * x_qs = x_tile;
#ifdef NEW_MMA_AVAILABLE
    float * x_df = (float *) (x_qs + WARP_SIZE*2);
#else
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ2_S, mmq_y);
    float * x_df = (float *) (x_qs + txs.qs);
#endif // NEW_MMA_AVAILABLE

    const int kqsx = threadIdx.x % (QI2_S/2);

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/(QI2_S/2)) {
        const int i_raw = i0 + threadIdx.y*(2*WARP_SIZE/QI2_S) + threadIdx.x/(QI2_S/2);
        const int i     = need_check ? min(i_raw, i_max) : i_raw;

        const block_iq2_s * bxi = (const block_iq2_s *) x + kbx0 + i*stride;

        const int       low_bits_packed = get_int_b2(bxi->qs, kqsx);
        const uint8_t * low_bits        = (const uint8_t *) &low_bits_packed;

        const int qh = bxi->qh[kqsx];

        const int       sign_bytes_packed = get_int_b2(bxi->qs, QK_K/32 + kqsx);
        const uint8_t * sign_bytes        = (const uint8_t *) &sign_bytes_packed;

#ifdef NEW_MMA_AVAILABLE
        int   * qs_out = x_qs + i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx;
        float * d_out  = x_df + i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx;
#else
        int   * qs_out = x_qs + i*(2*WARP_SIZE + 1) + 8*kqsx;
        float * d_out  = x_df + i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx;
#endif // NEW_MMA_AVAILABLE

#pragma unroll
        for (int l = 0; l < QR2_S; ++l) {
            // 10-bit grid index: 8 bits from qs, bits 8-9 taken from qh.
            const int   grid_idx = low_bits[l] | ((qh << (8-2*l)) & 0x300);
            const int * grid     = (const int *)(iq2s_grid + grid_idx);

            // Spread the 8 sign bits over two ints (one bit per byte) and expand to full-byte masks.
            const int sgn     = sign_bytes[l];
            const int mask_lo = __vcmpne4(((sgn & 0x03) << 7) | ((sgn & 0x0C) << 21), 0x00000000);
            const int mask_hi = __vcmpne4(((sgn & 0x30) << 3) | ((sgn & 0xC0) << 17), 0x00000000);

            // Per byte: (g ^ m) - m.
            qs_out[2*l + 0] = __vsub4(grid[0] ^ mask_lo, mask_lo);
            qs_out[2*l + 1] = __vsub4(grid[1] ^ mask_hi, mask_hi);
        }

        const int   ls = bxi->scales[kqsx];
        const float d  = bxi->d;
        d_out[0] = ((ls & 0x0F)*d + d/2)/4;
        d_out[1] = ((ls >>   4)*d + d/2)/4;
    }
}
2072
+
2073
// Stages IQ3_XXS blocks into the x tile: byte indices select iq3xxs_grid entries, signs come from
// ksigns64 via 7-bit selectors in aux32, and one float scale per loaded group is derived from d
// and the top 4 bits of aux32.
//   x      - source data, reinterpreted as block_iq3_xxs
//   x_tile - destination tile: int quants first, float scales after them
//   kbx0   - block offset applied to every row; row i reads block kbx0 + i*stride
//   i_max  - largest row index to read; rows are clamped to it when need_check is set
//   stride - distance between consecutive rows, in blocks
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq3_xxs(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {

    int * x_qs = x_tile;
#ifdef NEW_MMA_AVAILABLE
    float * x_df = (float *) (x_qs + WARP_SIZE*2);
#else
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ3_XXS, mmq_y);
    float * x_df = (float *) (x_qs + txs.qs);
#endif // NEW_MMA_AVAILABLE

    const int kqsx = threadIdx.x % (QI3_XXS/2);

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/(QI3_XXS/2)) {
        const int i_raw = i0 + threadIdx.y*(2*WARP_SIZE/QI3_XXS) + threadIdx.x/(QI3_XXS/2);
        const int i     = need_check ? min(i_raw, i_max) : i_raw;

        const block_iq3_xxs * bxi = (const block_iq3_xxs *) x + kbx0 + i*stride;

        // Two ints viewed as eight byte-sized grid indices.
        const int2 idx_packed = make_int2(get_int_b2(bxi->qs, 2*kqsx+0), get_int_b2(bxi->qs, 2*kqsx+1));
        const uint8_t * idx   = (const uint8_t *) &idx_packed;
        // Sign selectors and scale bits are read from qs at int offset QK_K/16.
        const uint32_t aux32  = get_int_b2(bxi->qs, QK_K/16 + kqsx);

#ifdef NEW_MMA_AVAILABLE
        int   * qs_out = x_qs + i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx;
        float * d_out  = x_df + i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx;
#else
        int   * qs_out = x_qs + i*(2*WARP_SIZE + 1) + 8*kqsx;
        float * d_out  = x_df + i*(WARP_SIZE/4) + i/4 + kqsx;
#endif // NEW_MMA_AVAILABLE

#pragma unroll
        for (int l = 0; l < QR3_XXS; ++l) {
            const int2 grid = make_int2(iq3xxs_grid[idx[2*l+0]], iq3xxs_grid[idx[2*l+1]]);

            const int * sgn = (const int *)(ksigns64 + ((aux32 >> (7*l)) & 0x7F));

            // Per byte: (g ^ s) - s.
            qs_out[2*l + 0] = __vsub4(grid.x ^ sgn[0], sgn[0]);
            qs_out[2*l + 1] = __vsub4(grid.y ^ sgn[1], sgn[1]);
        }

        const int   ls = aux32 >> 28;
        const float d  = bxi->d;
        *d_out = (ls*d + d/2)/2;
    }
}
2128
+
2129
// Stages IQ3_S blocks into the x tile: 9-bit grid indices (8 bits from qs, 1 bit from qh) select
// iq3s_grid entries, signs come from the block's signs field, and one float scale per loaded group
// is computed as (1 + 2*nibble) * d.
//   x      - source data, reinterpreted as block_iq3_s
//   x_tile - destination tile: int quants first, float scales after them
//   kbx0   - block offset applied to every row; row i reads block kbx0 + i*stride
//   i_max  - largest row index to read; rows are clamped to it when need_check is set
//   stride - distance between consecutive rows, in blocks
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq3_s(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {

    int * x_qs = x_tile;
#ifdef NEW_MMA_AVAILABLE
    float * x_df = (float *) (x_qs + WARP_SIZE*2);
#else
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ3_S, mmq_y);
    float * x_df = (float *) (x_qs + txs.qs);
#endif // NEW_MMA_AVAILABLE

    const int kqsx = threadIdx.x % (QI3_S/2);

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/(QI3_S/2)) {
        const int i_raw = i0 + threadIdx.y*(2*WARP_SIZE/QI3_S) + threadIdx.x/(QI3_S/2);
        const int i     = need_check ? min(i_raw, i_max) : i_raw;

        const block_iq3_s * bxi = (const block_iq3_s *) x + kbx0 + i*stride;

        // Two ints viewed as eight low index bytes.
        const int2 low_bits_packed = make_int2(get_int_b2(bxi->qs, 2*kqsx+0), get_int_b2(bxi->qs, 2*kqsx+1));
        const uint8_t * low_bits   = (const uint8_t *) &low_bits_packed;

        const int qh = bxi->qh[kqsx];

        const int       sign_bytes_packed = get_int_b2(bxi->signs, kqsx);
        const uint8_t * sign_bytes        = (const uint8_t *) &sign_bytes_packed;

#ifdef NEW_MMA_AVAILABLE
        int   * qs_out = x_qs + i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx;
        float * d_out  = x_df + i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx;
#else
        int   * qs_out = x_qs + i*(2*WARP_SIZE + 1) + 8*kqsx;
        float * d_out  = x_df + i*(WARP_SIZE/4) + i/4 + kqsx;
#endif // NEW_MMA_AVAILABLE

#pragma unroll
        for (int l = 0; l < QR3_S; ++l) {
            // Bit 8 of each index comes from consecutive bits of qh.
            const int idx_lo = low_bits[2*l+0] | ((qh << (8 - 2*l)) & 0x100);
            const int idx_hi = low_bits[2*l+1] | ((qh << (7 - 2*l)) & 0x100);
            const int2 grid  = make_int2(iq3s_grid[idx_lo], iq3s_grid[idx_hi]);

            // Spread the 8 sign bits over two ints (one bit per byte) and expand to full-byte masks.
            const int sgn     = sign_bytes[l];
            const int mask_lo = __vcmpne4(((sgn & 0x03) << 7) | ((sgn & 0x0C) << 21), 0x00000000);
            const int mask_hi = __vcmpne4(((sgn & 0x30) << 3) | ((sgn & 0xC0) << 17), 0x00000000);

            // Per byte: (g ^ m) - m.
            qs_out[2*l+0] = __vsub4(grid.x ^ mask_lo, mask_lo);
            qs_out[2*l+1] = __vsub4(grid.y ^ mask_hi, mask_hi);
        }

        // Two 4-bit scales share one byte; the shift selects the low or high nibble.
        const int   ls = 1 + 2*((bxi->scales[kqsx/2] >> (((2*kqsx) << 1) & 0x04)) & 0x0F);
        const float d  = bxi->d;
        *d_out = ls*d;
    }
}
2191
+
2192
// Stages IQ1_S blocks into the x tile: 11-bit grid indices (8 bits from qs, 3 bits from qh) select
// iq1s_grid_gpu entries whose low and high nibbles are split into two ints, and one half2
// (d1q, d1q*delta) per loaded group is stored after the quants.
//   x      - source data, reinterpreted as block_iq1_s
//   x_tile - destination tile: int quants first, half2 scale pairs after them
//   kbx0   - block offset applied to every row; row i reads block kbx0 + i*stride
//   i_max  - largest row index to read; rows are clamped to it when need_check is set
//   stride - distance between consecutive rows, in blocks
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq1_s(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {

#ifdef NEW_MMA_AVAILABLE
    int   * x_qs = (int   *)  x_tile;
    half2 * x_ds = (half2 *) (x_qs + WARP_SIZE*2);
#else
    // Query the tile layout with this loader's own type tag. The previous code passed GGML_TYPE_IQ3_S
    // (copy-paste from load_tiles_iq3_s), which only works while both types share the same qs size.
    // NOTE(review): assumes mmq_get_dp4a_tile_x_sizes handles GGML_TYPE_IQ1_S - confirm against its definition.
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ1_S, mmq_y);
    int   * x_qs = (int   *)  x_tile;
    half2 * x_ds = (half2 *) (x_qs + txs.qs);
#endif // NEW_MMA_AVAILABLE

    const int kqsx = threadIdx.x % QI1_S;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/QI1_S) {
        int i = i0 + threadIdx.y*(WARP_SIZE/QI1_S) + threadIdx.x/QI1_S;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_iq1_s * bxi = (const block_iq1_s *) x + kbx0 + i*stride;

        // One int viewed as four low index bytes.
        const int       qs_packed = get_int_b2(bxi->qs, kqsx);
        const uint8_t * qs        = (const uint8_t *) &qs_packed;

        const int qh = bxi->qh[kqsx];

#pragma unroll
        for (int l = 0; l < QR1_S/2; ++l) {
            // Bits 8-10 of the grid index come from 3-bit groups of qh.
            const int grid = iq1s_grid_gpu[qs[l] | (((qh >> (3*l)) & 0x07) << 8)];

            // Split every byte of the grid entry into its low and high nibble.
            const int grid0 = (grid >> 0) & 0x0F0F0F0F;
            const int grid1 = (grid >> 4) & 0x0F0F0F0F;

#ifdef NEW_MMA_AVAILABLE
            x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 8*kqsx + (2*l+0)] = grid0;
            x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 8*kqsx + (2*l+1)] = grid1;
#else
            x_qs[i*(2*WARP_SIZE + 1)     + 8*kqsx + (2*l+0)] = grid0;
            x_qs[i*(2*WARP_SIZE + 1)     + 8*kqsx + (2*l+1)] = grid1;
#endif // NEW_MMA_AVAILABLE
        }

        // Scale: d times an odd factor taken from bits 12-14 of qh.
        const float d1q   = __half2float(bxi->d) * (((qh >> 11) & 0x0E) + 1);
        // Bit 15 of qh selects between -1 + IQ1S_DELTA and -1 - IQ1S_DELTA.
        const float delta = -1.0f + IQ1S_DELTA - (qh & 0x8000) * (2.0f*IQ1S_DELTA/0x8000);

#ifdef NEW_MMA_AVAILABLE
        x_ds[i*MMQ_MMA_TILE_X_K_Q8_1 + kqsx] = make_half2(d1q, d1q*delta);
#else
        x_ds[i*(WARP_SIZE/4) + i/4   + kqsx] = make_half2(d1q, d1q*delta);
#endif // NEW_MMA_AVAILABLE
    }
}
2247
+
2248
// Stages IQ4_XS blocks into the x tile: packed 4-bit data is expanded through get_int_from_table_16
// into int quants, and per-sub-block float scales d * (ls - 32) are built from scales_l / scales_h.
//   x      - source data, reinterpreted as block_iq4_xs
//   x_tile - destination tile: int quants first, float scales after them
//   kbx0   - block offset applied to every row; row i reads block kbx0 + i*stride
//   i_max  - largest row index to read; rows are clamped to it when need_check is set
//   stride - distance between consecutive rows, in blocks
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq4_xs(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {

    int * x_qs = x_tile;
#ifdef NEW_MMA_AVAILABLE
    float * x_df = (float *) (x_qs + WARP_SIZE*2);
#else
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ4_XS, mmq_y);
    float * x_df = (float *) (x_qs + txs.qs);
#endif // NEW_MMA_AVAILABLE

    const block_iq4_xs * blocks = (const block_iq4_xs *) x + kbx0;

    // ---- quants ----
    // The block offset within a row (threadIdx.x / QI4_XS in the original comment) is the constant 0,
    // and the int index within the block (threadIdx.x % QI4_XS) is threadIdx.x itself.
    const int kqsx = threadIdx.x;
    // Destination column; the two expanded ints land at k0 and k0 + 4.
    const int k0   = 8 * (threadIdx.x / 4) + threadIdx.x % 4;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        const int i_raw = i0 + threadIdx.y;
        const int i     = need_check ? min(i_raw, i_max) : i_raw;

        const block_iq4_xs * bxi = blocks + i*stride;

        const int2 v = get_int_from_table_16(get_int_b4(bxi->qs, kqsx));

#ifdef NEW_MMA_AVAILABLE
        int * dst = x_qs + i*MMQ_MMA_TILE_X_K_Q8_0 + k0;
#else
        int * dst = x_qs + i*(2*WARP_SIZE + 1)     + k0;
#endif // NEW_MMA_AVAILABLE
        dst[0] = v.x;
        dst[4] = v.y;
    }

    // ---- scales: 8 entries per row, selected by threadIdx.x % 8 ----
    const int ksc = threadIdx.x % 8;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
        const int i_raw = i0 + threadIdx.y * 4 + threadIdx.x / (WARP_SIZE/4);
        const int i     = need_check ? min(i_raw, i_max) : i_raw;

        const block_iq4_xs * bxi = blocks + i*stride;

        const float d = __half2float(bxi->d);

        // 6-bit scale: low 4 bits from a nibble of scales_l, high 2 bits from scales_h.
        const int ls_lo = (bxi->scales_l[ksc/2] >> (4*(ksc % 2))) & 0x0F;
        const int ls_hi = (bxi->scales_h >> (2*ksc)) & 0x03;
        const int ls    = ls_lo | (ls_hi << 4);

#ifdef NEW_MMA_AVAILABLE
        const int d_idx = i*MMQ_MMA_TILE_X_K_Q8_0 + ksc;
#else
        const int d_idx = i*(WARP_SIZE/4) + i/4   + ksc;
#endif // NEW_MMA_AVAILABLE
        x_df[d_idx] = d * (ls - 32);
    }
}
2307
+
2308
// Copies the per-thread accumulators of the dp4a path to dst.
//   sum     - accumulators, indexed (j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE
//   ids_dst - maps tile column j to the destination column used for addressing dst
//   dst     - output; element (i, j) is written to dst[ids_dst[j]*stride + i]
//   stride  - distance between consecutive destination columns, in floats
//   i_max   - largest row index to write; only enforced when need_check is set
//   j_max   - largest column index to write; always enforced
template<int mmq_x, int mmq_y, int nwarps, bool need_check>
static __device__ __forceinline__ void mmq_write_back_dp4a(
    const float * __restrict__ sum, const int32_t * __restrict__ ids_dst, float * __restrict__ dst,
    const int stride, const int i_max, const int j_max) {
#pragma unroll
    for (int col0 = 0; col0 < mmq_x; col0 += nwarps) {
        const int col = col0 + threadIdx.y;

        // Columns only grow from here on, so the first out-of-range column ends the work of this thread.
        if (col > j_max) {
            return;
        }

#pragma unroll
        for (int row0 = 0; row0 < mmq_y; row0 += WARP_SIZE) {
            const int row = row0 + threadIdx.x;

            const bool row_ok = !need_check || row <= i_max;
            if (row_ok) {
                const int acc_idx = (col0/nwarps) * (mmq_y/WARP_SIZE) + row0/WARP_SIZE;
                dst[ids_dst[col]*stride + row] = sum[acc_idx];
            }
        }
    }
}
2332
+
2333
// Copies the per-thread accumulators of the MMA path to dst, using tile_C's get_i/get_j to map
// accumulator element l to its row and column.
//   sum     - accumulators, indexed (j0/tile_C::J + n)*tile_C::ne + l
//   ids_dst - maps tile column j to the destination column used for addressing dst
//   dst     - output; element (i, j) is written to dst[ids_dst[j]*stride + i]
//   stride  - distance between consecutive destination columns, in floats
//   i_max   - largest row index to write; only enforced when need_check is set
//   j_max   - largest column index to write; always enforced
template<int mmq_x, int mmq_y, int nwarps, bool need_check>
static __device__ __forceinline__ void mmq_write_back_mma(
    const float * __restrict__ sum, const int * __restrict__ ids_dst, float * __restrict__ dst,
    const int stride, const int i_max, const int j_max) {
    typedef tile<16, 8, int> tile_C;

    constexpr int granularity   = mmq_get_granularity_device(mmq_x);
    constexpr int rows_per_warp = 2 * granularity;
    constexpr int ntx           = rows_per_warp/tile_C::I; // Number of x minitiles per warp.

    // First row handled by this warp.
    const int row_base = (threadIdx.y / ntx) * (ntx*tile_C::I);
#ifdef NEW_MMA_AVAILABLE
    static_assert(nwarps*tile_C::I == mmq_y, "nwarps*tile_C::I != mmq_y");
#endif // NEW_MMA_AVAILABLE

#pragma unroll
    for (int col0 = 0; col0 < mmq_x; col0 += ntx*tile_C::J) {
#pragma unroll
        for (int n = 0; n < ntx; ++n) {
#pragma unroll
            for (int l = 0; l < tile_C::ne; ++l) {
                const int col = col0 + (threadIdx.y % ntx) * tile_C::J + tile_C::get_j(l);

                if (col <= j_max) {
                    const int row = row_base + n*tile_C::I + tile_C::get_i(l);

                    if (!need_check || row <= i_max) {
                        dst[ids_dst[col]*stride + row] = sum[(col0/tile_C::J + n)*tile_C::ne + l];
                    }
                }
            }
        }
    }
}
2371
+
2372
+ // -------------------------------------------------------------------------------------------------------------------------------------
2373
+
2374
+ template <int mmq_x, int mmq_y, int nwarps, bool need_check, ggml_type type>
2375
+ struct mmq_type_traits;
2376
+
2377
// GGML_TYPE_Q4_0: dedicated tile loader and dp4a dot product; the MMA dot product is the
// q8_0 x q8_1 routine instantiated with MMQ_Q8_1_DS_LAYOUT_DS4.
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_0> {
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q4_0<mmq_y, nwarps, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q4_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps, MMQ_Q8_1_DS_LAYOUT_DS4>;
    static constexpr int              vdr          = VDR_Q4_0_Q8_1_MMQ;
};
2384
+
2385
// GGML_TYPE_Q4_1: dedicated tile loader and dp4a dot product; the MMA dot product is the
// q8_1 x q8_1 routine.
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_1> {
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q4_1<mmq_y, nwarps, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q4_1_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_1_q8_1_mma<mmq_x, mmq_y, nwarps>;
    static constexpr int              vdr          = VDR_Q4_1_Q8_1_MMQ;
};
2392
+
2393
// GGML_TYPE_Q5_0: dedicated tile loader; both dot products are the q8_0 x q8_1 routines
// (MMA variant instantiated with MMQ_Q8_1_DS_LAYOUT_D4).
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_0> {
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q5_0<mmq_y, nwarps, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps, MMQ_Q8_1_DS_LAYOUT_D4>;
    static constexpr int              vdr          = VDR_Q5_0_Q8_1_MMQ;
};
2400
+
2401
// GGML_TYPE_Q5_1: dedicated tile loader; both dot products are the q8_1 x q8_1 routines.
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_1> {
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q5_1<mmq_y, nwarps, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_1_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_1_q8_1_mma<mmq_x, mmq_y, nwarps>;
    static constexpr int              vdr          = VDR_Q5_1_Q8_1_MMQ;
};
2408
+
2409
// GGML_TYPE_Q8_0: dedicated tile loader; both dot products are the q8_0 x q8_1 routines
// (MMA variant instantiated with MMQ_Q8_1_DS_LAYOUT_D4).
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q8_0> {
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q8_0<mmq_y, nwarps, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps, MMQ_Q8_1_DS_LAYOUT_D4>;
    static constexpr int              vdr          = VDR_Q8_0_Q8_1_MMQ;
};
2416
+
2417
// GGML_TYPE_Q2_K: dedicated tile loader and dedicated dp4a / MMA dot products.
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q2_K> {
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q2_K<mmq_y, nwarps, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q2_K_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q2_K_q8_1_mma<mmq_x, mmq_y, nwarps>;
    static constexpr int              vdr          = VDR_Q2_K_Q8_1_MMQ;
};
2424
+
2425
// GGML_TYPE_Q3_K: dedicated tile loader and dp4a dot product; the MMA dot product is the
// q8_0_16 x q8_1 routine.
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q3_K> {
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q3_K<mmq_y, nwarps, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q3_K_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y, nwarps>;
    static constexpr int              vdr          = VDR_Q3_K_Q8_1_MMQ;
};
2432
+
2433
// GGML_TYPE_Q4_K: dedicated tile loader and dp4a dot product; the MMA dot product is the
// q8_1 x q8_1 routine.
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_K> {
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q4_K<mmq_y, nwarps, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q4_K_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_1_q8_1_mma<mmq_x, mmq_y, nwarps>;
    static constexpr int              vdr          = VDR_Q4_K_Q8_1_MMQ;
};
2440
+
2441
// GGML_TYPE_Q5_K: dedicated tile loader and dp4a dot product; the MMA dot product is the
// q8_1 x q8_1 routine.
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_K> {
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q5_K<mmq_y, nwarps, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q5_K_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_1_q8_1_mma<mmq_x, mmq_y, nwarps>;
    static constexpr int              vdr          = VDR_Q5_K_Q8_1_MMQ;
};
2448
+
2449
// GGML_TYPE_Q6_K: dedicated tile loader and dedicated dp4a / MMA dot products.
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q6_K> {
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q6_K<mmq_y, nwarps, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q6_K_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q6_K_q8_1_mma<mmq_x, mmq_y, nwarps>;
    static constexpr int              vdr          = VDR_Q6_K_Q8_1_MMQ;
};
2456
+
2457
// GGML_TYPE_IQ2_XXS: dedicated tile loader; both dot products are the q8_0 x q8_1 routines
// (MMA variant instantiated with MMQ_Q8_1_DS_LAYOUT_D4).
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_IQ2_XXS> {
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq2_xxs<mmq_y, nwarps, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps, MMQ_Q8_1_DS_LAYOUT_D4>;
    static constexpr int              vdr          = VDR_IQ2_XXS_Q8_1_MMQ;
};
2464
+
2465
// GGML_TYPE_IQ2_XS: dedicated tile loader; both dot products are the q8_0_16 x q8_1 routines.
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_IQ2_XS> {
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq2_xs<mmq_y, nwarps, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y, nwarps>;
    static constexpr int              vdr          = VDR_IQ2_XS_Q8_1_MMQ;
};
2472
+
2473
// GGML_TYPE_IQ2_S: dedicated tile loader; both dot products are the q8_0_16 x q8_1 routines.
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_IQ2_S> {
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq2_s<mmq_y, nwarps, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y, nwarps>;
    static constexpr int              vdr          = VDR_IQ2_S_Q8_1_MMQ;
};
2480
+
2481
// GGML_TYPE_IQ3_XXS: dedicated tile loader; both dot products are the q8_0 x q8_1 routines
// (MMA variant instantiated with MMQ_Q8_1_DS_LAYOUT_D4).
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_IQ3_XXS> {
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq3_xxs<mmq_y, nwarps, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps, MMQ_Q8_1_DS_LAYOUT_D4>;
    static constexpr int              vdr          = VDR_IQ3_XXS_Q8_1_MMQ;
};
2488
+
2489
// MMQ trait table entry for GGML_TYPE_IQ3_S.
// Tiles are loaded with load_tiles_iq3_s and consumed by the q8_0 x q8_1 dot products;
// the MMA variant is instantiated with the MMQ_Q8_1_DS_LAYOUT_D4 layout.
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_IQ3_S> {
    static constexpr int              vdr          = VDR_IQ3_S_Q8_1_MMQ;
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq3_s<mmq_y, nwarps, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps, MMQ_Q8_1_DS_LAYOUT_D4>;
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
};
2496
+
2497
// MMQ trait table entry for GGML_TYPE_IQ1_S.
// Tiles are loaded with load_tiles_iq1_s and consumed by the q8_1 x q8_1 dot products.
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_IQ1_S> {
    static constexpr int              vdr          = VDR_IQ1_S_Q8_1_MMQ;
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq1_s<mmq_y, nwarps, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_1_q8_1_mma<mmq_x, mmq_y, nwarps>;
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_1_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
};
2504
+
2505
// MMQ trait table entry for GGML_TYPE_IQ4_NL.
// Tiles are loaded with load_tiles_iq4_nl and consumed by the q8_0 x q8_1 dot products;
// the MMA variant is instantiated with the MMQ_Q8_1_DS_LAYOUT_D4 layout.
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_IQ4_NL> {
    static constexpr int              vdr          = VDR_IQ4_NL_Q8_1_MMQ;
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq4_nl<mmq_y, nwarps, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps, MMQ_Q8_1_DS_LAYOUT_D4>;
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
};
2512
+
2513
// MMQ trait table entry for GGML_TYPE_IQ4_XS.
// Tiles are loaded with load_tiles_iq4_xs and consumed by the q8_0 x q8_1 dot products;
// the MMA variant is instantiated with the MMQ_Q8_1_DS_LAYOUT_D4 layout.
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_IQ4_XS> {
    static constexpr int              vdr          = VDR_IQ4_XS_Q8_1_MMQ;
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq4_xs<mmq_y, nwarps, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps, MMQ_Q8_1_DS_LAYOUT_D4>;
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
};
2520
+
2521
// Computes one output tile (or a k-slice of one) and writes it back.
//
// For every k step in [kb0_start, kb0_stop) the x tile is loaded once through the type's
// load_tiles function, then the y data is consumed in two halves: tile_y is refilled between
// the two vec_dot calls, and the second call is passed WARP_SIZE as its k offset.
//
// Shared memory layout (dynamic, shared with the caller):
//   [0, mmq_x)      ints reserved ahead of tile_y (the kernel keeps the dst ids there)
//   tile_y          starts at data_mul_mat_q + mmq_x
//   tile_x          starts after tile_y, padded to a multiple of nwarps*WARP_SIZE ints
//
// fixup == true : the partial sums go to this block's slot in tmp_fixup (dense mmq_y x mmq_x).
// fixup == false: the sums go to dst using stride_col_dst and the tile bounds.
template <ggml_type type, int mmq_x, int nwarps, bool need_check, bool fixup>
static __device__ __forceinline__ void mul_mat_q_process_tile(
        const char * __restrict__ x, const int offset_x, const int * __restrict__ y,
        const int * __restrict__ ids_dst, float * __restrict__ dst, float * __restrict__ tmp_fixup,
        const int stride_row_x, const int ncols_y, const int stride_col_dst,
        const int tile_x_max_i, const int tile_y_max_j, const int kb0_start, const int kb0_stop) {

    constexpr int qk    = ggml_cuda_type_traits<type>::qk;
    constexpr int mmq_y = get_mmq_y_device();

    using traits = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>;
    constexpr load_tiles_mmq_t load_tiles = traits::load_tiles;

    extern __shared__ int data_mul_mat_q[];
    int * tile_y = data_mul_mat_q + mmq_x;
    int * tile_x = tile_y + GGML_PAD(mmq_x*(WARP_SIZE + WARP_SIZE/QI8_1), nwarps*WARP_SIZE);

    // Pick the dot-product / write-back pair matching the available instruction path.
#ifdef NEW_MMA_AVAILABLE
    constexpr vec_dot_mmq_t    vec_dot    = traits::vec_dot_mma;
    constexpr mmq_write_back_t write_back = mmq_write_back_mma<mmq_x, mmq_y, nwarps, need_check>;
#else
    constexpr vec_dot_mmq_t    vec_dot    = traits::vec_dot_dp4a;
    constexpr mmq_write_back_t write_back = mmq_write_back_dp4a<mmq_x, mmq_y, nwarps, need_check>;
#endif // NEW_MMA_AVAILABLE

    constexpr int blocks_per_iter = MMQ_ITER_K / qk;

    // Per-thread accumulators for this tile.
    float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f};

    for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_iter) {
        load_tiles(x, tile_x, offset_x + kb0, tile_x_max_i, stride_row_x);

        // First half of the y data for this k step.
        {
            const int * y_src = y + ncols_y*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 0*sizeof(block_q8_1_mmq)/sizeof(int));
#pragma unroll
            for (int base = 0; base < mmq_x*MMQ_TILE_Y_K; base += nwarps*WARP_SIZE) {
                const int idx = base + threadIdx.y*WARP_SIZE + threadIdx.x;

                tile_y[idx] = y_src[idx];
            }
        }

        // All tile writes must be complete before any thread reads them.
        __syncthreads();

        vec_dot(tile_x, tile_y, sum, 0);

        // All reads of tile_y must be complete before it is overwritten.
        __syncthreads();

        // Second half of the y data for this k step.
        {
            const int * y_src = y + ncols_y*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 1*sizeof(block_q8_1_mmq)/sizeof(int));
#pragma unroll
            for (int base = 0; base < mmq_x*MMQ_TILE_Y_K; base += nwarps*WARP_SIZE) {
                const int idx = base + threadIdx.y*WARP_SIZE + threadIdx.x;

                tile_y[idx] = y_src[idx];
            }
        }

        __syncthreads();

        vec_dot(tile_x, tile_y, sum, WARP_SIZE);

        // Protects both tiles against the loads of the next iteration.
        __syncthreads();
    }

    if (fixup) {
        // Dense per-block slot: column stride mmq_y, full tile extent.
        write_back(sum, ids_dst, tmp_fixup + blockIdx.x*(mmq_x*mmq_y), mmq_y, mmq_y, mmq_x);
    } else {
        write_back(sum, ids_dst, dst, stride_col_dst, tile_x_max_i, tile_y_max_j);
    }
}
2590
+
2591
+
2592
// The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598
//
// Two execution modes are compiled into this kernel:
//  * Conventional tiling (AMD HIP, or CUDA below Volta): one block per output tile with
//    blockIdx.x = row tile (it), blockIdx.y = column tile (jt), blockIdx.z = wt*nchannels_y + zt.
//  * Stream-k (otherwise): the flattened (it, wt, zt, jt, k-block) index space is split evenly
//    across gridDim.x blocks. Tiles that a block finishes are written directly to dst; the
//    trailing tile that it only starts is written to tmp_fixup for a later fixup pass.
//
// Dynamic shared memory: the first mmq_x ints hold the destination column ids
// (ids_dst_shared); the tile buffers used by mul_mat_q_process_tile follow.
// When ids_dst is non-null, the columns of channel zt are taken from
// [expert_bounds[zt], expert_bounds[zt + 1]) and dst columns are indirected through ids_dst.

template <ggml_type type, int mmq_x, int nwarps, bool need_check>
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
    __launch_bounds__(WARP_SIZE*nwarps, 2)
#endif // defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
#else
#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
    __launch_bounds__(WARP_SIZE*nwarps, 1)
#else
    __launch_bounds__(WARP_SIZE*nwarps, 2)
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
static __global__ void mul_mat_q(
        const char * __restrict__ x, const int * __restrict__ y, const int32_t * __restrict__ ids_dst,
        const int32_t * __restrict__ expert_bounds, float * __restrict__ dst, float * __restrict__ tmp_fixup,
        const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst,
        const int channel_ratio, const int nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
        const int sample_ratio, const int nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {

    // Skip unused template specializations for faster compilation:
    if (mmq_x > get_mmq_x_max_device() || mmq_x % mmq_get_granularity_device(mmq_x) != 0) {
        NO_DEVICE_CODE;
        return;
    }

    constexpr int qk    = ggml_cuda_type_traits<type>::qk;
    constexpr int mmq_y = get_mmq_y_device();

    const int ntx = (ncols_dst + mmq_x - 1) / mmq_x; // Number of tiles x
    const int nty = (nrows_x   + mmq_y - 1) / mmq_y; // Number of tiles y

    // Start with identity ids (column j maps to column j).
    // Regular matrix multiplications never change this.
    // For MoE the real indices are loaded from ids_dst further down.
    extern __shared__ int ids_dst_shared[]; // Stored at beginning of shared memory.
#pragma unroll
    for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
        const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;

        // Only the last, partial pass can run past mmq_x.
        if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) {
            break;
        }

        ids_dst_shared[j] = j;
    }
    __syncthreads();

    // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
    {
        // blockIdx.z enumerates (sample, channel) pairs.
        const int wt = blockIdx.z / nchannels_y;
        const int zt = blockIdx.z - wt*nchannels_y;
        const int jt = blockIdx.y;
        const int it = blockIdx.x;

        // Defaults for regular matrix multiplication:
        int col_low    = 0;
        int col_high   = ncols_dst;
        int col_diff   = ncols_dst;
        int offset_y   = wt*stride_sample_y   + zt*stride_channel_y;
        int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;

        if (ids_dst) {
            col_low  = expert_bounds[zt + 0];
            col_high = expert_bounds[zt + 1];
            col_diff = col_high - col_low;

            offset_y   = 0;
            offset_dst = 0;

            // This column tile lies entirely beyond the columns of channel zt.
            if (jt*mmq_x >= col_diff) {
                return;
            }

            // __syncthreads(); // There is no previous tile that could cause a race condition.
#pragma unroll
            for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
                const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;

                if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) {
                    break;
                }

                ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
            }
            __syncthreads();
        }

        offset_y   += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
        offset_dst += it*mmq_y;

        const int tile_x_max_i = nrows_x  - it*mmq_y - 1;
        const int tile_y_max_j = col_diff - jt*mmq_x - 1;

        const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;

        // One block owns the whole k range of its tile, so no fixup is needed.
        constexpr bool fixup = false;
        mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
            (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
             tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
        return;
    }
#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA

    const     int64_t blocks_per_ne00 = ncols_x / qk;
    constexpr int     blocks_per_iter = MMQ_ITER_K / qk;

    // kbc == k block continuous, current index in continuous ijk space.
    int64_t kbc      = (int64_t) blockIdx.x     *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
    int64_t kbc_stop = (int64_t)(blockIdx.x + 1)*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;

    // Round both bounds down so that the in-tile k offset is a multiple of blocks_per_iter.
    kbc      -= (kbc      % blocks_per_ne00) % blocks_per_iter;
    kbc_stop -= (kbc_stop % blocks_per_ne00) % blocks_per_iter;

    // kb0 == k index when doing the matrix multiplication for an output tile.
    int kb0_start = kbc % blocks_per_ne00;
    int kb0_stop  = min(blocks_per_ne00, kb0_start + kbc_stop - kbc);

    // Tiles for which this block reaches the end of the k range.
    while (kbc < kbc_stop && kb0_stop == blocks_per_ne00) {
        // Decompose kbc into (it, wt, zt, jt); what remains is the k offset within the tile.
        int rem = kbc;
        const int it = rem / (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
        rem -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
        const int wt = rem / (nchannels_y*ntx*blocks_per_ne00);
        rem -= wt * (nchannels_y*ntx*blocks_per_ne00);
        const int zt = rem / (ntx*blocks_per_ne00);
        rem -= zt * (ntx*blocks_per_ne00);
        const int jt = rem / blocks_per_ne00;

        // Defaults for regular matrix multiplication:
        int col_low    = 0;
        int col_high   = ncols_dst;
        int col_diff   = ncols_dst;
        int offset_y   = wt*stride_sample_y   + zt*stride_channel_y;
        int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;

        if (ids_dst) {
            col_low  = expert_bounds[zt + 0];
            col_high = expert_bounds[zt + 1];
            col_diff = col_high - col_low;

            offset_y   = 0;
            offset_dst = 0;

            // Nothing to compute for this tile: advance to the start of the next one.
            if (jt*mmq_x >= col_diff) {
                kbc += blocks_per_ne00;
                kbc -= kbc % blocks_per_ne00;

                kb0_start = 0;
                kb0_stop  = min(blocks_per_ne00, kbc_stop - kbc);

                continue;
            }

            // The previous tile may still be reading the ids, hence the barrier before overwriting.
            __syncthreads();
#pragma unroll
            for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
                const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;

                if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) {
                    break;
                }

                ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
            }
            __syncthreads();
        }

        offset_y   += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
        offset_dst += it*mmq_y;

        const int tile_x_max_i = nrows_x  - it*mmq_y - 1;
        const int tile_y_max_j = col_diff - jt*mmq_x - 1;

        const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;

        constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
        mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
            (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
             tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);

        // Move to the first k block of the next tile.
        kbc += blocks_per_ne00;
        kbc -= kbc % blocks_per_ne00;

        kb0_start = 0;
        kb0_stop  = min(blocks_per_ne00, kbc_stop - kbc);
    }

    // No partial trailing tile left for this block.
    if (kbc >= kbc_stop) {
        return;
    }

    // Trailing tile: this block only covers the beginning of its k range.
    int rem = kbc;
    const int it = rem / (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
    rem -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
    const int wt = rem / (nchannels_y*ntx*blocks_per_ne00);
    rem -= wt * (nchannels_y*ntx*blocks_per_ne00);
    const int zt = rem / (ntx*blocks_per_ne00);
    rem -= zt * (ntx*blocks_per_ne00);
    const int jt = rem / blocks_per_ne00;

    // Defaults for regular matrix multiplication:
    int col_low    = 0;
    int col_high   = ncols_dst;
    int col_diff   = ncols_dst;
    int offset_y   = wt*stride_sample_y   + zt*stride_channel_y;
    int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;

    if (ids_dst) {
        col_low  = expert_bounds[zt + 0];
        col_high = expert_bounds[zt + 1];
        col_diff = col_high - col_low;

        offset_y   = 0;
        offset_dst = 0;

        if (jt*mmq_x >= col_diff) {
            return;
        }

        // The memory layout for the fixup buffer is always contiguous, therefore reset ids:
        __syncthreads();
#pragma unroll
        for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
            const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;

            if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) {
                break;
            }

            ids_dst_shared[j] = j;
        }
        __syncthreads();
    }

    offset_y   += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
    offset_dst += it*mmq_y;

    const int tile_x_max_i = nrows_x  - it*mmq_y - 1;
    const int tile_y_max_j = col_diff - jt*mmq_x - 1;

    const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;

    constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
    mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
        (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
         tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
}
2840
+
2841
+
2842
// Second pass of the stream-k scheme: adds the partial tile sums that earlier blocks of
// mul_mat_q stored in the fixup buffer (tmp_last_tile) onto the tile in dst that this block
// finished.
//
// Must be launched with the same gridDim.x as the stream-k mul_mat_q launch, because the
// k-block ranges of all blocks are recomputed here from blockIdx.x / gridDim.x.
// Thread layout: threadIdx.x indexes rows within a warp-sized strip, threadIdx.y indexes columns.
//
// ids_dst / expert_bounds: when ids_dst is non-null (MoE), the dst column of tile-local column j
// is ids_dst[expert_bounds[zt] + jt*mmq_x + j], the same mapping that mul_mat_q uses when writing.
template <ggml_type type, int mmq_x, int nwarps, bool need_check>
static __global__ void mul_mat_q_stream_k_fixup(
        const int32_t * ids_dst, const int32_t * expert_bounds, float * __restrict__ dst, const float * __restrict__ tmp_last_tile,
        const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_col_dst,
        const int nchannels_y, const int stride_channel_dst, const int nsamples_y, const int stride_sample_dst) {
    constexpr int     mmq_y           = get_mmq_y_device();
    constexpr int     qk              = ggml_cuda_type_traits<type>::qk;
    constexpr int     blocks_per_iter = MMQ_ITER_K / qk;
    const     int64_t blocks_per_ne00 = ncols_x / qk;

    float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f};

    const int ntx = (ncols_dst + mmq_x - 1) / mmq_x;
    const int nty = (nrows_x   + mmq_y - 1) / mmq_y;

    const int bidx0 = blockIdx.x;

    // kbc == k block continuous, current index in continuous ijk space.
    int64_t kbc0      = (int64_t) bidx0     *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
    int64_t kbc0_stop = (int64_t)(bidx0 + 1)*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;

    // Same rounding as in mul_mat_q so that both kernels agree on the block ranges.
    kbc0      -= (kbc0      % blocks_per_ne00) % blocks_per_iter;
    kbc0_stop -= (kbc0_stop % blocks_per_ne00) % blocks_per_iter;

    const bool did_not_have_any_data   = kbc0 == kbc0_stop;
    const bool wrote_beginning_of_tile = kbc0 % blocks_per_ne00 == 0;
    const bool did_not_write_last      = kbc0/blocks_per_ne00 == kbc0_stop/blocks_per_ne00 && kbc0_stop % blocks_per_ne00 != 0;
    if (did_not_have_any_data || wrote_beginning_of_tile || did_not_write_last) {
        return;
    }

    bool any_fixup = false;

    // Iterate over previous blocks and sum up partial sums written to fixup buffer.
    // All CUDA blocks that get here must have a previous block that needs a fixup.
    int64_t bidx = bidx0 - 1;
    int64_t kbc_stop = kbc0;
    while(true) {
        int64_t kbc = bidx*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
        kbc -= (kbc % blocks_per_ne00) % blocks_per_iter;

        if (kbc == kbc_stop) { // Did not have any data.
            bidx--;
            kbc_stop = kbc;
            continue;
        }

        any_fixup = true;

        // Accumulate the dense mmq_y x mmq_x slot of block bidx.
#pragma unroll
        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
            const int j = j0 + threadIdx.y;

#pragma unroll
            for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
                const int i = i0 + threadIdx.x;

                sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE] += tmp_last_tile[bidx*(mmq_x*mmq_y) + j*mmq_y + i];
            }
        }

        // If this block started in a previous tile we are done and don't need to combine additional partial results.
        if (kbc % blocks_per_ne00 == 0 || kbc/blocks_per_ne00 < kbc0/blocks_per_ne00) {
            break;
        }
        bidx--;
        kbc_stop = kbc;
    }

    if (!any_fixup) {
        return;
    }

    // Decompose kbc0 into the tile coordinates (it, wt, zt, jt), as in mul_mat_q.
    int tmp = kbc0;
    const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
    tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
    const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00);
    tmp -= wt * (nchannels_y*ntx*blocks_per_ne00);
    const int zt = tmp / (ntx*blocks_per_ne00);
    tmp -= zt * (ntx*blocks_per_ne00);
    const int jt = tmp / blocks_per_ne00;

    if (!ids_dst) {
        // Regular matrix multiplication: dst columns are addressed directly.
        const int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst + it*mmq_y;
        dst += offset_dst;

        const int i_max = nrows_x   - it*mmq_y - 1;
        const int j_max = ncols_dst - jt*mmq_x - 1;

#pragma unroll
        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
            const int j = j0 + threadIdx.y;

            // j only grows, so all later columns are out of range too.
            if (j > j_max) {
                return;
            }

#pragma unroll
            for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
                const int i = i0 + threadIdx.x;

                if (need_check && i > i_max) {
                    continue;
                }

                dst[j*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE];
            }
        }
        return;
    }

    // MoE: dst columns are indirected through ids_dst.
    __shared__ int ids_dst_shared[mmq_x];
    const int col_low  = expert_bounds[zt + 0];
    const int col_high = expert_bounds[zt + 1];
    const int col_diff = col_high - col_low;

    // Tile lies entirely beyond the columns of channel zt: mul_mat_q skipped it, so there is
    // nothing to fix up. The condition is uniform across the block, so returning before the
    // barrier below is safe, and it keeps the id load from running for an empty tile.
    if (jt*mmq_x >= col_diff) {
        return;
    }

    // The ids must be offset by the column tile index jt, exactly as in mul_mat_q;
    // without jt*mmq_x every tile with jt > 0 would be accumulated onto the columns of tile 0.
    for (int j = threadIdx.y*WARP_SIZE + threadIdx.x; j < mmq_x; j += nwarps*WARP_SIZE) {
        ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
    }
    __syncthreads();

    const int offset_dst = it*mmq_y;
    dst += offset_dst;

    const int i_max = nrows_x  - it*mmq_y - 1;
    const int j_max = col_diff - jt*mmq_x - 1;

#pragma unroll
    for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
        const int j = j0 + threadIdx.y;

        if (j > j_max) {
            return;
        }

#pragma unroll
        for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
            const int i = i0 + threadIdx.x;

            if (need_check && i > i_max) {
                continue;
            }

            dst[ids_dst_shared[j]*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE];
        }
    }
}
2989
+
2990
// Argument bundle consumed by mul_mat_q_case / launch_mul_mat_q.
// The field order is unchanged on purpose: callers may rely on positional aggregate initialization.
struct mmq_args {
    // Operand pointers; ids_dst and expert_bounds are forwarded to the kernels unchanged.
    const char    * x;
    ggml_type       type_x;
    const int     * y;
    const int32_t * ids_dst;
    const int32_t * expert_bounds;
    float         * dst;

    // Matrix extents and row/column strides. nrows_dst is passed to the kernels as stride_col_dst.
    int64_t ncols_x;
    int64_t nrows_x;
    int64_t ncols_dst;
    int64_t stride_row_x;
    int64_t ncols_y;
    int64_t nrows_dst;

    // Channel dimension.
    int64_t nchannels_x;
    int64_t nchannels_y;
    int64_t stride_channel_x;
    int64_t stride_channel_y;
    int64_t stride_channel_dst;

    // Sample dimension.
    int64_t nsamples_x;
    int64_t nsamples_y;
    int64_t stride_sample_x;
    int64_t stride_sample_y;
    int64_t stride_sample_dst;

    // Selects the stream-k launch path in launch_mul_mat_q.
    bool use_stream_k;
};
2997
+
2998
// Returns the dynamic shared memory size in bytes for one mul_mat_q block:
// mmq_x ints for the dst ids, the x tile (sized for the MMA or the dp4a path depending on cc),
// and the y tile padded to a multiple of MMQ_NWARPS*WARP_SIZE ints.
template<ggml_type type>
static size_t mmq_get_nbytes_shared(const int mmq_x, const int mmq_y, const int cc) {
    const tile_x_sizes dp4a_sizes = mmq_get_dp4a_tile_x_sizes(type, mmq_y);
    const int          mma_tile_k = mmq_get_mma_tile_x_k(type);

    const size_t bytes_ids = mmq_x*sizeof(int);

    size_t bytes_x;
    if (new_mma_available(cc)) {
        bytes_x = mmq_y*mma_tile_k*sizeof(int);
    } else {
        bytes_x = dp4a_sizes.qs*sizeof(int) + dp4a_sizes.dm*sizeof(half2) + dp4a_sizes.sc*sizeof(int);
    }

    const size_t bytes_y = mmq_x*sizeof(block_q8_1_mmq);

    return bytes_ids + bytes_x + GGML_PAD(bytes_y, MMQ_NWARPS*WARP_SIZE*sizeof(int));
}
3007
+
3008
// Launches mul_mat_q for a fixed tile width mmq_x on the current device.
//
// * Without stream-k: one block per tile, grid = (nty, ntx, nchannels_y*nsamples_y).
// * With stream-k: nsm blocks; if the tile count is not a multiple of nsm a fixup buffer is
//   allocated from the context pool and mul_mat_q_stream_k_fixup is launched afterwards with
//   the same grid.
// The need_check kernel variant is selected when nrows_x is not a multiple of mmq_y.
template <ggml_type type, int mmq_x>
static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
    const int id    = ggml_cuda_get_device();
    const int cc    = ggml_cuda_info().devices[id].cc;
    const int nsm   = ggml_cuda_info().devices[id].nsm;
    const int mmq_y = get_mmq_y_host(cc);

    const dim3 block_dims(WARP_SIZE, MMQ_NWARPS, 1);

    const int nbytes_shared = mmq_get_nbytes_shared<type>(mmq_x, mmq_y, cc);

#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
    // Raise the dynamic shared memory limit once per device for this <type, mmq_x> instantiation.
    static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
    if (!shared_memory_limit_raised[id]) {
        CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared));
        CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, true>,  cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared));
        shared_memory_limit_raised[id] = true;
    }
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)

    const int nty  = (args.nrows_x   + mmq_y - 1) / mmq_y;
    const int ntx  = (args.ncols_dst + mmq_x - 1) / mmq_x;
    const int ntzw = args.nchannels_y * args.nsamples_y;
    const dim3 block_nums_xy_tiling(nty, ntx, ntzw);

    GGML_ASSERT(args.nchannels_y % args.nchannels_x == 0);
    GGML_ASSERT(args.nsamples_y  % args.nsamples_x  == 0);
    const int channel_ratio = args.nchannels_y / args.nchannels_x;
    const int sample_ratio  = args.nsamples_y  / args.nsamples_x;

    // Rows divide evenly into tiles => the bounds-checking kernel variant is not needed.
    const bool rows_aligned = args.nrows_x % mmq_y == 0;

    if (!args.use_stream_k) {
        if (rows_aligned) {
            constexpr bool need_check = false;
            mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
                (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
                 args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
                 channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
                 sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
        } else {
            constexpr bool need_check = true;
            mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
                (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
                 args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
                 channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
                 sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
        }
        return;
    }

    const dim3 block_nums_stream_k(nsm, 1, 1);
    const bool fixup_needed = ntx*nty*ntzw % nsm != 0;

    // One dense mmq_x*mmq_y float slot per stream-k block, only when a fixup pass will run.
    ggml_cuda_pool & pool = ctx.pool(id);
    ggml_cuda_pool_alloc<float> tmp_fixup(pool);
    if (fixup_needed) {
        tmp_fixup.alloc(block_nums_stream_k.x * mmq_x*mmq_y);
    }

    if (rows_aligned) {
        constexpr bool need_check = false;

        mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
            (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
             args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
             channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
             sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);

        if (fixup_needed) {
            mul_mat_q_stream_k_fixup<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
                (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
                 args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst);
        }
    } else {
        constexpr bool need_check = true;

        mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
            (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
             args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
             channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
             sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);

        if (fixup_needed) {
            mul_mat_q_stream_k_fixup<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
                (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
                 args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst);
        }
    }
}
3100
+
3101
// Picks the tile width mmq_x for the current device and dispatches to the matching
// launch_mul_mat_q instantiation.
//
// Candidates are the multiples of 8 up to get_mmq_x_max_host(cc). A candidate is rejected if it
// is not a multiple of its granularity or if its shared memory need exceeds smpbo. Among the
// rest, the one giving the fewest column tiles over args.ncols_y wins; on ties the smaller width
// is kept (strict '<'). The scan stops early once a single tile suffices.
// If no candidate is accepted the function prints mmq_x_best and aborts.
template <ggml_type type>
void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
    const int    id    = ggml_cuda_get_device();
    const int    cc    = ggml_cuda_info().devices[id].cc;
    const size_t smpbo = ggml_cuda_info().devices[id].smpbo;

    const int mmq_x_max = get_mmq_x_max_host(cc);
    const int mmq_y     = get_mmq_y_host(cc);

    int mmq_x_best    = 0;
    int ntiles_x_best = INT_MAX;

    for (int mmq_x = 8; mmq_x <= mmq_x_max && ntiles_x_best > 1; mmq_x += 8) {
        const int granularity = mmq_get_granularity_host(mmq_x, cc);

        if (mmq_x % granularity != 0 || mmq_get_nbytes_shared<type>(mmq_x, mmq_y, cc) > smpbo) {
            continue;
        }

        const int ntiles_x = (args.ncols_y + mmq_x - 1) / mmq_x;

        if (ntiles_x < ntiles_x_best) {
            mmq_x_best    = mmq_x;
            ntiles_x_best = ntiles_x;
        }
    }

    // mmq_x is a template parameter of the launcher, so the runtime choice is mapped by hand.
    switch (mmq_x_best) {
        case   8: launch_mul_mat_q<type,   8>(ctx, args, stream); break;
        case  16: launch_mul_mat_q<type,  16>(ctx, args, stream); break;
        case  24: launch_mul_mat_q<type,  24>(ctx, args, stream); break;
        case  32: launch_mul_mat_q<type,  32>(ctx, args, stream); break;
        case  40: launch_mul_mat_q<type,  40>(ctx, args, stream); break;
        case  48: launch_mul_mat_q<type,  48>(ctx, args, stream); break;
        case  56: launch_mul_mat_q<type,  56>(ctx, args, stream); break;
        case  64: launch_mul_mat_q<type,  64>(ctx, args, stream); break;
        case  72: launch_mul_mat_q<type,  72>(ctx, args, stream); break;
        case  80: launch_mul_mat_q<type,  80>(ctx, args, stream); break;
        case  88: launch_mul_mat_q<type,  88>(ctx, args, stream); break;
        case  96: launch_mul_mat_q<type,  96>(ctx, args, stream); break;
        case 104: launch_mul_mat_q<type, 104>(ctx, args, stream); break;
        case 112: launch_mul_mat_q<type, 112>(ctx, args, stream); break;
        case 120: launch_mul_mat_q<type, 120>(ctx, args, stream); break;
        case 128: launch_mul_mat_q<type, 128>(ctx, args, stream); break;
        default:
            fprintf(stderr, "mmq_x_best=%d\n", mmq_x_best);
            GGML_ABORT("fatal error");
            break;
    }
}
3183
+
3184
// Spells out an explicit instantiation of mul_mat_q_case for one quantization type.
// The macro body ends with a line continuation, so the line right after it must stay empty.
#define DECL_MMQ_CASE(type) \
    template void mul_mat_q_case<type>(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) \

// With the `extern` prefix these are explicit instantiation declarations: they name the
// instantiations without defining them in a translation unit that only includes this header.
extern DECL_MMQ_CASE(GGML_TYPE_Q4_0);
extern DECL_MMQ_CASE(GGML_TYPE_Q4_1);
extern DECL_MMQ_CASE(GGML_TYPE_Q5_0);
extern DECL_MMQ_CASE(GGML_TYPE_Q5_1);
extern DECL_MMQ_CASE(GGML_TYPE_Q8_0);
extern DECL_MMQ_CASE(GGML_TYPE_Q2_K);
extern DECL_MMQ_CASE(GGML_TYPE_Q3_K);
extern DECL_MMQ_CASE(GGML_TYPE_Q4_K);
extern DECL_MMQ_CASE(GGML_TYPE_Q5_K);
extern DECL_MMQ_CASE(GGML_TYPE_Q6_K);
extern DECL_MMQ_CASE(GGML_TYPE_IQ2_XXS);
extern DECL_MMQ_CASE(GGML_TYPE_IQ2_XS);
extern DECL_MMQ_CASE(GGML_TYPE_IQ2_S);
extern DECL_MMQ_CASE(GGML_TYPE_IQ3_XXS);
extern DECL_MMQ_CASE(GGML_TYPE_IQ3_S);
extern DECL_MMQ_CASE(GGML_TYPE_IQ1_S);
extern DECL_MMQ_CASE(GGML_TYPE_IQ4_NL);
extern DECL_MMQ_CASE(GGML_TYPE_IQ4_XS);
3205
+
3206
+ // -------------------------------------------------------------------------------------------------------------------------
3207
+
3208
// Host-side entry points. Only the declarations appear here; the definitions are not part of this section.

// Quantized matrix multiplication of src0 and src1 into dst; `ids` is an additional tensor argument
// (NOTE(review): presumably the expert ids for the MoE path - confirm against the definition).
void ggml_cuda_mul_mat_q(
        ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);

// Variant taking raw device pointers plus a [row_low, row_high) row range and a stream.
void ggml_cuda_op_mul_mat_q(
        ggml_backend_cuda_context & ctx,
        const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
        const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
        const int64_t src1_padded_row_size, cudaStream_t stream);

// Predicate on quantization type, compute capability and ne11 deciding whether the MMQ path is used.
bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11);