toy 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2107) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +1124 -0
  3. data/LICENSE +21 -0
  4. data/Makefile +2022 -0
  5. data/README.md +154 -0
  6. data/bin/toy +10 -0
  7. data/lib/toy/compute.rb +135 -0
  8. data/lib/toy/compute_cuda.rb +104 -0
  9. data/lib/toy/compute_metal.rb +97 -0
  10. data/lib/toy/core/cli/describe.rb +188 -0
  11. data/lib/toy/core/cli/eval.rb +385 -0
  12. data/lib/toy/core/cli/exit_codes.rb +15 -0
  13. data/lib/toy/core/cli/fetch.rb +238 -0
  14. data/lib/toy/core/cli/infer.rb +268 -0
  15. data/lib/toy/core/cli/install.rb +228 -0
  16. data/lib/toy/core/cli/list.rb +86 -0
  17. data/lib/toy/core/cli/manifest.rb +49 -0
  18. data/lib/toy/core/cli/new.rb +594 -0
  19. data/lib/toy/core/cli/serve.rb +237 -0
  20. data/lib/toy/core/cli/train.rb +471 -0
  21. data/lib/toy/core/cli.rb +165 -0
  22. data/lib/toy/core/config.rb +64 -0
  23. data/lib/toy/core/gguf_meta.rb +161 -0
  24. data/lib/toy/core/model_scan.rb +221 -0
  25. data/lib/toy/core/run_log.rb +94 -0
  26. data/lib/toy/core/toy_root.rb +95 -0
  27. data/lib/toy/dev/toy_card.rb +299 -0
  28. data/lib/toy/dev/toy_describe_flow.rb +412 -0
  29. data/lib/toy/dev/toy_logprobs.rb +86 -0
  30. data/lib/toy/dev/toy_tap.rb +183 -0
  31. data/lib/toy/dev/toy_token_drift.rb +121 -0
  32. data/lib/toy/ffi/tinynn.rb +1491 -0
  33. data/lib/toy/ffi/tinynn_cuda.rb +1124 -0
  34. data/lib/toy/ffi/tinynn_metal.rb +359 -0
  35. data/lib/toy/ffi_manifest.rb +84 -0
  36. data/lib/toy/io/bpe.rb +325 -0
  37. data/lib/toy/io/gguf_kv.rb +35 -0
  38. data/lib/toy/io/gguf_load.rb +331 -0
  39. data/lib/toy/io/loaders/toy_gpt2_loader.rb +70 -0
  40. data/lib/toy/io/loaders/toy_smollm2_loader.rb +754 -0
  41. data/lib/toy/io/model_index.rb +206 -0
  42. data/lib/toy/io/run_bundle.rb +280 -0
  43. data/lib/toy/io/tokenizer.rb +613 -0
  44. data/lib/toy/io/toy_corpus_loader.rb +52 -0
  45. data/lib/toy/io/toy_events.rb +56 -0
  46. data/lib/toy/io/toy_image_loader.rb +48 -0
  47. data/lib/toy/llm/adamw.rb +169 -0
  48. data/lib/toy/llm/archs/llama_arch.rb +233 -0
  49. data/lib/toy/llm/archs/llama_arch_cuda.rb +237 -0
  50. data/lib/toy/llm/archs/llama_arch_metal.rb +237 -0
  51. data/lib/toy/llm/blocks/transformer_block.rb +876 -0
  52. data/lib/toy/llm/blocks/transformer_block_cuda.rb +880 -0
  53. data/lib/toy/llm/blocks/transformer_block_metal.rb +880 -0
  54. data/lib/toy/llm/classify_batch.rb +88 -0
  55. data/lib/toy/llm/engine/gpt2_fwd_engine.rb +360 -0
  56. data/lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb +362 -0
  57. data/lib/toy/llm/engine/gpt2_fwd_engine_metal.rb +362 -0
  58. data/lib/toy/llm/engine/gpt2_kv_engine.rb +346 -0
  59. data/lib/toy/llm/engine/gpt2_kv_engine_cuda.rb +348 -0
  60. data/lib/toy/llm/engine/gpt2_kv_engine_metal.rb +348 -0
  61. data/lib/toy/llm/engine/gpt2_seq_engine.rb +289 -0
  62. data/lib/toy/llm/engine/gpt2_seq_engine_cuda.rb +293 -0
  63. data/lib/toy/llm/engine/gpt2_seq_engine_metal.rb +293 -0
  64. data/lib/toy/llm/engine/llama_kv_engine.rb +1593 -0
  65. data/lib/toy/llm/engine/llama_kv_engine_cuda.rb +1526 -0
  66. data/lib/toy/llm/engine/llama_kv_engine_metal.rb +1526 -0
  67. data/lib/toy/llm/engine/llama_seq_engine.rb +1233 -0
  68. data/lib/toy/llm/engine/llama_seq_engine_cuda.rb +1238 -0
  69. data/lib/toy/llm/engine/llama_seq_engine_metal.rb +1238 -0
  70. data/lib/toy/llm/engine/vit_tiny_engine.rb +467 -0
  71. data/lib/toy/llm/labels.rb +142 -0
  72. data/lib/toy/llm/primitives/gqa.rb +62 -0
  73. data/lib/toy/llm/primitives/gqa_cuda.rb +66 -0
  74. data/lib/toy/llm/primitives/gqa_metal.rb +66 -0
  75. data/lib/toy/llm/primitives/rms_norm.rb +39 -0
  76. data/lib/toy/llm/primitives/rms_norm_cuda.rb +43 -0
  77. data/lib/toy/llm/primitives/rms_norm_metal.rb +43 -0
  78. data/lib/toy/llm/primitives/rope.rb +68 -0
  79. data/lib/toy/llm/primitives/rope_cuda.rb +72 -0
  80. data/lib/toy/llm/primitives/rope_metal.rb +72 -0
  81. data/lib/toy/llm/primitives/swiglu.rb +41 -0
  82. data/lib/toy/llm/primitives/swiglu_cuda.rb +45 -0
  83. data/lib/toy/llm/primitives/swiglu_metal.rb +45 -0
  84. data/lib/toy/llm/recipe_options.rb +71 -0
  85. data/lib/toy/llm/recipes/from_scratch.rb +105 -0
  86. data/lib/toy/llm/recipes/from_scratch_cuda.rb +109 -0
  87. data/lib/toy/llm/recipes/from_scratch_metal.rb +109 -0
  88. data/lib/toy/llm/recipes/lora.rb +110 -0
  89. data/lib/toy/llm/recipes/lora_cuda.rb +114 -0
  90. data/lib/toy/llm/recipes/lora_metal.rb +114 -0
  91. data/lib/toy/llm/recipes/vit_tiny.rb +75 -0
  92. data/lib/toy/llm/recipes/warm_start.rb +235 -0
  93. data/lib/toy/llm/recipes/warm_start_cuda.rb +239 -0
  94. data/lib/toy/llm/recipes/warm_start_metal.rb +239 -0
  95. data/lib/toy/llm/training_batch.rb +133 -0
  96. data/lib/toy/models/arch.rb +253 -0
  97. data/lib/toy/models/gpt2.rb +311 -0
  98. data/lib/toy/models/toy_gpt2.rb +177 -0
  99. data/lib/toy/models/toy_smollm2.rb +393 -0
  100. data/lib/toy/models/toy_vit.rb +83 -0
  101. data/lib/toy/models/transformer.rb +1494 -0
  102. data/lib/toy/models/transformer_lm.rb +298 -0
  103. data/lib/toy/models/transformer_lm_cuda.rb +159 -0
  104. data/lib/toy/models/transformer_lm_metal.rb +142 -0
  105. data/lib/toy/mri.rb +300 -0
  106. data/lib/toy/run/eval.rb +76 -0
  107. data/lib/toy/run/eval_cuda.rb +66 -0
  108. data/lib/toy/run/eval_lmc.rb +334 -0
  109. data/lib/toy/run/eval_metal.rb +67 -0
  110. data/lib/toy/run/infer.rb +130 -0
  111. data/lib/toy/run/infer_cuda.rb +118 -0
  112. data/lib/toy/run/infer_metal.rb +119 -0
  113. data/lib/toy/run/infer_trace.rb +37 -0
  114. data/lib/toy/run/serve.rb +144 -0
  115. data/lib/toy/run/train.rb +404 -0
  116. data/lib/toy/run/train_cuda.rb +397 -0
  117. data/lib/toy/run/train_gpt2.rb +103 -0
  118. data/lib/toy/run/train_gpt2_cuda.rb +85 -0
  119. data/lib/toy/run/train_gpt2_metal.rb +85 -0
  120. data/lib/toy/run/train_lora.rb +207 -0
  121. data/lib/toy/run/train_lora_cuda.rb +219 -0
  122. data/lib/toy/run/train_metal.rb +227 -0
  123. data/lib/toy/run/train_vit.rb +251 -0
  124. data/lib/toy/serve/openai/embeddings_handler.rb +92 -0
  125. data/lib/toy/serve/openai/handlers.rb +143 -0
  126. data/lib/toy/serve/openai/server.rb +159 -0
  127. data/lib/toy/train/sampler.rb +314 -0
  128. data/lib/toy/train/toy_chat_template.rb +179 -0
  129. data/lib/toy/train/toy_drift_grad.rb +176 -0
  130. data/lib/toy/train/toy_gguf_fuse.rb +428 -0
  131. data/lib/toy/train/toy_gguf_writer.rb +100 -0
  132. data/lib/toy/train/toy_lr_schedule.rb +39 -0
  133. data/lib/toy/train/toy_sample.rb +125 -0
  134. data/lib/toy/train/toy_trainer.rb +86 -0
  135. data/lib/toy/train/training.rb +160 -0
  136. data/lib/toy/version.rb +11 -0
  137. data/lib/toy.rb +902 -0
  138. data/prep/progress +118 -0
  139. data/prep/quietly +64 -0
  140. data/sig/toy.rbs +397 -0
  141. data/sig/toy_compute.rbs +450 -0
  142. data/spinel-ext.json +122 -0
  143. data/tinynn/Makefile +71 -0
  144. data/tinynn/tinynn_backend_cuda.c +99 -0
  145. data/tinynn/tinynn_backend_metal.m +75 -0
  146. data/tinynn/tinynn_events.c +122 -0
  147. data/tinynn/tinynn_events.h +83 -0
  148. data/tinynn/tinynn_ggml.c +2460 -0
  149. data/tinynn/tinynn_ggml.h +545 -0
  150. data/tinynn/tinynn_gguf.c +783 -0
  151. data/tinynn/tinynn_gguf.h +167 -0
  152. data/tinynn/tinynn_trace.c +180 -0
  153. data/tinynn/tinynn_trace.h +85 -0
  154. data/vendor/ggml/AUTHORS +335 -0
  155. data/vendor/ggml/CMakeLists.txt +505 -0
  156. data/vendor/ggml/CONTRIBUTING.md +3 -0
  157. data/vendor/ggml/LICENSE +21 -0
  158. data/vendor/ggml/README.md +50 -0
  159. data/vendor/ggml/ci/run.sh +395 -0
  160. data/vendor/ggml/cmake/FindNCCL.cmake +36 -0
  161. data/vendor/ggml/cmake/GitVars.cmake +22 -0
  162. data/vendor/ggml/cmake/common.cmake +50 -0
  163. data/vendor/ggml/cmake/ggml-config.cmake.in +191 -0
  164. data/vendor/ggml/docs/gguf.md +828 -0
  165. data/vendor/ggml/examples/CMakeLists.txt +34 -0
  166. data/vendor/ggml/examples/common-ggml.cpp +244 -0
  167. data/vendor/ggml/examples/common-ggml.h +18 -0
  168. data/vendor/ggml/examples/common.cpp +675 -0
  169. data/vendor/ggml/examples/common.h +322 -0
  170. data/vendor/ggml/examples/gpt-2/CMakeLists.txt +32 -0
  171. data/vendor/ggml/examples/gpt-2/README.md +225 -0
  172. data/vendor/ggml/examples/gpt-2/convert-cerebras-to-ggml.py +183 -0
  173. data/vendor/ggml/examples/gpt-2/convert-ckpt-to-ggml.py +159 -0
  174. data/vendor/ggml/examples/gpt-2/convert-h5-to-ggml.py +195 -0
  175. data/vendor/ggml/examples/gpt-2/download-ggml-model.sh +69 -0
  176. data/vendor/ggml/examples/gpt-2/download-model.sh +48 -0
  177. data/vendor/ggml/examples/gpt-2/main-alloc.cpp +880 -0
  178. data/vendor/ggml/examples/gpt-2/main-backend.cpp +946 -0
  179. data/vendor/ggml/examples/gpt-2/main-batched.cpp +1210 -0
  180. data/vendor/ggml/examples/gpt-2/main-ctx.cpp +840 -0
  181. data/vendor/ggml/examples/gpt-2/main-sched.cpp +1079 -0
  182. data/vendor/ggml/examples/gpt-2/quantize.cpp +184 -0
  183. data/vendor/ggml/examples/gpt-j/CMakeLists.txt +13 -0
  184. data/vendor/ggml/examples/gpt-j/README.md +239 -0
  185. data/vendor/ggml/examples/gpt-j/convert-h5-to-ggml.py +173 -0
  186. data/vendor/ggml/examples/gpt-j/download-ggml-model.sh +69 -0
  187. data/vendor/ggml/examples/gpt-j/download-model.sh +11 -0
  188. data/vendor/ggml/examples/gpt-j/main.cpp +755 -0
  189. data/vendor/ggml/examples/gpt-j/quantize.cpp +182 -0
  190. data/vendor/ggml/examples/magika/CMakeLists.txt +17 -0
  191. data/vendor/ggml/examples/magika/README.md +23 -0
  192. data/vendor/ggml/examples/magika/convert.py +32 -0
  193. data/vendor/ggml/examples/magika/main.cpp +374 -0
  194. data/vendor/ggml/examples/mnist/CMakeLists.txt +58 -0
  195. data/vendor/ggml/examples/mnist/README.md +206 -0
  196. data/vendor/ggml/examples/mnist/mnist-common.cpp +496 -0
  197. data/vendor/ggml/examples/mnist/mnist-common.h +166 -0
  198. data/vendor/ggml/examples/mnist/mnist-eval.cpp +67 -0
  199. data/vendor/ggml/examples/mnist/mnist-train-cnn.py +91 -0
  200. data/vendor/ggml/examples/mnist/mnist-train-fc.py +131 -0
  201. data/vendor/ggml/examples/mnist/mnist-train.cpp +39 -0
  202. data/vendor/ggml/examples/mnist/server.py +36 -0
  203. data/vendor/ggml/examples/mnist/web/index.html +178 -0
  204. data/vendor/ggml/examples/perf-metal/CMakeLists.txt +7 -0
  205. data/vendor/ggml/examples/perf-metal/perf-metal.cpp +152 -0
  206. data/vendor/ggml/examples/prompts/dolly-v2.txt +100 -0
  207. data/vendor/ggml/examples/prompts/gpt-2-chinese.txt +1 -0
  208. data/vendor/ggml/examples/prompts/gpt-2.txt +100 -0
  209. data/vendor/ggml/examples/prompts/gpt-j.txt +100 -0
  210. data/vendor/ggml/examples/prompts/gpt-neox-japanese.txt +1 -0
  211. data/vendor/ggml/examples/prompts/gpt-neox.txt +100 -0
  212. data/vendor/ggml/examples/prompts/polyglot-ko.txt +3 -0
  213. data/vendor/ggml/examples/prompts/replit.txt +100 -0
  214. data/vendor/ggml/examples/prompts/starcoder.txt +100 -0
  215. data/vendor/ggml/examples/prompts/test-cases.txt +110 -0
  216. data/vendor/ggml/examples/prompts/tokenize_huggingface.py +65 -0
  217. data/vendor/ggml/examples/prompts/whisper.txt +100 -0
  218. data/vendor/ggml/examples/python/README.md +115 -0
  219. data/vendor/ggml/examples/python/api.h +14 -0
  220. data/vendor/ggml/examples/python/example_add_quant.py +25 -0
  221. data/vendor/ggml/examples/python/example_test_all_quants.py +68 -0
  222. data/vendor/ggml/examples/python/ggml/__init__.py +58 -0
  223. data/vendor/ggml/examples/python/ggml/__init__.pyi +2406 -0
  224. data/vendor/ggml/examples/python/ggml/cffi.py +11 -0
  225. data/vendor/ggml/examples/python/ggml/ffi/__init__.pyi +7 -0
  226. data/vendor/ggml/examples/python/ggml/utils.py +182 -0
  227. data/vendor/ggml/examples/python/regenerate.py +42 -0
  228. data/vendor/ggml/examples/python/stubs.py +128 -0
  229. data/vendor/ggml/examples/python/test_tensor.py +258 -0
  230. data/vendor/ggml/examples/sam/CMakeLists.txt +13 -0
  231. data/vendor/ggml/examples/sam/README.md +95 -0
  232. data/vendor/ggml/examples/sam/convert-pth-to-ggml.py +147 -0
  233. data/vendor/ggml/examples/sam/example.jpg +0 -0
  234. data/vendor/ggml/examples/sam/sam.cpp +2370 -0
  235. data/vendor/ggml/examples/simple/CMakeLists.txt +21 -0
  236. data/vendor/ggml/examples/simple/README.md +61 -0
  237. data/vendor/ggml/examples/simple/simple-backend.cpp +153 -0
  238. data/vendor/ggml/examples/simple/simple-ctx.cpp +127 -0
  239. data/vendor/ggml/examples/stb_image.h +7987 -0
  240. data/vendor/ggml/examples/stb_image_write.h +1724 -0
  241. data/vendor/ggml/examples/test-cmake/CMakeLists.txt +10 -0
  242. data/vendor/ggml/examples/test-cmake/README.md +3 -0
  243. data/vendor/ggml/examples/test-cmake/test-cmake.cpp +6 -0
  244. data/vendor/ggml/examples/yolo/CMakeLists.txt +6 -0
  245. data/vendor/ggml/examples/yolo/README.md +59 -0
  246. data/vendor/ggml/examples/yolo/convert-yolov3-tiny.py +53 -0
  247. data/vendor/ggml/examples/yolo/data/coco.names +80 -0
  248. data/vendor/ggml/examples/yolo/data/labels/100_0.png +0 -0
  249. data/vendor/ggml/examples/yolo/data/labels/100_1.png +0 -0
  250. data/vendor/ggml/examples/yolo/data/labels/100_2.png +0 -0
  251. data/vendor/ggml/examples/yolo/data/labels/100_3.png +0 -0
  252. data/vendor/ggml/examples/yolo/data/labels/100_4.png +0 -0
  253. data/vendor/ggml/examples/yolo/data/labels/100_5.png +0 -0
  254. data/vendor/ggml/examples/yolo/data/labels/100_6.png +0 -0
  255. data/vendor/ggml/examples/yolo/data/labels/100_7.png +0 -0
  256. data/vendor/ggml/examples/yolo/data/labels/101_0.png +0 -0
  257. data/vendor/ggml/examples/yolo/data/labels/101_1.png +0 -0
  258. data/vendor/ggml/examples/yolo/data/labels/101_2.png +0 -0
  259. data/vendor/ggml/examples/yolo/data/labels/101_3.png +0 -0
  260. data/vendor/ggml/examples/yolo/data/labels/101_4.png +0 -0
  261. data/vendor/ggml/examples/yolo/data/labels/101_5.png +0 -0
  262. data/vendor/ggml/examples/yolo/data/labels/101_6.png +0 -0
  263. data/vendor/ggml/examples/yolo/data/labels/101_7.png +0 -0
  264. data/vendor/ggml/examples/yolo/data/labels/102_0.png +0 -0
  265. data/vendor/ggml/examples/yolo/data/labels/102_1.png +0 -0
  266. data/vendor/ggml/examples/yolo/data/labels/102_2.png +0 -0
  267. data/vendor/ggml/examples/yolo/data/labels/102_3.png +0 -0
  268. data/vendor/ggml/examples/yolo/data/labels/102_4.png +0 -0
  269. data/vendor/ggml/examples/yolo/data/labels/102_5.png +0 -0
  270. data/vendor/ggml/examples/yolo/data/labels/102_6.png +0 -0
  271. data/vendor/ggml/examples/yolo/data/labels/102_7.png +0 -0
  272. data/vendor/ggml/examples/yolo/data/labels/103_0.png +0 -0
  273. data/vendor/ggml/examples/yolo/data/labels/103_1.png +0 -0
  274. data/vendor/ggml/examples/yolo/data/labels/103_2.png +0 -0
  275. data/vendor/ggml/examples/yolo/data/labels/103_3.png +0 -0
  276. data/vendor/ggml/examples/yolo/data/labels/103_4.png +0 -0
  277. data/vendor/ggml/examples/yolo/data/labels/103_5.png +0 -0
  278. data/vendor/ggml/examples/yolo/data/labels/103_6.png +0 -0
  279. data/vendor/ggml/examples/yolo/data/labels/103_7.png +0 -0
  280. data/vendor/ggml/examples/yolo/data/labels/104_0.png +0 -0
  281. data/vendor/ggml/examples/yolo/data/labels/104_1.png +0 -0
  282. data/vendor/ggml/examples/yolo/data/labels/104_2.png +0 -0
  283. data/vendor/ggml/examples/yolo/data/labels/104_3.png +0 -0
  284. data/vendor/ggml/examples/yolo/data/labels/104_4.png +0 -0
  285. data/vendor/ggml/examples/yolo/data/labels/104_5.png +0 -0
  286. data/vendor/ggml/examples/yolo/data/labels/104_6.png +0 -0
  287. data/vendor/ggml/examples/yolo/data/labels/104_7.png +0 -0
  288. data/vendor/ggml/examples/yolo/data/labels/105_0.png +0 -0
  289. data/vendor/ggml/examples/yolo/data/labels/105_1.png +0 -0
  290. data/vendor/ggml/examples/yolo/data/labels/105_2.png +0 -0
  291. data/vendor/ggml/examples/yolo/data/labels/105_3.png +0 -0
  292. data/vendor/ggml/examples/yolo/data/labels/105_4.png +0 -0
  293. data/vendor/ggml/examples/yolo/data/labels/105_5.png +0 -0
  294. data/vendor/ggml/examples/yolo/data/labels/105_6.png +0 -0
  295. data/vendor/ggml/examples/yolo/data/labels/105_7.png +0 -0
  296. data/vendor/ggml/examples/yolo/data/labels/106_0.png +0 -0
  297. data/vendor/ggml/examples/yolo/data/labels/106_1.png +0 -0
  298. data/vendor/ggml/examples/yolo/data/labels/106_2.png +0 -0
  299. data/vendor/ggml/examples/yolo/data/labels/106_3.png +0 -0
  300. data/vendor/ggml/examples/yolo/data/labels/106_4.png +0 -0
  301. data/vendor/ggml/examples/yolo/data/labels/106_5.png +0 -0
  302. data/vendor/ggml/examples/yolo/data/labels/106_6.png +0 -0
  303. data/vendor/ggml/examples/yolo/data/labels/106_7.png +0 -0
  304. data/vendor/ggml/examples/yolo/data/labels/107_0.png +0 -0
  305. data/vendor/ggml/examples/yolo/data/labels/107_1.png +0 -0
  306. data/vendor/ggml/examples/yolo/data/labels/107_2.png +0 -0
  307. data/vendor/ggml/examples/yolo/data/labels/107_3.png +0 -0
  308. data/vendor/ggml/examples/yolo/data/labels/107_4.png +0 -0
  309. data/vendor/ggml/examples/yolo/data/labels/107_5.png +0 -0
  310. data/vendor/ggml/examples/yolo/data/labels/107_6.png +0 -0
  311. data/vendor/ggml/examples/yolo/data/labels/107_7.png +0 -0
  312. data/vendor/ggml/examples/yolo/data/labels/108_0.png +0 -0
  313. data/vendor/ggml/examples/yolo/data/labels/108_1.png +0 -0
  314. data/vendor/ggml/examples/yolo/data/labels/108_2.png +0 -0
  315. data/vendor/ggml/examples/yolo/data/labels/108_3.png +0 -0
  316. data/vendor/ggml/examples/yolo/data/labels/108_4.png +0 -0
  317. data/vendor/ggml/examples/yolo/data/labels/108_5.png +0 -0
  318. data/vendor/ggml/examples/yolo/data/labels/108_6.png +0 -0
  319. data/vendor/ggml/examples/yolo/data/labels/108_7.png +0 -0
  320. data/vendor/ggml/examples/yolo/data/labels/109_0.png +0 -0
  321. data/vendor/ggml/examples/yolo/data/labels/109_1.png +0 -0
  322. data/vendor/ggml/examples/yolo/data/labels/109_2.png +0 -0
  323. data/vendor/ggml/examples/yolo/data/labels/109_3.png +0 -0
  324. data/vendor/ggml/examples/yolo/data/labels/109_4.png +0 -0
  325. data/vendor/ggml/examples/yolo/data/labels/109_5.png +0 -0
  326. data/vendor/ggml/examples/yolo/data/labels/109_6.png +0 -0
  327. data/vendor/ggml/examples/yolo/data/labels/109_7.png +0 -0
  328. data/vendor/ggml/examples/yolo/data/labels/110_0.png +0 -0
  329. data/vendor/ggml/examples/yolo/data/labels/110_1.png +0 -0
  330. data/vendor/ggml/examples/yolo/data/labels/110_2.png +0 -0
  331. data/vendor/ggml/examples/yolo/data/labels/110_3.png +0 -0
  332. data/vendor/ggml/examples/yolo/data/labels/110_4.png +0 -0
  333. data/vendor/ggml/examples/yolo/data/labels/110_5.png +0 -0
  334. data/vendor/ggml/examples/yolo/data/labels/110_6.png +0 -0
  335. data/vendor/ggml/examples/yolo/data/labels/110_7.png +0 -0
  336. data/vendor/ggml/examples/yolo/data/labels/111_0.png +0 -0
  337. data/vendor/ggml/examples/yolo/data/labels/111_1.png +0 -0
  338. data/vendor/ggml/examples/yolo/data/labels/111_2.png +0 -0
  339. data/vendor/ggml/examples/yolo/data/labels/111_3.png +0 -0
  340. data/vendor/ggml/examples/yolo/data/labels/111_4.png +0 -0
  341. data/vendor/ggml/examples/yolo/data/labels/111_5.png +0 -0
  342. data/vendor/ggml/examples/yolo/data/labels/111_6.png +0 -0
  343. data/vendor/ggml/examples/yolo/data/labels/111_7.png +0 -0
  344. data/vendor/ggml/examples/yolo/data/labels/112_0.png +0 -0
  345. data/vendor/ggml/examples/yolo/data/labels/112_1.png +0 -0
  346. data/vendor/ggml/examples/yolo/data/labels/112_2.png +0 -0
  347. data/vendor/ggml/examples/yolo/data/labels/112_3.png +0 -0
  348. data/vendor/ggml/examples/yolo/data/labels/112_4.png +0 -0
  349. data/vendor/ggml/examples/yolo/data/labels/112_5.png +0 -0
  350. data/vendor/ggml/examples/yolo/data/labels/112_6.png +0 -0
  351. data/vendor/ggml/examples/yolo/data/labels/112_7.png +0 -0
  352. data/vendor/ggml/examples/yolo/data/labels/113_0.png +0 -0
  353. data/vendor/ggml/examples/yolo/data/labels/113_1.png +0 -0
  354. data/vendor/ggml/examples/yolo/data/labels/113_2.png +0 -0
  355. data/vendor/ggml/examples/yolo/data/labels/113_3.png +0 -0
  356. data/vendor/ggml/examples/yolo/data/labels/113_4.png +0 -0
  357. data/vendor/ggml/examples/yolo/data/labels/113_5.png +0 -0
  358. data/vendor/ggml/examples/yolo/data/labels/113_6.png +0 -0
  359. data/vendor/ggml/examples/yolo/data/labels/113_7.png +0 -0
  360. data/vendor/ggml/examples/yolo/data/labels/114_0.png +0 -0
  361. data/vendor/ggml/examples/yolo/data/labels/114_1.png +0 -0
  362. data/vendor/ggml/examples/yolo/data/labels/114_2.png +0 -0
  363. data/vendor/ggml/examples/yolo/data/labels/114_3.png +0 -0
  364. data/vendor/ggml/examples/yolo/data/labels/114_4.png +0 -0
  365. data/vendor/ggml/examples/yolo/data/labels/114_5.png +0 -0
  366. data/vendor/ggml/examples/yolo/data/labels/114_6.png +0 -0
  367. data/vendor/ggml/examples/yolo/data/labels/114_7.png +0 -0
  368. data/vendor/ggml/examples/yolo/data/labels/115_0.png +0 -0
  369. data/vendor/ggml/examples/yolo/data/labels/115_1.png +0 -0
  370. data/vendor/ggml/examples/yolo/data/labels/115_2.png +0 -0
  371. data/vendor/ggml/examples/yolo/data/labels/115_3.png +0 -0
  372. data/vendor/ggml/examples/yolo/data/labels/115_4.png +0 -0
  373. data/vendor/ggml/examples/yolo/data/labels/115_5.png +0 -0
  374. data/vendor/ggml/examples/yolo/data/labels/115_6.png +0 -0
  375. data/vendor/ggml/examples/yolo/data/labels/115_7.png +0 -0
  376. data/vendor/ggml/examples/yolo/data/labels/116_0.png +0 -0
  377. data/vendor/ggml/examples/yolo/data/labels/116_1.png +0 -0
  378. data/vendor/ggml/examples/yolo/data/labels/116_2.png +0 -0
  379. data/vendor/ggml/examples/yolo/data/labels/116_3.png +0 -0
  380. data/vendor/ggml/examples/yolo/data/labels/116_4.png +0 -0
  381. data/vendor/ggml/examples/yolo/data/labels/116_5.png +0 -0
  382. data/vendor/ggml/examples/yolo/data/labels/116_6.png +0 -0
  383. data/vendor/ggml/examples/yolo/data/labels/116_7.png +0 -0
  384. data/vendor/ggml/examples/yolo/data/labels/117_0.png +0 -0
  385. data/vendor/ggml/examples/yolo/data/labels/117_1.png +0 -0
  386. data/vendor/ggml/examples/yolo/data/labels/117_2.png +0 -0
  387. data/vendor/ggml/examples/yolo/data/labels/117_3.png +0 -0
  388. data/vendor/ggml/examples/yolo/data/labels/117_4.png +0 -0
  389. data/vendor/ggml/examples/yolo/data/labels/117_5.png +0 -0
  390. data/vendor/ggml/examples/yolo/data/labels/117_6.png +0 -0
  391. data/vendor/ggml/examples/yolo/data/labels/117_7.png +0 -0
  392. data/vendor/ggml/examples/yolo/data/labels/118_0.png +0 -0
  393. data/vendor/ggml/examples/yolo/data/labels/118_1.png +0 -0
  394. data/vendor/ggml/examples/yolo/data/labels/118_2.png +0 -0
  395. data/vendor/ggml/examples/yolo/data/labels/118_3.png +0 -0
  396. data/vendor/ggml/examples/yolo/data/labels/118_4.png +0 -0
  397. data/vendor/ggml/examples/yolo/data/labels/118_5.png +0 -0
  398. data/vendor/ggml/examples/yolo/data/labels/118_6.png +0 -0
  399. data/vendor/ggml/examples/yolo/data/labels/118_7.png +0 -0
  400. data/vendor/ggml/examples/yolo/data/labels/119_0.png +0 -0
  401. data/vendor/ggml/examples/yolo/data/labels/119_1.png +0 -0
  402. data/vendor/ggml/examples/yolo/data/labels/119_2.png +0 -0
  403. data/vendor/ggml/examples/yolo/data/labels/119_3.png +0 -0
  404. data/vendor/ggml/examples/yolo/data/labels/119_4.png +0 -0
  405. data/vendor/ggml/examples/yolo/data/labels/119_5.png +0 -0
  406. data/vendor/ggml/examples/yolo/data/labels/119_6.png +0 -0
  407. data/vendor/ggml/examples/yolo/data/labels/119_7.png +0 -0
  408. data/vendor/ggml/examples/yolo/data/labels/120_0.png +0 -0
  409. data/vendor/ggml/examples/yolo/data/labels/120_1.png +0 -0
  410. data/vendor/ggml/examples/yolo/data/labels/120_2.png +0 -0
  411. data/vendor/ggml/examples/yolo/data/labels/120_3.png +0 -0
  412. data/vendor/ggml/examples/yolo/data/labels/120_4.png +0 -0
  413. data/vendor/ggml/examples/yolo/data/labels/120_5.png +0 -0
  414. data/vendor/ggml/examples/yolo/data/labels/120_6.png +0 -0
  415. data/vendor/ggml/examples/yolo/data/labels/120_7.png +0 -0
  416. data/vendor/ggml/examples/yolo/data/labels/121_0.png +0 -0
  417. data/vendor/ggml/examples/yolo/data/labels/121_1.png +0 -0
  418. data/vendor/ggml/examples/yolo/data/labels/121_2.png +0 -0
  419. data/vendor/ggml/examples/yolo/data/labels/121_3.png +0 -0
  420. data/vendor/ggml/examples/yolo/data/labels/121_4.png +0 -0
  421. data/vendor/ggml/examples/yolo/data/labels/121_5.png +0 -0
  422. data/vendor/ggml/examples/yolo/data/labels/121_6.png +0 -0
  423. data/vendor/ggml/examples/yolo/data/labels/121_7.png +0 -0
  424. data/vendor/ggml/examples/yolo/data/labels/122_0.png +0 -0
  425. data/vendor/ggml/examples/yolo/data/labels/122_1.png +0 -0
  426. data/vendor/ggml/examples/yolo/data/labels/122_2.png +0 -0
  427. data/vendor/ggml/examples/yolo/data/labels/122_3.png +0 -0
  428. data/vendor/ggml/examples/yolo/data/labels/122_4.png +0 -0
  429. data/vendor/ggml/examples/yolo/data/labels/122_5.png +0 -0
  430. data/vendor/ggml/examples/yolo/data/labels/122_6.png +0 -0
  431. data/vendor/ggml/examples/yolo/data/labels/122_7.png +0 -0
  432. data/vendor/ggml/examples/yolo/data/labels/123_0.png +0 -0
  433. data/vendor/ggml/examples/yolo/data/labels/123_1.png +0 -0
  434. data/vendor/ggml/examples/yolo/data/labels/123_2.png +0 -0
  435. data/vendor/ggml/examples/yolo/data/labels/123_3.png +0 -0
  436. data/vendor/ggml/examples/yolo/data/labels/123_4.png +0 -0
  437. data/vendor/ggml/examples/yolo/data/labels/123_5.png +0 -0
  438. data/vendor/ggml/examples/yolo/data/labels/123_6.png +0 -0
  439. data/vendor/ggml/examples/yolo/data/labels/123_7.png +0 -0
  440. data/vendor/ggml/examples/yolo/data/labels/124_0.png +0 -0
  441. data/vendor/ggml/examples/yolo/data/labels/124_1.png +0 -0
  442. data/vendor/ggml/examples/yolo/data/labels/124_2.png +0 -0
  443. data/vendor/ggml/examples/yolo/data/labels/124_3.png +0 -0
  444. data/vendor/ggml/examples/yolo/data/labels/124_4.png +0 -0
  445. data/vendor/ggml/examples/yolo/data/labels/124_5.png +0 -0
  446. data/vendor/ggml/examples/yolo/data/labels/124_6.png +0 -0
  447. data/vendor/ggml/examples/yolo/data/labels/124_7.png +0 -0
  448. data/vendor/ggml/examples/yolo/data/labels/125_0.png +0 -0
  449. data/vendor/ggml/examples/yolo/data/labels/125_1.png +0 -0
  450. data/vendor/ggml/examples/yolo/data/labels/125_2.png +0 -0
  451. data/vendor/ggml/examples/yolo/data/labels/125_3.png +0 -0
  452. data/vendor/ggml/examples/yolo/data/labels/125_4.png +0 -0
  453. data/vendor/ggml/examples/yolo/data/labels/125_5.png +0 -0
  454. data/vendor/ggml/examples/yolo/data/labels/125_6.png +0 -0
  455. data/vendor/ggml/examples/yolo/data/labels/125_7.png +0 -0
  456. data/vendor/ggml/examples/yolo/data/labels/126_0.png +0 -0
  457. data/vendor/ggml/examples/yolo/data/labels/126_1.png +0 -0
  458. data/vendor/ggml/examples/yolo/data/labels/126_2.png +0 -0
  459. data/vendor/ggml/examples/yolo/data/labels/126_3.png +0 -0
  460. data/vendor/ggml/examples/yolo/data/labels/126_4.png +0 -0
  461. data/vendor/ggml/examples/yolo/data/labels/126_5.png +0 -0
  462. data/vendor/ggml/examples/yolo/data/labels/126_6.png +0 -0
  463. data/vendor/ggml/examples/yolo/data/labels/126_7.png +0 -0
  464. data/vendor/ggml/examples/yolo/data/labels/32_0.png +0 -0
  465. data/vendor/ggml/examples/yolo/data/labels/32_1.png +0 -0
  466. data/vendor/ggml/examples/yolo/data/labels/32_2.png +0 -0
  467. data/vendor/ggml/examples/yolo/data/labels/32_3.png +0 -0
  468. data/vendor/ggml/examples/yolo/data/labels/32_4.png +0 -0
  469. data/vendor/ggml/examples/yolo/data/labels/32_5.png +0 -0
  470. data/vendor/ggml/examples/yolo/data/labels/32_6.png +0 -0
  471. data/vendor/ggml/examples/yolo/data/labels/32_7.png +0 -0
  472. data/vendor/ggml/examples/yolo/data/labels/33_0.png +0 -0
  473. data/vendor/ggml/examples/yolo/data/labels/33_1.png +0 -0
  474. data/vendor/ggml/examples/yolo/data/labels/33_2.png +0 -0
  475. data/vendor/ggml/examples/yolo/data/labels/33_3.png +0 -0
  476. data/vendor/ggml/examples/yolo/data/labels/33_4.png +0 -0
  477. data/vendor/ggml/examples/yolo/data/labels/33_5.png +0 -0
  478. data/vendor/ggml/examples/yolo/data/labels/33_6.png +0 -0
  479. data/vendor/ggml/examples/yolo/data/labels/33_7.png +0 -0
  480. data/vendor/ggml/examples/yolo/data/labels/34_0.png +0 -0
  481. data/vendor/ggml/examples/yolo/data/labels/34_1.png +0 -0
  482. data/vendor/ggml/examples/yolo/data/labels/34_2.png +0 -0
  483. data/vendor/ggml/examples/yolo/data/labels/34_3.png +0 -0
  484. data/vendor/ggml/examples/yolo/data/labels/34_4.png +0 -0
  485. data/vendor/ggml/examples/yolo/data/labels/34_5.png +0 -0
  486. data/vendor/ggml/examples/yolo/data/labels/34_6.png +0 -0
  487. data/vendor/ggml/examples/yolo/data/labels/34_7.png +0 -0
  488. data/vendor/ggml/examples/yolo/data/labels/35_0.png +0 -0
  489. data/vendor/ggml/examples/yolo/data/labels/35_1.png +0 -0
  490. data/vendor/ggml/examples/yolo/data/labels/35_2.png +0 -0
  491. data/vendor/ggml/examples/yolo/data/labels/35_3.png +0 -0
  492. data/vendor/ggml/examples/yolo/data/labels/35_4.png +0 -0
  493. data/vendor/ggml/examples/yolo/data/labels/35_5.png +0 -0
  494. data/vendor/ggml/examples/yolo/data/labels/35_6.png +0 -0
  495. data/vendor/ggml/examples/yolo/data/labels/35_7.png +0 -0
  496. data/vendor/ggml/examples/yolo/data/labels/36_0.png +0 -0
  497. data/vendor/ggml/examples/yolo/data/labels/36_1.png +0 -0
  498. data/vendor/ggml/examples/yolo/data/labels/36_2.png +0 -0
  499. data/vendor/ggml/examples/yolo/data/labels/36_3.png +0 -0
  500. data/vendor/ggml/examples/yolo/data/labels/36_4.png +0 -0
  501. data/vendor/ggml/examples/yolo/data/labels/36_5.png +0 -0
  502. data/vendor/ggml/examples/yolo/data/labels/36_6.png +0 -0
  503. data/vendor/ggml/examples/yolo/data/labels/36_7.png +0 -0
  504. data/vendor/ggml/examples/yolo/data/labels/37_0.png +0 -0
  505. data/vendor/ggml/examples/yolo/data/labels/37_1.png +0 -0
  506. data/vendor/ggml/examples/yolo/data/labels/37_2.png +0 -0
  507. data/vendor/ggml/examples/yolo/data/labels/37_3.png +0 -0
  508. data/vendor/ggml/examples/yolo/data/labels/37_4.png +0 -0
  509. data/vendor/ggml/examples/yolo/data/labels/37_5.png +0 -0
  510. data/vendor/ggml/examples/yolo/data/labels/37_6.png +0 -0
  511. data/vendor/ggml/examples/yolo/data/labels/37_7.png +0 -0
  512. data/vendor/ggml/examples/yolo/data/labels/38_0.png +0 -0
  513. data/vendor/ggml/examples/yolo/data/labels/38_1.png +0 -0
  514. data/vendor/ggml/examples/yolo/data/labels/38_2.png +0 -0
  515. data/vendor/ggml/examples/yolo/data/labels/38_3.png +0 -0
  516. data/vendor/ggml/examples/yolo/data/labels/38_4.png +0 -0
  517. data/vendor/ggml/examples/yolo/data/labels/38_5.png +0 -0
  518. data/vendor/ggml/examples/yolo/data/labels/38_6.png +0 -0
  519. data/vendor/ggml/examples/yolo/data/labels/38_7.png +0 -0
  520. data/vendor/ggml/examples/yolo/data/labels/39_0.png +0 -0
  521. data/vendor/ggml/examples/yolo/data/labels/39_1.png +0 -0
  522. data/vendor/ggml/examples/yolo/data/labels/39_2.png +0 -0
  523. data/vendor/ggml/examples/yolo/data/labels/39_3.png +0 -0
  524. data/vendor/ggml/examples/yolo/data/labels/39_4.png +0 -0
  525. data/vendor/ggml/examples/yolo/data/labels/39_5.png +0 -0
  526. data/vendor/ggml/examples/yolo/data/labels/39_6.png +0 -0
  527. data/vendor/ggml/examples/yolo/data/labels/39_7.png +0 -0
  528. data/vendor/ggml/examples/yolo/data/labels/40_0.png +0 -0
  529. data/vendor/ggml/examples/yolo/data/labels/40_1.png +0 -0
  530. data/vendor/ggml/examples/yolo/data/labels/40_2.png +0 -0
  531. data/vendor/ggml/examples/yolo/data/labels/40_3.png +0 -0
  532. data/vendor/ggml/examples/yolo/data/labels/40_4.png +0 -0
  533. data/vendor/ggml/examples/yolo/data/labels/40_5.png +0 -0
  534. data/vendor/ggml/examples/yolo/data/labels/40_6.png +0 -0
  535. data/vendor/ggml/examples/yolo/data/labels/40_7.png +0 -0
  536. data/vendor/ggml/examples/yolo/data/labels/41_0.png +0 -0
  537. data/vendor/ggml/examples/yolo/data/labels/41_1.png +0 -0
  538. data/vendor/ggml/examples/yolo/data/labels/41_2.png +0 -0
  539. data/vendor/ggml/examples/yolo/data/labels/41_3.png +0 -0
  540. data/vendor/ggml/examples/yolo/data/labels/41_4.png +0 -0
  541. data/vendor/ggml/examples/yolo/data/labels/41_5.png +0 -0
  542. data/vendor/ggml/examples/yolo/data/labels/41_6.png +0 -0
  543. data/vendor/ggml/examples/yolo/data/labels/41_7.png +0 -0
  544. data/vendor/ggml/examples/yolo/data/labels/42_0.png +0 -0
  545. data/vendor/ggml/examples/yolo/data/labels/42_1.png +0 -0
  546. data/vendor/ggml/examples/yolo/data/labels/42_2.png +0 -0
  547. data/vendor/ggml/examples/yolo/data/labels/42_3.png +0 -0
  548. data/vendor/ggml/examples/yolo/data/labels/42_4.png +0 -0
  549. data/vendor/ggml/examples/yolo/data/labels/42_5.png +0 -0
  550. data/vendor/ggml/examples/yolo/data/labels/42_6.png +0 -0
  551. data/vendor/ggml/examples/yolo/data/labels/42_7.png +0 -0
  552. data/vendor/ggml/examples/yolo/data/labels/43_0.png +0 -0
  553. data/vendor/ggml/examples/yolo/data/labels/43_1.png +0 -0
  554. data/vendor/ggml/examples/yolo/data/labels/43_2.png +0 -0
  555. data/vendor/ggml/examples/yolo/data/labels/43_3.png +0 -0
  556. data/vendor/ggml/examples/yolo/data/labels/43_4.png +0 -0
  557. data/vendor/ggml/examples/yolo/data/labels/43_5.png +0 -0
  558. data/vendor/ggml/examples/yolo/data/labels/43_6.png +0 -0
  559. data/vendor/ggml/examples/yolo/data/labels/43_7.png +0 -0
  560. data/vendor/ggml/examples/yolo/data/labels/44_0.png +0 -0
  561. data/vendor/ggml/examples/yolo/data/labels/44_1.png +0 -0
  562. data/vendor/ggml/examples/yolo/data/labels/44_2.png +0 -0
  563. data/vendor/ggml/examples/yolo/data/labels/44_3.png +0 -0
  564. data/vendor/ggml/examples/yolo/data/labels/44_4.png +0 -0
  565. data/vendor/ggml/examples/yolo/data/labels/44_5.png +0 -0
  566. data/vendor/ggml/examples/yolo/data/labels/44_6.png +0 -0
  567. data/vendor/ggml/examples/yolo/data/labels/44_7.png +0 -0
  568. data/vendor/ggml/examples/yolo/data/labels/45_0.png +0 -0
  569. data/vendor/ggml/examples/yolo/data/labels/45_1.png +0 -0
  570. data/vendor/ggml/examples/yolo/data/labels/45_2.png +0 -0
  571. data/vendor/ggml/examples/yolo/data/labels/45_3.png +0 -0
  572. data/vendor/ggml/examples/yolo/data/labels/45_4.png +0 -0
  573. data/vendor/ggml/examples/yolo/data/labels/45_5.png +0 -0
  574. data/vendor/ggml/examples/yolo/data/labels/45_6.png +0 -0
  575. data/vendor/ggml/examples/yolo/data/labels/45_7.png +0 -0
  576. data/vendor/ggml/examples/yolo/data/labels/46_0.png +0 -0
  577. data/vendor/ggml/examples/yolo/data/labels/46_1.png +0 -0
  578. data/vendor/ggml/examples/yolo/data/labels/46_2.png +0 -0
  579. data/vendor/ggml/examples/yolo/data/labels/46_3.png +0 -0
  580. data/vendor/ggml/examples/yolo/data/labels/46_4.png +0 -0
  581. data/vendor/ggml/examples/yolo/data/labels/46_5.png +0 -0
  582. data/vendor/ggml/examples/yolo/data/labels/46_6.png +0 -0
  583. data/vendor/ggml/examples/yolo/data/labels/46_7.png +0 -0
  584. data/vendor/ggml/examples/yolo/data/labels/47_0.png +0 -0
  585. data/vendor/ggml/examples/yolo/data/labels/47_1.png +0 -0
  586. data/vendor/ggml/examples/yolo/data/labels/47_2.png +0 -0
  587. data/vendor/ggml/examples/yolo/data/labels/47_3.png +0 -0
  588. data/vendor/ggml/examples/yolo/data/labels/47_4.png +0 -0
  589. data/vendor/ggml/examples/yolo/data/labels/47_5.png +0 -0
  590. data/vendor/ggml/examples/yolo/data/labels/47_6.png +0 -0
  591. data/vendor/ggml/examples/yolo/data/labels/47_7.png +0 -0
  592. data/vendor/ggml/examples/yolo/data/labels/48_0.png +0 -0
  593. data/vendor/ggml/examples/yolo/data/labels/48_1.png +0 -0
  594. data/vendor/ggml/examples/yolo/data/labels/48_2.png +0 -0
  595. data/vendor/ggml/examples/yolo/data/labels/48_3.png +0 -0
  596. data/vendor/ggml/examples/yolo/data/labels/48_4.png +0 -0
  597. data/vendor/ggml/examples/yolo/data/labels/48_5.png +0 -0
  598. data/vendor/ggml/examples/yolo/data/labels/48_6.png +0 -0
  599. data/vendor/ggml/examples/yolo/data/labels/48_7.png +0 -0
  600. data/vendor/ggml/examples/yolo/data/labels/49_0.png +0 -0
  601. data/vendor/ggml/examples/yolo/data/labels/49_1.png +0 -0
  602. data/vendor/ggml/examples/yolo/data/labels/49_2.png +0 -0
  603. data/vendor/ggml/examples/yolo/data/labels/49_3.png +0 -0
  604. data/vendor/ggml/examples/yolo/data/labels/49_4.png +0 -0
  605. data/vendor/ggml/examples/yolo/data/labels/49_5.png +0 -0
  606. data/vendor/ggml/examples/yolo/data/labels/49_6.png +0 -0
  607. data/vendor/ggml/examples/yolo/data/labels/49_7.png +0 -0
  608. data/vendor/ggml/examples/yolo/data/labels/50_0.png +0 -0
  609. data/vendor/ggml/examples/yolo/data/labels/50_1.png +0 -0
  610. data/vendor/ggml/examples/yolo/data/labels/50_2.png +0 -0
  611. data/vendor/ggml/examples/yolo/data/labels/50_3.png +0 -0
  612. data/vendor/ggml/examples/yolo/data/labels/50_4.png +0 -0
  613. data/vendor/ggml/examples/yolo/data/labels/50_5.png +0 -0
  614. data/vendor/ggml/examples/yolo/data/labels/50_6.png +0 -0
  615. data/vendor/ggml/examples/yolo/data/labels/50_7.png +0 -0
  616. data/vendor/ggml/examples/yolo/data/labels/51_0.png +0 -0
  617. data/vendor/ggml/examples/yolo/data/labels/51_1.png +0 -0
  618. data/vendor/ggml/examples/yolo/data/labels/51_2.png +0 -0
  619. data/vendor/ggml/examples/yolo/data/labels/51_3.png +0 -0
  620. data/vendor/ggml/examples/yolo/data/labels/51_4.png +0 -0
  621. data/vendor/ggml/examples/yolo/data/labels/51_5.png +0 -0
  622. data/vendor/ggml/examples/yolo/data/labels/51_6.png +0 -0
  623. data/vendor/ggml/examples/yolo/data/labels/51_7.png +0 -0
  624. data/vendor/ggml/examples/yolo/data/labels/52_0.png +0 -0
  625. data/vendor/ggml/examples/yolo/data/labels/52_1.png +0 -0
  626. data/vendor/ggml/examples/yolo/data/labels/52_2.png +0 -0
  627. data/vendor/ggml/examples/yolo/data/labels/52_3.png +0 -0
  628. data/vendor/ggml/examples/yolo/data/labels/52_4.png +0 -0
  629. data/vendor/ggml/examples/yolo/data/labels/52_5.png +0 -0
  630. data/vendor/ggml/examples/yolo/data/labels/52_6.png +0 -0
  631. data/vendor/ggml/examples/yolo/data/labels/52_7.png +0 -0
  632. data/vendor/ggml/examples/yolo/data/labels/53_0.png +0 -0
  633. data/vendor/ggml/examples/yolo/data/labels/53_1.png +0 -0
  634. data/vendor/ggml/examples/yolo/data/labels/53_2.png +0 -0
  635. data/vendor/ggml/examples/yolo/data/labels/53_3.png +0 -0
  636. data/vendor/ggml/examples/yolo/data/labels/53_4.png +0 -0
  637. data/vendor/ggml/examples/yolo/data/labels/53_5.png +0 -0
  638. data/vendor/ggml/examples/yolo/data/labels/53_6.png +0 -0
  639. data/vendor/ggml/examples/yolo/data/labels/53_7.png +0 -0
  640. data/vendor/ggml/examples/yolo/data/labels/54_0.png +0 -0
  641. data/vendor/ggml/examples/yolo/data/labels/54_1.png +0 -0
  642. data/vendor/ggml/examples/yolo/data/labels/54_2.png +0 -0
  643. data/vendor/ggml/examples/yolo/data/labels/54_3.png +0 -0
  644. data/vendor/ggml/examples/yolo/data/labels/54_4.png +0 -0
  645. data/vendor/ggml/examples/yolo/data/labels/54_5.png +0 -0
  646. data/vendor/ggml/examples/yolo/data/labels/54_6.png +0 -0
  647. data/vendor/ggml/examples/yolo/data/labels/54_7.png +0 -0
  648. data/vendor/ggml/examples/yolo/data/labels/55_0.png +0 -0
  649. data/vendor/ggml/examples/yolo/data/labels/55_1.png +0 -0
  650. data/vendor/ggml/examples/yolo/data/labels/55_2.png +0 -0
  651. data/vendor/ggml/examples/yolo/data/labels/55_3.png +0 -0
  652. data/vendor/ggml/examples/yolo/data/labels/55_4.png +0 -0
  653. data/vendor/ggml/examples/yolo/data/labels/55_5.png +0 -0
  654. data/vendor/ggml/examples/yolo/data/labels/55_6.png +0 -0
  655. data/vendor/ggml/examples/yolo/data/labels/55_7.png +0 -0
  656. data/vendor/ggml/examples/yolo/data/labels/56_0.png +0 -0
  657. data/vendor/ggml/examples/yolo/data/labels/56_1.png +0 -0
  658. data/vendor/ggml/examples/yolo/data/labels/56_2.png +0 -0
  659. data/vendor/ggml/examples/yolo/data/labels/56_3.png +0 -0
  660. data/vendor/ggml/examples/yolo/data/labels/56_4.png +0 -0
  661. data/vendor/ggml/examples/yolo/data/labels/56_5.png +0 -0
  662. data/vendor/ggml/examples/yolo/data/labels/56_6.png +0 -0
  663. data/vendor/ggml/examples/yolo/data/labels/56_7.png +0 -0
  664. data/vendor/ggml/examples/yolo/data/labels/57_0.png +0 -0
  665. data/vendor/ggml/examples/yolo/data/labels/57_1.png +0 -0
  666. data/vendor/ggml/examples/yolo/data/labels/57_2.png +0 -0
  667. data/vendor/ggml/examples/yolo/data/labels/57_3.png +0 -0
  668. data/vendor/ggml/examples/yolo/data/labels/57_4.png +0 -0
  669. data/vendor/ggml/examples/yolo/data/labels/57_5.png +0 -0
  670. data/vendor/ggml/examples/yolo/data/labels/57_6.png +0 -0
  671. data/vendor/ggml/examples/yolo/data/labels/57_7.png +0 -0
  672. data/vendor/ggml/examples/yolo/data/labels/58_0.png +0 -0
  673. data/vendor/ggml/examples/yolo/data/labels/58_1.png +0 -0
  674. data/vendor/ggml/examples/yolo/data/labels/58_2.png +0 -0
  675. data/vendor/ggml/examples/yolo/data/labels/58_3.png +0 -0
  676. data/vendor/ggml/examples/yolo/data/labels/58_4.png +0 -0
  677. data/vendor/ggml/examples/yolo/data/labels/58_5.png +0 -0
  678. data/vendor/ggml/examples/yolo/data/labels/58_6.png +0 -0
  679. data/vendor/ggml/examples/yolo/data/labels/58_7.png +0 -0
  680. data/vendor/ggml/examples/yolo/data/labels/59_0.png +0 -0
  681. data/vendor/ggml/examples/yolo/data/labels/59_1.png +0 -0
  682. data/vendor/ggml/examples/yolo/data/labels/59_2.png +0 -0
  683. data/vendor/ggml/examples/yolo/data/labels/59_3.png +0 -0
  684. data/vendor/ggml/examples/yolo/data/labels/59_4.png +0 -0
  685. data/vendor/ggml/examples/yolo/data/labels/59_5.png +0 -0
  686. data/vendor/ggml/examples/yolo/data/labels/59_6.png +0 -0
  687. data/vendor/ggml/examples/yolo/data/labels/59_7.png +0 -0
  688. data/vendor/ggml/examples/yolo/data/labels/60_0.png +0 -0
  689. data/vendor/ggml/examples/yolo/data/labels/60_1.png +0 -0
  690. data/vendor/ggml/examples/yolo/data/labels/60_2.png +0 -0
  691. data/vendor/ggml/examples/yolo/data/labels/60_3.png +0 -0
  692. data/vendor/ggml/examples/yolo/data/labels/60_4.png +0 -0
  693. data/vendor/ggml/examples/yolo/data/labels/60_5.png +0 -0
  694. data/vendor/ggml/examples/yolo/data/labels/60_6.png +0 -0
  695. data/vendor/ggml/examples/yolo/data/labels/60_7.png +0 -0
  696. data/vendor/ggml/examples/yolo/data/labels/61_0.png +0 -0
  697. data/vendor/ggml/examples/yolo/data/labels/61_1.png +0 -0
  698. data/vendor/ggml/examples/yolo/data/labels/61_2.png +0 -0
  699. data/vendor/ggml/examples/yolo/data/labels/61_3.png +0 -0
  700. data/vendor/ggml/examples/yolo/data/labels/61_4.png +0 -0
  701. data/vendor/ggml/examples/yolo/data/labels/61_5.png +0 -0
  702. data/vendor/ggml/examples/yolo/data/labels/61_6.png +0 -0
  703. data/vendor/ggml/examples/yolo/data/labels/61_7.png +0 -0
  704. data/vendor/ggml/examples/yolo/data/labels/62_0.png +0 -0
  705. data/vendor/ggml/examples/yolo/data/labels/62_1.png +0 -0
  706. data/vendor/ggml/examples/yolo/data/labels/62_2.png +0 -0
  707. data/vendor/ggml/examples/yolo/data/labels/62_3.png +0 -0
  708. data/vendor/ggml/examples/yolo/data/labels/62_4.png +0 -0
  709. data/vendor/ggml/examples/yolo/data/labels/62_5.png +0 -0
  710. data/vendor/ggml/examples/yolo/data/labels/62_6.png +0 -0
  711. data/vendor/ggml/examples/yolo/data/labels/62_7.png +0 -0
  712. data/vendor/ggml/examples/yolo/data/labels/63_0.png +0 -0
  713. data/vendor/ggml/examples/yolo/data/labels/63_1.png +0 -0
  714. data/vendor/ggml/examples/yolo/data/labels/63_2.png +0 -0
  715. data/vendor/ggml/examples/yolo/data/labels/63_3.png +0 -0
  716. data/vendor/ggml/examples/yolo/data/labels/63_4.png +0 -0
  717. data/vendor/ggml/examples/yolo/data/labels/63_5.png +0 -0
  718. data/vendor/ggml/examples/yolo/data/labels/63_6.png +0 -0
  719. data/vendor/ggml/examples/yolo/data/labels/63_7.png +0 -0
  720. data/vendor/ggml/examples/yolo/data/labels/64_0.png +0 -0
  721. data/vendor/ggml/examples/yolo/data/labels/64_1.png +0 -0
  722. data/vendor/ggml/examples/yolo/data/labels/64_2.png +0 -0
  723. data/vendor/ggml/examples/yolo/data/labels/64_3.png +0 -0
  724. data/vendor/ggml/examples/yolo/data/labels/64_4.png +0 -0
  725. data/vendor/ggml/examples/yolo/data/labels/64_5.png +0 -0
  726. data/vendor/ggml/examples/yolo/data/labels/64_6.png +0 -0
  727. data/vendor/ggml/examples/yolo/data/labels/64_7.png +0 -0
  728. data/vendor/ggml/examples/yolo/data/labels/65_0.png +0 -0
  729. data/vendor/ggml/examples/yolo/data/labels/65_1.png +0 -0
  730. data/vendor/ggml/examples/yolo/data/labels/65_2.png +0 -0
  731. data/vendor/ggml/examples/yolo/data/labels/65_3.png +0 -0
  732. data/vendor/ggml/examples/yolo/data/labels/65_4.png +0 -0
  733. data/vendor/ggml/examples/yolo/data/labels/65_5.png +0 -0
  734. data/vendor/ggml/examples/yolo/data/labels/65_6.png +0 -0
  735. data/vendor/ggml/examples/yolo/data/labels/65_7.png +0 -0
  736. data/vendor/ggml/examples/yolo/data/labels/66_0.png +0 -0
  737. data/vendor/ggml/examples/yolo/data/labels/66_1.png +0 -0
  738. data/vendor/ggml/examples/yolo/data/labels/66_2.png +0 -0
  739. data/vendor/ggml/examples/yolo/data/labels/66_3.png +0 -0
  740. data/vendor/ggml/examples/yolo/data/labels/66_4.png +0 -0
  741. data/vendor/ggml/examples/yolo/data/labels/66_5.png +0 -0
  742. data/vendor/ggml/examples/yolo/data/labels/66_6.png +0 -0
  743. data/vendor/ggml/examples/yolo/data/labels/66_7.png +0 -0
  744. data/vendor/ggml/examples/yolo/data/labels/67_0.png +0 -0
  745. data/vendor/ggml/examples/yolo/data/labels/67_1.png +0 -0
  746. data/vendor/ggml/examples/yolo/data/labels/67_2.png +0 -0
  747. data/vendor/ggml/examples/yolo/data/labels/67_3.png +0 -0
  748. data/vendor/ggml/examples/yolo/data/labels/67_4.png +0 -0
  749. data/vendor/ggml/examples/yolo/data/labels/67_5.png +0 -0
  750. data/vendor/ggml/examples/yolo/data/labels/67_6.png +0 -0
  751. data/vendor/ggml/examples/yolo/data/labels/67_7.png +0 -0
  752. data/vendor/ggml/examples/yolo/data/labels/68_0.png +0 -0
  753. data/vendor/ggml/examples/yolo/data/labels/68_1.png +0 -0
  754. data/vendor/ggml/examples/yolo/data/labels/68_2.png +0 -0
  755. data/vendor/ggml/examples/yolo/data/labels/68_3.png +0 -0
  756. data/vendor/ggml/examples/yolo/data/labels/68_4.png +0 -0
  757. data/vendor/ggml/examples/yolo/data/labels/68_5.png +0 -0
  758. data/vendor/ggml/examples/yolo/data/labels/68_6.png +0 -0
  759. data/vendor/ggml/examples/yolo/data/labels/68_7.png +0 -0
  760. data/vendor/ggml/examples/yolo/data/labels/69_0.png +0 -0
  761. data/vendor/ggml/examples/yolo/data/labels/69_1.png +0 -0
  762. data/vendor/ggml/examples/yolo/data/labels/69_2.png +0 -0
  763. data/vendor/ggml/examples/yolo/data/labels/69_3.png +0 -0
  764. data/vendor/ggml/examples/yolo/data/labels/69_4.png +0 -0
  765. data/vendor/ggml/examples/yolo/data/labels/69_5.png +0 -0
  766. data/vendor/ggml/examples/yolo/data/labels/69_6.png +0 -0
  767. data/vendor/ggml/examples/yolo/data/labels/69_7.png +0 -0
  768. data/vendor/ggml/examples/yolo/data/labels/70_0.png +0 -0
  769. data/vendor/ggml/examples/yolo/data/labels/70_1.png +0 -0
  770. data/vendor/ggml/examples/yolo/data/labels/70_2.png +0 -0
  771. data/vendor/ggml/examples/yolo/data/labels/70_3.png +0 -0
  772. data/vendor/ggml/examples/yolo/data/labels/70_4.png +0 -0
  773. data/vendor/ggml/examples/yolo/data/labels/70_5.png +0 -0
  774. data/vendor/ggml/examples/yolo/data/labels/70_6.png +0 -0
  775. data/vendor/ggml/examples/yolo/data/labels/70_7.png +0 -0
  776. data/vendor/ggml/examples/yolo/data/labels/71_0.png +0 -0
  777. data/vendor/ggml/examples/yolo/data/labels/71_1.png +0 -0
  778. data/vendor/ggml/examples/yolo/data/labels/71_2.png +0 -0
  779. data/vendor/ggml/examples/yolo/data/labels/71_3.png +0 -0
  780. data/vendor/ggml/examples/yolo/data/labels/71_4.png +0 -0
  781. data/vendor/ggml/examples/yolo/data/labels/71_5.png +0 -0
  782. data/vendor/ggml/examples/yolo/data/labels/71_6.png +0 -0
  783. data/vendor/ggml/examples/yolo/data/labels/71_7.png +0 -0
  784. data/vendor/ggml/examples/yolo/data/labels/72_0.png +0 -0
  785. data/vendor/ggml/examples/yolo/data/labels/72_1.png +0 -0
  786. data/vendor/ggml/examples/yolo/data/labels/72_2.png +0 -0
  787. data/vendor/ggml/examples/yolo/data/labels/72_3.png +0 -0
  788. data/vendor/ggml/examples/yolo/data/labels/72_4.png +0 -0
  789. data/vendor/ggml/examples/yolo/data/labels/72_5.png +0 -0
  790. data/vendor/ggml/examples/yolo/data/labels/72_6.png +0 -0
  791. data/vendor/ggml/examples/yolo/data/labels/72_7.png +0 -0
  792. data/vendor/ggml/examples/yolo/data/labels/73_0.png +0 -0
  793. data/vendor/ggml/examples/yolo/data/labels/73_1.png +0 -0
  794. data/vendor/ggml/examples/yolo/data/labels/73_2.png +0 -0
  795. data/vendor/ggml/examples/yolo/data/labels/73_3.png +0 -0
  796. data/vendor/ggml/examples/yolo/data/labels/73_4.png +0 -0
  797. data/vendor/ggml/examples/yolo/data/labels/73_5.png +0 -0
  798. data/vendor/ggml/examples/yolo/data/labels/73_6.png +0 -0
  799. data/vendor/ggml/examples/yolo/data/labels/73_7.png +0 -0
  800. data/vendor/ggml/examples/yolo/data/labels/74_0.png +0 -0
  801. data/vendor/ggml/examples/yolo/data/labels/74_1.png +0 -0
  802. data/vendor/ggml/examples/yolo/data/labels/74_2.png +0 -0
  803. data/vendor/ggml/examples/yolo/data/labels/74_3.png +0 -0
  804. data/vendor/ggml/examples/yolo/data/labels/74_4.png +0 -0
  805. data/vendor/ggml/examples/yolo/data/labels/74_5.png +0 -0
  806. data/vendor/ggml/examples/yolo/data/labels/74_6.png +0 -0
  807. data/vendor/ggml/examples/yolo/data/labels/74_7.png +0 -0
  808. data/vendor/ggml/examples/yolo/data/labels/75_0.png +0 -0
  809. data/vendor/ggml/examples/yolo/data/labels/75_1.png +0 -0
  810. data/vendor/ggml/examples/yolo/data/labels/75_2.png +0 -0
  811. data/vendor/ggml/examples/yolo/data/labels/75_3.png +0 -0
  812. data/vendor/ggml/examples/yolo/data/labels/75_4.png +0 -0
  813. data/vendor/ggml/examples/yolo/data/labels/75_5.png +0 -0
  814. data/vendor/ggml/examples/yolo/data/labels/75_6.png +0 -0
  815. data/vendor/ggml/examples/yolo/data/labels/75_7.png +0 -0
  816. data/vendor/ggml/examples/yolo/data/labels/76_0.png +0 -0
  817. data/vendor/ggml/examples/yolo/data/labels/76_1.png +0 -0
  818. data/vendor/ggml/examples/yolo/data/labels/76_2.png +0 -0
  819. data/vendor/ggml/examples/yolo/data/labels/76_3.png +0 -0
  820. data/vendor/ggml/examples/yolo/data/labels/76_4.png +0 -0
  821. data/vendor/ggml/examples/yolo/data/labels/76_5.png +0 -0
  822. data/vendor/ggml/examples/yolo/data/labels/76_6.png +0 -0
  823. data/vendor/ggml/examples/yolo/data/labels/76_7.png +0 -0
  824. data/vendor/ggml/examples/yolo/data/labels/77_0.png +0 -0
  825. data/vendor/ggml/examples/yolo/data/labels/77_1.png +0 -0
  826. data/vendor/ggml/examples/yolo/data/labels/77_2.png +0 -0
  827. data/vendor/ggml/examples/yolo/data/labels/77_3.png +0 -0
  828. data/vendor/ggml/examples/yolo/data/labels/77_4.png +0 -0
  829. data/vendor/ggml/examples/yolo/data/labels/77_5.png +0 -0
  830. data/vendor/ggml/examples/yolo/data/labels/77_6.png +0 -0
  831. data/vendor/ggml/examples/yolo/data/labels/77_7.png +0 -0
  832. data/vendor/ggml/examples/yolo/data/labels/78_0.png +0 -0
  833. data/vendor/ggml/examples/yolo/data/labels/78_1.png +0 -0
  834. data/vendor/ggml/examples/yolo/data/labels/78_2.png +0 -0
  835. data/vendor/ggml/examples/yolo/data/labels/78_3.png +0 -0
  836. data/vendor/ggml/examples/yolo/data/labels/78_4.png +0 -0
  837. data/vendor/ggml/examples/yolo/data/labels/78_5.png +0 -0
  838. data/vendor/ggml/examples/yolo/data/labels/78_6.png +0 -0
  839. data/vendor/ggml/examples/yolo/data/labels/78_7.png +0 -0
  840. data/vendor/ggml/examples/yolo/data/labels/79_0.png +0 -0
  841. data/vendor/ggml/examples/yolo/data/labels/79_1.png +0 -0
  842. data/vendor/ggml/examples/yolo/data/labels/79_2.png +0 -0
  843. data/vendor/ggml/examples/yolo/data/labels/79_3.png +0 -0
  844. data/vendor/ggml/examples/yolo/data/labels/79_4.png +0 -0
  845. data/vendor/ggml/examples/yolo/data/labels/79_5.png +0 -0
  846. data/vendor/ggml/examples/yolo/data/labels/79_6.png +0 -0
  847. data/vendor/ggml/examples/yolo/data/labels/79_7.png +0 -0
  848. data/vendor/ggml/examples/yolo/data/labels/80_0.png +0 -0
  849. data/vendor/ggml/examples/yolo/data/labels/80_1.png +0 -0
  850. data/vendor/ggml/examples/yolo/data/labels/80_2.png +0 -0
  851. data/vendor/ggml/examples/yolo/data/labels/80_3.png +0 -0
  852. data/vendor/ggml/examples/yolo/data/labels/80_4.png +0 -0
  853. data/vendor/ggml/examples/yolo/data/labels/80_5.png +0 -0
  854. data/vendor/ggml/examples/yolo/data/labels/80_6.png +0 -0
  855. data/vendor/ggml/examples/yolo/data/labels/80_7.png +0 -0
  856. data/vendor/ggml/examples/yolo/data/labels/81_0.png +0 -0
  857. data/vendor/ggml/examples/yolo/data/labels/81_1.png +0 -0
  858. data/vendor/ggml/examples/yolo/data/labels/81_2.png +0 -0
  859. data/vendor/ggml/examples/yolo/data/labels/81_3.png +0 -0
  860. data/vendor/ggml/examples/yolo/data/labels/81_4.png +0 -0
  861. data/vendor/ggml/examples/yolo/data/labels/81_5.png +0 -0
  862. data/vendor/ggml/examples/yolo/data/labels/81_6.png +0 -0
  863. data/vendor/ggml/examples/yolo/data/labels/81_7.png +0 -0
  864. data/vendor/ggml/examples/yolo/data/labels/82_0.png +0 -0
  865. data/vendor/ggml/examples/yolo/data/labels/82_1.png +0 -0
  866. data/vendor/ggml/examples/yolo/data/labels/82_2.png +0 -0
  867. data/vendor/ggml/examples/yolo/data/labels/82_3.png +0 -0
  868. data/vendor/ggml/examples/yolo/data/labels/82_4.png +0 -0
  869. data/vendor/ggml/examples/yolo/data/labels/82_5.png +0 -0
  870. data/vendor/ggml/examples/yolo/data/labels/82_6.png +0 -0
  871. data/vendor/ggml/examples/yolo/data/labels/82_7.png +0 -0
  872. data/vendor/ggml/examples/yolo/data/labels/83_0.png +0 -0
  873. data/vendor/ggml/examples/yolo/data/labels/83_1.png +0 -0
  874. data/vendor/ggml/examples/yolo/data/labels/83_2.png +0 -0
  875. data/vendor/ggml/examples/yolo/data/labels/83_3.png +0 -0
  876. data/vendor/ggml/examples/yolo/data/labels/83_4.png +0 -0
  877. data/vendor/ggml/examples/yolo/data/labels/83_5.png +0 -0
  878. data/vendor/ggml/examples/yolo/data/labels/83_6.png +0 -0
  879. data/vendor/ggml/examples/yolo/data/labels/83_7.png +0 -0
  880. data/vendor/ggml/examples/yolo/data/labels/84_0.png +0 -0
  881. data/vendor/ggml/examples/yolo/data/labels/84_1.png +0 -0
  882. data/vendor/ggml/examples/yolo/data/labels/84_2.png +0 -0
  883. data/vendor/ggml/examples/yolo/data/labels/84_3.png +0 -0
  884. data/vendor/ggml/examples/yolo/data/labels/84_4.png +0 -0
  885. data/vendor/ggml/examples/yolo/data/labels/84_5.png +0 -0
  886. data/vendor/ggml/examples/yolo/data/labels/84_6.png +0 -0
  887. data/vendor/ggml/examples/yolo/data/labels/84_7.png +0 -0
  888. data/vendor/ggml/examples/yolo/data/labels/85_0.png +0 -0
  889. data/vendor/ggml/examples/yolo/data/labels/85_1.png +0 -0
  890. data/vendor/ggml/examples/yolo/data/labels/85_2.png +0 -0
  891. data/vendor/ggml/examples/yolo/data/labels/85_3.png +0 -0
  892. data/vendor/ggml/examples/yolo/data/labels/85_4.png +0 -0
  893. data/vendor/ggml/examples/yolo/data/labels/85_5.png +0 -0
  894. data/vendor/ggml/examples/yolo/data/labels/85_6.png +0 -0
  895. data/vendor/ggml/examples/yolo/data/labels/85_7.png +0 -0
  896. data/vendor/ggml/examples/yolo/data/labels/86_0.png +0 -0
  897. data/vendor/ggml/examples/yolo/data/labels/86_1.png +0 -0
  898. data/vendor/ggml/examples/yolo/data/labels/86_2.png +0 -0
  899. data/vendor/ggml/examples/yolo/data/labels/86_3.png +0 -0
  900. data/vendor/ggml/examples/yolo/data/labels/86_4.png +0 -0
  901. data/vendor/ggml/examples/yolo/data/labels/86_5.png +0 -0
  902. data/vendor/ggml/examples/yolo/data/labels/86_6.png +0 -0
  903. data/vendor/ggml/examples/yolo/data/labels/86_7.png +0 -0
  904. data/vendor/ggml/examples/yolo/data/labels/87_0.png +0 -0
  905. data/vendor/ggml/examples/yolo/data/labels/87_1.png +0 -0
  906. data/vendor/ggml/examples/yolo/data/labels/87_2.png +0 -0
  907. data/vendor/ggml/examples/yolo/data/labels/87_3.png +0 -0
  908. data/vendor/ggml/examples/yolo/data/labels/87_4.png +0 -0
  909. data/vendor/ggml/examples/yolo/data/labels/87_5.png +0 -0
  910. data/vendor/ggml/examples/yolo/data/labels/87_6.png +0 -0
  911. data/vendor/ggml/examples/yolo/data/labels/87_7.png +0 -0
  912. data/vendor/ggml/examples/yolo/data/labels/88_0.png +0 -0
  913. data/vendor/ggml/examples/yolo/data/labels/88_1.png +0 -0
  914. data/vendor/ggml/examples/yolo/data/labels/88_2.png +0 -0
  915. data/vendor/ggml/examples/yolo/data/labels/88_3.png +0 -0
  916. data/vendor/ggml/examples/yolo/data/labels/88_4.png +0 -0
  917. data/vendor/ggml/examples/yolo/data/labels/88_5.png +0 -0
  918. data/vendor/ggml/examples/yolo/data/labels/88_6.png +0 -0
  919. data/vendor/ggml/examples/yolo/data/labels/88_7.png +0 -0
  920. data/vendor/ggml/examples/yolo/data/labels/89_0.png +0 -0
  921. data/vendor/ggml/examples/yolo/data/labels/89_1.png +0 -0
  922. data/vendor/ggml/examples/yolo/data/labels/89_2.png +0 -0
  923. data/vendor/ggml/examples/yolo/data/labels/89_3.png +0 -0
  924. data/vendor/ggml/examples/yolo/data/labels/89_4.png +0 -0
  925. data/vendor/ggml/examples/yolo/data/labels/89_5.png +0 -0
  926. data/vendor/ggml/examples/yolo/data/labels/89_6.png +0 -0
  927. data/vendor/ggml/examples/yolo/data/labels/89_7.png +0 -0
  928. data/vendor/ggml/examples/yolo/data/labels/90_0.png +0 -0
  929. data/vendor/ggml/examples/yolo/data/labels/90_1.png +0 -0
  930. data/vendor/ggml/examples/yolo/data/labels/90_2.png +0 -0
  931. data/vendor/ggml/examples/yolo/data/labels/90_3.png +0 -0
  932. data/vendor/ggml/examples/yolo/data/labels/90_4.png +0 -0
  933. data/vendor/ggml/examples/yolo/data/labels/90_5.png +0 -0
  934. data/vendor/ggml/examples/yolo/data/labels/90_6.png +0 -0
  935. data/vendor/ggml/examples/yolo/data/labels/90_7.png +0 -0
  936. data/vendor/ggml/examples/yolo/data/labels/91_0.png +0 -0
  937. data/vendor/ggml/examples/yolo/data/labels/91_1.png +0 -0
  938. data/vendor/ggml/examples/yolo/data/labels/91_2.png +0 -0
  939. data/vendor/ggml/examples/yolo/data/labels/91_3.png +0 -0
  940. data/vendor/ggml/examples/yolo/data/labels/91_4.png +0 -0
  941. data/vendor/ggml/examples/yolo/data/labels/91_5.png +0 -0
  942. data/vendor/ggml/examples/yolo/data/labels/91_6.png +0 -0
  943. data/vendor/ggml/examples/yolo/data/labels/91_7.png +0 -0
  944. data/vendor/ggml/examples/yolo/data/labels/92_0.png +0 -0
  945. data/vendor/ggml/examples/yolo/data/labels/92_1.png +0 -0
  946. data/vendor/ggml/examples/yolo/data/labels/92_2.png +0 -0
  947. data/vendor/ggml/examples/yolo/data/labels/92_3.png +0 -0
  948. data/vendor/ggml/examples/yolo/data/labels/92_4.png +0 -0
  949. data/vendor/ggml/examples/yolo/data/labels/92_5.png +0 -0
  950. data/vendor/ggml/examples/yolo/data/labels/92_6.png +0 -0
  951. data/vendor/ggml/examples/yolo/data/labels/92_7.png +0 -0
  952. data/vendor/ggml/examples/yolo/data/labels/93_0.png +0 -0
  953. data/vendor/ggml/examples/yolo/data/labels/93_1.png +0 -0
  954. data/vendor/ggml/examples/yolo/data/labels/93_2.png +0 -0
  955. data/vendor/ggml/examples/yolo/data/labels/93_3.png +0 -0
  956. data/vendor/ggml/examples/yolo/data/labels/93_4.png +0 -0
  957. data/vendor/ggml/examples/yolo/data/labels/93_5.png +0 -0
  958. data/vendor/ggml/examples/yolo/data/labels/93_6.png +0 -0
  959. data/vendor/ggml/examples/yolo/data/labels/93_7.png +0 -0
  960. data/vendor/ggml/examples/yolo/data/labels/94_0.png +0 -0
  961. data/vendor/ggml/examples/yolo/data/labels/94_1.png +0 -0
  962. data/vendor/ggml/examples/yolo/data/labels/94_2.png +0 -0
  963. data/vendor/ggml/examples/yolo/data/labels/94_3.png +0 -0
  964. data/vendor/ggml/examples/yolo/data/labels/94_4.png +0 -0
  965. data/vendor/ggml/examples/yolo/data/labels/94_5.png +0 -0
  966. data/vendor/ggml/examples/yolo/data/labels/94_6.png +0 -0
  967. data/vendor/ggml/examples/yolo/data/labels/94_7.png +0 -0
  968. data/vendor/ggml/examples/yolo/data/labels/95_0.png +0 -0
  969. data/vendor/ggml/examples/yolo/data/labels/95_1.png +0 -0
  970. data/vendor/ggml/examples/yolo/data/labels/95_2.png +0 -0
  971. data/vendor/ggml/examples/yolo/data/labels/95_3.png +0 -0
  972. data/vendor/ggml/examples/yolo/data/labels/95_4.png +0 -0
  973. data/vendor/ggml/examples/yolo/data/labels/95_5.png +0 -0
  974. data/vendor/ggml/examples/yolo/data/labels/95_6.png +0 -0
  975. data/vendor/ggml/examples/yolo/data/labels/95_7.png +0 -0
  976. data/vendor/ggml/examples/yolo/data/labels/96_0.png +0 -0
  977. data/vendor/ggml/examples/yolo/data/labels/96_1.png +0 -0
  978. data/vendor/ggml/examples/yolo/data/labels/96_2.png +0 -0
  979. data/vendor/ggml/examples/yolo/data/labels/96_3.png +0 -0
  980. data/vendor/ggml/examples/yolo/data/labels/96_4.png +0 -0
  981. data/vendor/ggml/examples/yolo/data/labels/96_5.png +0 -0
  982. data/vendor/ggml/examples/yolo/data/labels/96_6.png +0 -0
  983. data/vendor/ggml/examples/yolo/data/labels/96_7.png +0 -0
  984. data/vendor/ggml/examples/yolo/data/labels/97_0.png +0 -0
  985. data/vendor/ggml/examples/yolo/data/labels/97_1.png +0 -0
  986. data/vendor/ggml/examples/yolo/data/labels/97_2.png +0 -0
  987. data/vendor/ggml/examples/yolo/data/labels/97_3.png +0 -0
  988. data/vendor/ggml/examples/yolo/data/labels/97_4.png +0 -0
  989. data/vendor/ggml/examples/yolo/data/labels/97_5.png +0 -0
  990. data/vendor/ggml/examples/yolo/data/labels/97_6.png +0 -0
  991. data/vendor/ggml/examples/yolo/data/labels/97_7.png +0 -0
  992. data/vendor/ggml/examples/yolo/data/labels/98_0.png +0 -0
  993. data/vendor/ggml/examples/yolo/data/labels/98_1.png +0 -0
  994. data/vendor/ggml/examples/yolo/data/labels/98_2.png +0 -0
  995. data/vendor/ggml/examples/yolo/data/labels/98_3.png +0 -0
  996. data/vendor/ggml/examples/yolo/data/labels/98_4.png +0 -0
  997. data/vendor/ggml/examples/yolo/data/labels/98_5.png +0 -0
  998. data/vendor/ggml/examples/yolo/data/labels/98_6.png +0 -0
  999. data/vendor/ggml/examples/yolo/data/labels/98_7.png +0 -0
  1000. data/vendor/ggml/examples/yolo/data/labels/99_0.png +0 -0
  1001. data/vendor/ggml/examples/yolo/data/labels/99_1.png +0 -0
  1002. data/vendor/ggml/examples/yolo/data/labels/99_2.png +0 -0
  1003. data/vendor/ggml/examples/yolo/data/labels/99_3.png +0 -0
  1004. data/vendor/ggml/examples/yolo/data/labels/99_4.png +0 -0
  1005. data/vendor/ggml/examples/yolo/data/labels/99_5.png +0 -0
  1006. data/vendor/ggml/examples/yolo/data/labels/99_6.png +0 -0
  1007. data/vendor/ggml/examples/yolo/data/labels/99_7.png +0 -0
  1008. data/vendor/ggml/examples/yolo/yolo-image.cpp +210 -0
  1009. data/vendor/ggml/examples/yolo/yolo-image.h +39 -0
  1010. data/vendor/ggml/examples/yolo/yolov3-tiny.cpp +661 -0
  1011. data/vendor/ggml/ggml.pc.in +10 -0
  1012. data/vendor/ggml/include/ggml-alloc.h +85 -0
  1013. data/vendor/ggml/include/ggml-backend.h +431 -0
  1014. data/vendor/ggml/include/ggml-blas.h +25 -0
  1015. data/vendor/ggml/include/ggml-cann.h +123 -0
  1016. data/vendor/ggml/include/ggml-cpp.h +39 -0
  1017. data/vendor/ggml/include/ggml-cpu.h +151 -0
  1018. data/vendor/ggml/include/ggml-cuda.h +50 -0
  1019. data/vendor/ggml/include/ggml-hexagon.h +19 -0
  1020. data/vendor/ggml/include/ggml-metal.h +61 -0
  1021. data/vendor/ggml/include/ggml-opencl.h +26 -0
  1022. data/vendor/ggml/include/ggml-openvino.h +37 -0
  1023. data/vendor/ggml/include/ggml-opt.h +256 -0
  1024. data/vendor/ggml/include/ggml-rpc.h +35 -0
  1025. data/vendor/ggml/include/ggml-sycl.h +49 -0
  1026. data/vendor/ggml/include/ggml-virtgpu.h +14 -0
  1027. data/vendor/ggml/include/ggml-vulkan.h +29 -0
  1028. data/vendor/ggml/include/ggml-webgpu.h +19 -0
  1029. data/vendor/ggml/include/ggml-zdnn.h +17 -0
  1030. data/vendor/ggml/include/ggml-zendnn.h +22 -0
  1031. data/vendor/ggml/include/ggml.h +2845 -0
  1032. data/vendor/ggml/include/gguf.h +204 -0
  1033. data/vendor/ggml/requirements.txt +12 -0
  1034. data/vendor/ggml/scripts/gen-authors.sh +9 -0
  1035. data/vendor/ggml/scripts/release.sh +296 -0
  1036. data/vendor/ggml/scripts/sync-llama-am.sh +167 -0
  1037. data/vendor/ggml/scripts/sync-llama.last +1 -0
  1038. data/vendor/ggml/scripts/sync-llama.sh +21 -0
  1039. data/vendor/ggml/scripts/sync-whisper-am.sh +138 -0
  1040. data/vendor/ggml/scripts/sync-whisper.last +1 -0
  1041. data/vendor/ggml/scripts/sync-whisper.sh +17 -0
  1042. data/vendor/ggml/src/CMakeLists.txt +493 -0
  1043. data/vendor/ggml/src/ggml-alloc.c +1248 -0
  1044. data/vendor/ggml/src/ggml-backend-dl.cpp +48 -0
  1045. data/vendor/ggml/src/ggml-backend-dl.h +45 -0
  1046. data/vendor/ggml/src/ggml-backend-impl.h +275 -0
  1047. data/vendor/ggml/src/ggml-backend-meta.cpp +2144 -0
  1048. data/vendor/ggml/src/ggml-backend-reg.cpp +586 -0
  1049. data/vendor/ggml/src/ggml-backend.cpp +2371 -0
  1050. data/vendor/ggml/src/ggml-blas/CMakeLists.txt +101 -0
  1051. data/vendor/ggml/src/ggml-blas/ggml-blas.cpp +522 -0
  1052. data/vendor/ggml/src/ggml-cann/CMakeLists.txt +89 -0
  1053. data/vendor/ggml/src/ggml-cann/acl_tensor.cpp +195 -0
  1054. data/vendor/ggml/src/ggml-cann/acl_tensor.h +349 -0
  1055. data/vendor/ggml/src/ggml-cann/aclnn_ops.cpp +4436 -0
  1056. data/vendor/ggml/src/ggml-cann/aclnn_ops.h +1190 -0
  1057. data/vendor/ggml/src/ggml-cann/common.h +651 -0
  1058. data/vendor/ggml/src/ggml-cann/ggml-cann.cpp +3062 -0
  1059. data/vendor/ggml/src/ggml-common.h +1900 -0
  1060. data/vendor/ggml/src/ggml-cpu/CMakeLists.txt +731 -0
  1061. data/vendor/ggml/src/ggml-cpu/amx/amx.cpp +249 -0
  1062. data/vendor/ggml/src/ggml-cpu/amx/amx.h +8 -0
  1063. data/vendor/ggml/src/ggml-cpu/amx/common.h +115 -0
  1064. data/vendor/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
  1065. data/vendor/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  1066. data/vendor/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
  1067. data/vendor/ggml/src/ggml-cpu/arch/arm/quants.c +4245 -0
  1068. data/vendor/ggml/src/ggml-cpu/arch/arm/repack.cpp +5156 -0
  1069. data/vendor/ggml/src/ggml-cpu/arch/loongarch/quants.c +2158 -0
  1070. data/vendor/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  1071. data/vendor/ggml/src/ggml-cpu/arch/powerpc/quants.c +2304 -0
  1072. data/vendor/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  1073. data/vendor/ggml/src/ggml-cpu/arch/riscv/quants.c +4553 -0
  1074. data/vendor/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1703 -0
  1075. data/vendor/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  1076. data/vendor/ggml/src/ggml-cpu/arch/s390/quants.c +1465 -0
  1077. data/vendor/ggml/src/ggml-cpu/arch/wasm/quants.c +1220 -0
  1078. data/vendor/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  1079. data/vendor/ggml/src/ggml-cpu/arch/x86/quants.c +3970 -0
  1080. data/vendor/ggml/src/ggml-cpu/arch/x86/repack.cpp +6407 -0
  1081. data/vendor/ggml/src/ggml-cpu/arch-fallback.h +348 -0
  1082. data/vendor/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
  1083. data/vendor/ggml/src/ggml-cpu/binary-ops.h +16 -0
  1084. data/vendor/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  1085. data/vendor/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
  1086. data/vendor/ggml/src/ggml-cpu/common.h +95 -0
  1087. data/vendor/ggml/src/ggml-cpu/ggml-cpu-impl.h +539 -0
  1088. data/vendor/ggml/src/ggml-cpu/ggml-cpu.c +3835 -0
  1089. data/vendor/ggml/src/ggml-cpu/ggml-cpu.cpp +703 -0
  1090. data/vendor/ggml/src/ggml-cpu/hbm.cpp +55 -0
  1091. data/vendor/ggml/src/ggml-cpu/hbm.h +8 -0
  1092. data/vendor/ggml/src/ggml-cpu/kleidiai/kernels.cpp +939 -0
  1093. data/vendor/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
  1094. data/vendor/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1513 -0
  1095. data/vendor/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  1096. data/vendor/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4051 -0
  1097. data/vendor/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
  1098. data/vendor/ggml/src/ggml-cpu/ops.cpp +11373 -0
  1099. data/vendor/ggml/src/ggml-cpu/ops.h +119 -0
  1100. data/vendor/ggml/src/ggml-cpu/quants.c +1288 -0
  1101. data/vendor/ggml/src/ggml-cpu/quants.h +103 -0
  1102. data/vendor/ggml/src/ggml-cpu/repack.cpp +4836 -0
  1103. data/vendor/ggml/src/ggml-cpu/repack.h +245 -0
  1104. data/vendor/ggml/src/ggml-cpu/simd-gemm.h +226 -0
  1105. data/vendor/ggml/src/ggml-cpu/simd-mappings.h +1319 -0
  1106. data/vendor/ggml/src/ggml-cpu/spacemit/ime.cpp +1740 -0
  1107. data/vendor/ggml/src/ggml-cpu/spacemit/ime.h +21 -0
  1108. data/vendor/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +1027 -0
  1109. data/vendor/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
  1110. data/vendor/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
  1111. data/vendor/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
  1112. data/vendor/ggml/src/ggml-cpu/spacemit/ime_kernels.h +189 -0
  1113. data/vendor/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
  1114. data/vendor/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
  1115. data/vendor/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
  1116. data/vendor/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
  1117. data/vendor/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
  1118. data/vendor/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
  1119. data/vendor/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
  1120. data/vendor/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
  1121. data/vendor/ggml/src/ggml-cpu/traits.cpp +36 -0
  1122. data/vendor/ggml/src/ggml-cpu/traits.h +38 -0
  1123. data/vendor/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
  1124. data/vendor/ggml/src/ggml-cpu/unary-ops.h +35 -0
  1125. data/vendor/ggml/src/ggml-cpu/vec.cpp +629 -0
  1126. data/vendor/ggml/src/ggml-cpu/vec.h +1588 -0
  1127. data/vendor/ggml/src/ggml-cuda/CMakeLists.txt +268 -0
  1128. data/vendor/ggml/src/ggml-cuda/acc.cu +61 -0
  1129. data/vendor/ggml/src/ggml-cuda/acc.cuh +5 -0
  1130. data/vendor/ggml/src/ggml-cuda/add-id.cu +58 -0
  1131. data/vendor/ggml/src/ggml-cuda/add-id.cuh +3 -0
  1132. data/vendor/ggml/src/ggml-cuda/allreduce.cu +971 -0
  1133. data/vendor/ggml/src/ggml-cuda/allreduce.cuh +29 -0
  1134. data/vendor/ggml/src/ggml-cuda/arange.cu +34 -0
  1135. data/vendor/ggml/src/ggml-cuda/arange.cuh +5 -0
  1136. data/vendor/ggml/src/ggml-cuda/argmax.cu +91 -0
  1137. data/vendor/ggml/src/ggml-cuda/argmax.cuh +3 -0
  1138. data/vendor/ggml/src/ggml-cuda/argsort.cu +266 -0
  1139. data/vendor/ggml/src/ggml-cuda/argsort.cuh +19 -0
  1140. data/vendor/ggml/src/ggml-cuda/binbcast.cu +534 -0
  1141. data/vendor/ggml/src/ggml-cuda/binbcast.cuh +12 -0
  1142. data/vendor/ggml/src/ggml-cuda/clamp.cu +45 -0
  1143. data/vendor/ggml/src/ggml-cuda/clamp.cuh +5 -0
  1144. data/vendor/ggml/src/ggml-cuda/common.cuh +1489 -0
  1145. data/vendor/ggml/src/ggml-cuda/concat.cu +204 -0
  1146. data/vendor/ggml/src/ggml-cuda/concat.cuh +5 -0
  1147. data/vendor/ggml/src/ggml-cuda/conv-transpose-1d.cu +86 -0
  1148. data/vendor/ggml/src/ggml-cuda/conv-transpose-1d.cuh +5 -0
  1149. data/vendor/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  1150. data/vendor/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  1151. data/vendor/ggml/src/ggml-cuda/conv2d-transpose.cu +115 -0
  1152. data/vendor/ggml/src/ggml-cuda/conv2d-transpose.cuh +5 -0
  1153. data/vendor/ggml/src/ggml-cuda/conv2d.cu +166 -0
  1154. data/vendor/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  1155. data/vendor/ggml/src/ggml-cuda/convert.cu +892 -0
  1156. data/vendor/ggml/src/ggml-cuda/convert.cuh +66 -0
  1157. data/vendor/ggml/src/ggml-cuda/count-equal.cu +64 -0
  1158. data/vendor/ggml/src/ggml-cuda/count-equal.cuh +5 -0
  1159. data/vendor/ggml/src/ggml-cuda/cp-async.cuh +57 -0
  1160. data/vendor/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  1161. data/vendor/ggml/src/ggml-cuda/cpy.cu +558 -0
  1162. data/vendor/ggml/src/ggml-cuda/cpy.cuh +7 -0
  1163. data/vendor/ggml/src/ggml-cuda/cross-entropy-loss.cu +177 -0
  1164. data/vendor/ggml/src/ggml-cuda/cross-entropy-loss.cuh +7 -0
  1165. data/vendor/ggml/src/ggml-cuda/cumsum.cu +307 -0
  1166. data/vendor/ggml/src/ggml-cuda/cumsum.cuh +5 -0
  1167. data/vendor/ggml/src/ggml-cuda/dequantize.cuh +99 -0
  1168. data/vendor/ggml/src/ggml-cuda/diag.cu +77 -0
  1169. data/vendor/ggml/src/ggml-cuda/diag.cuh +5 -0
  1170. data/vendor/ggml/src/ggml-cuda/diagmask.cu +40 -0
  1171. data/vendor/ggml/src/ggml-cuda/diagmask.cuh +5 -0
  1172. data/vendor/ggml/src/ggml-cuda/fattn-common.cuh +1212 -0
  1173. data/vendor/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2020 -0
  1174. data/vendor/ggml/src/ggml-cuda/fattn-tile.cu +61 -0
  1175. data/vendor/ggml/src/ggml-cuda/fattn-tile.cuh +1347 -0
  1176. data/vendor/ggml/src/ggml-cuda/fattn-vec.cuh +600 -0
  1177. data/vendor/ggml/src/ggml-cuda/fattn-wmma-f16.cu +696 -0
  1178. data/vendor/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +51 -0
  1179. data/vendor/ggml/src/ggml-cuda/fattn.cu +562 -0
  1180. data/vendor/ggml/src/ggml-cuda/fattn.cuh +5 -0
  1181. data/vendor/ggml/src/ggml-cuda/fill.cu +37 -0
  1182. data/vendor/ggml/src/ggml-cuda/fill.cuh +3 -0
  1183. data/vendor/ggml/src/ggml-cuda/gated_delta_net.cu +311 -0
  1184. data/vendor/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  1185. data/vendor/ggml/src/ggml-cuda/getrows.cu +300 -0
  1186. data/vendor/ggml/src/ggml-cuda/getrows.cuh +15 -0
  1187. data/vendor/ggml/src/ggml-cuda/ggml-cuda.cu +5684 -0
  1188. data/vendor/ggml/src/ggml-cuda/gla.cu +93 -0
  1189. data/vendor/ggml/src/ggml-cuda/gla.cuh +3 -0
  1190. data/vendor/ggml/src/ggml-cuda/im2col.cu +267 -0
  1191. data/vendor/ggml/src/ggml-cuda/im2col.cuh +6 -0
  1192. data/vendor/ggml/src/ggml-cuda/mean.cu +75 -0
  1193. data/vendor/ggml/src/ggml-cuda/mean.cuh +3 -0
  1194. data/vendor/ggml/src/ggml-cuda/mma.cuh +1456 -0
  1195. data/vendor/ggml/src/ggml-cuda/mmf.cu +191 -0
  1196. data/vendor/ggml/src/ggml-cuda/mmf.cuh +908 -0
  1197. data/vendor/ggml/src/ggml-cuda/mmid.cu +164 -0
  1198. data/vendor/ggml/src/ggml-cuda/mmid.cuh +5 -0
  1199. data/vendor/ggml/src/ggml-cuda/mmq.cu +372 -0
  1200. data/vendor/ggml/src/ggml-cuda/mmq.cuh +4176 -0
  1201. data/vendor/ggml/src/ggml-cuda/mmvf.cu +862 -0
  1202. data/vendor/ggml/src/ggml-cuda/mmvf.cuh +14 -0
  1203. data/vendor/ggml/src/ggml-cuda/mmvq.cu +1161 -0
  1204. data/vendor/ggml/src/ggml-cuda/mmvq.cuh +16 -0
  1205. data/vendor/ggml/src/ggml-cuda/norm.cu +672 -0
  1206. data/vendor/ggml/src/ggml-cuda/norm.cuh +18 -0
  1207. data/vendor/ggml/src/ggml-cuda/opt-step-adamw.cu +78 -0
  1208. data/vendor/ggml/src/ggml-cuda/opt-step-adamw.cuh +5 -0
  1209. data/vendor/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  1210. data/vendor/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  1211. data/vendor/ggml/src/ggml-cuda/out-prod.cu +84 -0
  1212. data/vendor/ggml/src/ggml-cuda/out-prod.cuh +3 -0
  1213. data/vendor/ggml/src/ggml-cuda/pad.cu +106 -0
  1214. data/vendor/ggml/src/ggml-cuda/pad.cuh +5 -0
  1215. data/vendor/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  1216. data/vendor/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  1217. data/vendor/ggml/src/ggml-cuda/pool2d.cu +94 -0
  1218. data/vendor/ggml/src/ggml-cuda/pool2d.cuh +5 -0
  1219. data/vendor/ggml/src/ggml-cuda/quantize.cu +443 -0
  1220. data/vendor/ggml/src/ggml-cuda/quantize.cuh +41 -0
  1221. data/vendor/ggml/src/ggml-cuda/reduce_rows.cuh +39 -0
  1222. data/vendor/ggml/src/ggml-cuda/roll.cu +67 -0
  1223. data/vendor/ggml/src/ggml-cuda/roll.cuh +5 -0
  1224. data/vendor/ggml/src/ggml-cuda/rope.cu +665 -0
  1225. data/vendor/ggml/src/ggml-cuda/rope.cuh +9 -0
  1226. data/vendor/ggml/src/ggml-cuda/scale.cu +34 -0
  1227. data/vendor/ggml/src/ggml-cuda/scale.cuh +5 -0
  1228. data/vendor/ggml/src/ggml-cuda/set-rows.cu +330 -0
  1229. data/vendor/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  1230. data/vendor/ggml/src/ggml-cuda/set.cu +39 -0
  1231. data/vendor/ggml/src/ggml-cuda/set.cuh +7 -0
  1232. data/vendor/ggml/src/ggml-cuda/snake.cu +72 -0
  1233. data/vendor/ggml/src/ggml-cuda/snake.cuh +8 -0
  1234. data/vendor/ggml/src/ggml-cuda/softcap.cu +34 -0
  1235. data/vendor/ggml/src/ggml-cuda/softcap.cuh +5 -0
  1236. data/vendor/ggml/src/ggml-cuda/softmax.cu +472 -0
  1237. data/vendor/ggml/src/ggml-cuda/softmax.cuh +7 -0
  1238. data/vendor/ggml/src/ggml-cuda/solve_tri.cu +275 -0
  1239. data/vendor/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
  1240. data/vendor/ggml/src/ggml-cuda/ssm-conv.cu +197 -0
  1241. data/vendor/ggml/src/ggml-cuda/ssm-conv.cuh +3 -0
  1242. data/vendor/ggml/src/ggml-cuda/ssm-scan.cu +342 -0
  1243. data/vendor/ggml/src/ggml-cuda/ssm-scan.cuh +3 -0
  1244. data/vendor/ggml/src/ggml-cuda/sum.cu +41 -0
  1245. data/vendor/ggml/src/ggml-cuda/sum.cuh +5 -0
  1246. data/vendor/ggml/src/ggml-cuda/sumrows.cu +43 -0
  1247. data/vendor/ggml/src/ggml-cuda/sumrows.cuh +4 -0
  1248. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +6 -0
  1249. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +6 -0
  1250. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +12 -0
  1251. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +10 -0
  1252. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +10 -0
  1253. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +12 -0
  1254. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +6 -0
  1255. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +6 -0
  1256. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +12 -0
  1257. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +12 -0
  1258. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +10 -0
  1259. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +10 -0
  1260. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +6 -0
  1261. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +10 -0
  1262. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +12 -0
  1263. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +12 -0
  1264. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +10 -0
  1265. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +10 -0
  1266. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +10 -0
  1267. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +12 -0
  1268. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +12 -0
  1269. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  1270. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  1271. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
  1272. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  1273. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
  1274. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  1275. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
  1276. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  1277. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  1278. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  1279. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  1280. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  1281. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
  1282. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
  1283. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
  1284. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
  1285. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
  1286. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
  1287. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
  1288. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
  1289. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  1290. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  1291. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  1292. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  1293. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  1294. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  1295. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
  1296. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  1297. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  1298. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  1299. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  1300. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  1301. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  1302. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
  1303. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  1304. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  1305. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  1306. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  1307. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  1308. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  1309. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
  1310. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  1311. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  1312. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  1313. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  1314. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  1315. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  1316. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
  1317. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  1318. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  1319. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  1320. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  1321. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  1322. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  1323. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
  1324. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  1325. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  1326. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  1327. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  1328. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  1329. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  1330. data/vendor/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +110 -0
  1331. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  1332. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  1333. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  1334. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  1335. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  1336. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  1337. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  1338. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  1339. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  1340. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  1341. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  1342. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  1343. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  1344. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  1345. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  1346. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  1347. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +5 -0
  1348. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +5 -0
  1349. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +5 -0
  1350. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +5 -0
  1351. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +5 -0
  1352. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +5 -0
  1353. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +5 -0
  1354. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +5 -0
  1355. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  1356. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
  1357. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
  1358. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  1359. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  1360. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  1361. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  1362. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  1363. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  1364. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  1365. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  1366. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  1367. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  1368. data/vendor/ggml/src/ggml-cuda/top-k.cu +95 -0
  1369. data/vendor/ggml/src/ggml-cuda/top-k.cuh +3 -0
  1370. data/vendor/ggml/src/ggml-cuda/topk-moe.cu +415 -0
  1371. data/vendor/ggml/src/ggml-cuda/topk-moe.cuh +27 -0
  1372. data/vendor/ggml/src/ggml-cuda/tri.cu +136 -0
  1373. data/vendor/ggml/src/ggml-cuda/tri.cuh +5 -0
  1374. data/vendor/ggml/src/ggml-cuda/tsembd.cu +47 -0
  1375. data/vendor/ggml/src/ggml-cuda/tsembd.cuh +5 -0
  1376. data/vendor/ggml/src/ggml-cuda/unary.cu +640 -0
  1377. data/vendor/ggml/src/ggml-cuda/unary.cuh +114 -0
  1378. data/vendor/ggml/src/ggml-cuda/upscale.cu +293 -0
  1379. data/vendor/ggml/src/ggml-cuda/upscale.cuh +5 -0
  1380. data/vendor/ggml/src/ggml-cuda/vecdotq.cuh +1317 -0
  1381. data/vendor/ggml/src/ggml-cuda/vendors/cuda.h +28 -0
  1382. data/vendor/ggml/src/ggml-cuda/vendors/hip.h +304 -0
  1383. data/vendor/ggml/src/ggml-cuda/vendors/musa.h +150 -0
  1384. data/vendor/ggml/src/ggml-cuda/wkv.cu +199 -0
  1385. data/vendor/ggml/src/ggml-cuda/wkv.cuh +7 -0
  1386. data/vendor/ggml/src/ggml-hexagon/CMakeLists.txt +118 -0
  1387. data/vendor/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3680 -0
  1388. data/vendor/ggml/src/ggml-hexagon/htp/CMakeLists.txt +78 -0
  1389. data/vendor/ggml/src/ggml-hexagon/htp/act-ops.c +782 -0
  1390. data/vendor/ggml/src/ggml-hexagon/htp/argsort-ops.c +293 -0
  1391. data/vendor/ggml/src/ggml-hexagon/htp/binary-ops.c +872 -0
  1392. data/vendor/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  1393. data/vendor/ggml/src/ggml-hexagon/htp/cpy-ops.c +275 -0
  1394. data/vendor/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
  1395. data/vendor/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
  1396. data/vendor/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
  1397. data/vendor/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +727 -0
  1398. data/vendor/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +955 -0
  1399. data/vendor/ggml/src/ggml-hexagon/htp/get-rows-ops.c +124 -0
  1400. data/vendor/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
  1401. data/vendor/ggml/src/ggml-hexagon/htp/hex-dma.h +372 -0
  1402. data/vendor/ggml/src/ggml-hexagon/htp/hex-dump.h +86 -0
  1403. data/vendor/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  1404. data/vendor/ggml/src/ggml-hexagon/htp/hex-utils.h +137 -0
  1405. data/vendor/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1841 -0
  1406. data/vendor/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +1785 -0
  1407. data/vendor/ggml/src/ggml-hexagon/htp/hmx-ops.h +71 -0
  1408. data/vendor/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
  1409. data/vendor/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
  1410. data/vendor/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
  1411. data/vendor/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
  1412. data/vendor/ggml/src/ggml-hexagon/htp/htp-ctx.h +111 -0
  1413. data/vendor/ggml/src/ggml-hexagon/htp/htp-ops.h +181 -0
  1414. data/vendor/ggml/src/ggml-hexagon/htp/htp_iface.idl +22 -0
  1415. data/vendor/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  1416. data/vendor/ggml/src/ggml-hexagon/htp/hvx-base.h +308 -0
  1417. data/vendor/ggml/src/ggml-hexagon/htp/hvx-copy.h +262 -0
  1418. data/vendor/ggml/src/ggml-hexagon/htp/hvx-div.h +291 -0
  1419. data/vendor/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  1420. data/vendor/ggml/src/ggml-hexagon/htp/hvx-exp.h +216 -0
  1421. data/vendor/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  1422. data/vendor/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  1423. data/vendor/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  1424. data/vendor/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
  1425. data/vendor/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  1426. data/vendor/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +142 -0
  1427. data/vendor/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  1428. data/vendor/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  1429. data/vendor/ggml/src/ggml-hexagon/htp/hvx-utils.h +19 -0
  1430. data/vendor/ggml/src/ggml-hexagon/htp/main.c +880 -0
  1431. data/vendor/ggml/src/ggml-hexagon/htp/matmul-ops.c +3173 -0
  1432. data/vendor/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
  1433. data/vendor/ggml/src/ggml-hexagon/htp/rope-ops.c +494 -0
  1434. data/vendor/ggml/src/ggml-hexagon/htp/set-rows-ops.c +184 -0
  1435. data/vendor/ggml/src/ggml-hexagon/htp/softmax-ops.c +407 -0
  1436. data/vendor/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
  1437. data/vendor/ggml/src/ggml-hexagon/htp/ssm-conv.c +340 -0
  1438. data/vendor/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  1439. data/vendor/ggml/src/ggml-hexagon/htp/unary-ops.c +657 -0
  1440. data/vendor/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
  1441. data/vendor/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
  1442. data/vendor/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  1443. data/vendor/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  1444. data/vendor/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  1445. data/vendor/ggml/src/ggml-hexagon/libdl.h +79 -0
  1446. data/vendor/ggml/src/ggml-hexagon/libggml-htp.inf +40 -0
  1447. data/vendor/ggml/src/ggml-hexagon/op-desc.h +153 -0
  1448. data/vendor/ggml/src/ggml-hip/CMakeLists.txt +157 -0
  1449. data/vendor/ggml/src/ggml-impl.h +783 -0
  1450. data/vendor/ggml/src/ggml-metal/CMakeLists.txt +124 -0
  1451. data/vendor/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
  1452. data/vendor/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  1453. data/vendor/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
  1454. data/vendor/ggml/src/ggml-metal/ggml-metal-context.m +739 -0
  1455. data/vendor/ggml/src/ggml-metal/ggml-metal-device.cpp +2053 -0
  1456. data/vendor/ggml/src/ggml-metal/ggml-metal-device.h +296 -0
  1457. data/vendor/ggml/src/ggml-metal/ggml-metal-device.m +1829 -0
  1458. data/vendor/ggml/src/ggml-metal/ggml-metal-impl.h +1175 -0
  1459. data/vendor/ggml/src/ggml-metal/ggml-metal-ops.cpp +4606 -0
  1460. data/vendor/ggml/src/ggml-metal/ggml-metal-ops.h +97 -0
  1461. data/vendor/ggml/src/ggml-metal/ggml-metal.cpp +950 -0
  1462. data/vendor/ggml/src/ggml-metal/ggml-metal.metal +10679 -0
  1463. data/vendor/ggml/src/ggml-musa/CMakeLists.txt +124 -0
  1464. data/vendor/ggml/src/ggml-musa/mudnn.cu +112 -0
  1465. data/vendor/ggml/src/ggml-musa/mudnn.cuh +12 -0
  1466. data/vendor/ggml/src/ggml-opencl/CMakeLists.txt +189 -0
  1467. data/vendor/ggml/src/ggml-opencl/ggml-opencl.cpp +16374 -0
  1468. data/vendor/ggml/src/ggml-opencl/kernels/add.cl +190 -0
  1469. data/vendor/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  1470. data/vendor/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  1471. data/vendor/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  1472. data/vendor/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
  1473. data/vendor/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  1474. data/vendor/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  1475. data/vendor/ggml/src/ggml-opencl/kernels/cpy.cl +229 -0
  1476. data/vendor/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  1477. data/vendor/ggml/src/ggml-opencl/kernels/cvt.cl +1471 -0
  1478. data/vendor/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  1479. data/vendor/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  1480. data/vendor/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  1481. data/vendor/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  1482. data/vendor/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  1483. data/vendor/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
  1484. data/vendor/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  1485. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  1486. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  1487. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  1488. data/vendor/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
  1489. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  1490. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +302 -0
  1491. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +252 -0
  1492. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +254 -0
  1493. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +256 -0
  1494. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +258 -0
  1495. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
  1496. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_0_f32.cl +139 -0
  1497. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  1498. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
  1499. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
  1500. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
  1501. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q8_0_f32.cl +129 -0
  1502. data/vendor/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
  1503. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  1504. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +161 -0
  1505. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +116 -0
  1506. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +119 -0
  1507. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +119 -0
  1508. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +121 -0
  1509. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
  1510. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_0_f32.cl +274 -0
  1511. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_0_f32_spec.cl +268 -0
  1512. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  1513. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
  1514. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
  1515. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
  1516. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl +195 -0
  1517. data/vendor/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
  1518. data/vendor/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  1519. data/vendor/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  1520. data/vendor/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  1521. data/vendor/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  1522. data/vendor/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  1523. data/vendor/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
  1524. data/vendor/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
  1525. data/vendor/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
  1526. data/vendor/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
  1527. data/vendor/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  1528. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  1529. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  1530. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  1531. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
  1532. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  1533. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  1534. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
  1535. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
  1536. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  1537. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  1538. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  1539. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  1540. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  1541. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  1542. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  1543. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  1544. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  1545. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  1546. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  1547. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  1548. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
  1549. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
  1550. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  1551. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  1552. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  1553. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  1554. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  1555. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  1556. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  1557. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  1558. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  1559. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  1560. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
  1561. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
  1562. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
  1563. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
  1564. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  1565. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  1566. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  1567. data/vendor/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  1568. data/vendor/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
  1569. data/vendor/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
  1570. data/vendor/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  1571. data/vendor/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
  1572. data/vendor/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
  1573. data/vendor/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
  1574. data/vendor/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
  1575. data/vendor/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  1576. data/vendor/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  1577. data/vendor/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  1578. data/vendor/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
  1579. data/vendor/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
  1580. data/vendor/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
  1581. data/vendor/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
  1582. data/vendor/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
  1583. data/vendor/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  1584. data/vendor/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  1585. data/vendor/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  1586. data/vendor/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  1587. data/vendor/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  1588. data/vendor/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
  1589. data/vendor/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
  1590. data/vendor/ggml/src/ggml-opencl/kernels/transpose.cl +143 -0
  1591. data/vendor/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  1592. data/vendor/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  1593. data/vendor/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  1594. data/vendor/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  1595. data/vendor/ggml/src/ggml-openvino/ggml-decoder.cpp +985 -0
  1596. data/vendor/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  1597. data/vendor/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +380 -0
  1598. data/vendor/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  1599. data/vendor/ggml/src/ggml-openvino/ggml-openvino.cpp +1132 -0
  1600. data/vendor/ggml/src/ggml-openvino/ggml-quants.cpp +956 -0
  1601. data/vendor/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  1602. data/vendor/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  1603. data/vendor/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  1604. data/vendor/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  1605. data/vendor/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  1606. data/vendor/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  1607. data/vendor/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  1608. data/vendor/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  1609. data/vendor/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  1610. data/vendor/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  1611. data/vendor/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  1612. data/vendor/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  1613. data/vendor/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  1614. data/vendor/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  1615. data/vendor/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  1616. data/vendor/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  1617. data/vendor/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  1618. data/vendor/ggml/src/ggml-openvino/openvino/op/rope.cpp +149 -0
  1619. data/vendor/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  1620. data/vendor/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  1621. data/vendor/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  1622. data/vendor/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  1623. data/vendor/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
  1624. data/vendor/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  1625. data/vendor/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  1626. data/vendor/ggml/src/ggml-openvino/openvino/op_table.cpp +47 -0
  1627. data/vendor/ggml/src/ggml-openvino/openvino/op_table.h +40 -0
  1628. data/vendor/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  1629. data/vendor/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  1630. data/vendor/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  1631. data/vendor/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  1632. data/vendor/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  1633. data/vendor/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
  1634. data/vendor/ggml/src/ggml-openvino/openvino/translate_session.cpp +317 -0
  1635. data/vendor/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  1636. data/vendor/ggml/src/ggml-openvino/openvino/utils.cpp +257 -0
  1637. data/vendor/ggml/src/ggml-openvino/openvino/utils.h +86 -0
  1638. data/vendor/ggml/src/ggml-openvino/utils.cpp +880 -0
  1639. data/vendor/ggml/src/ggml-openvino/utils.h +143 -0
  1640. data/vendor/ggml/src/ggml-opt.cpp +1094 -0
  1641. data/vendor/ggml/src/ggml-quants.c +5491 -0
  1642. data/vendor/ggml/src/ggml-quants.h +112 -0
  1643. data/vendor/ggml/src/ggml-rpc/CMakeLists.txt +33 -0
  1644. data/vendor/ggml/src/ggml-rpc/ggml-rpc.cpp +1974 -0
  1645. data/vendor/ggml/src/ggml-rpc/transport.cpp +683 -0
  1646. data/vendor/ggml/src/ggml-rpc/transport.h +34 -0
  1647. data/vendor/ggml/src/ggml-sycl/CMakeLists.txt +207 -0
  1648. data/vendor/ggml/src/ggml-sycl/add-id.cpp +81 -0
  1649. data/vendor/ggml/src/ggml-sycl/add-id.hpp +8 -0
  1650. data/vendor/ggml/src/ggml-sycl/backend.hpp +48 -0
  1651. data/vendor/ggml/src/ggml-sycl/binbcast.cpp +346 -0
  1652. data/vendor/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  1653. data/vendor/ggml/src/ggml-sycl/common.cpp +155 -0
  1654. data/vendor/ggml/src/ggml-sycl/common.hpp +1002 -0
  1655. data/vendor/ggml/src/ggml-sycl/concat.cpp +202 -0
  1656. data/vendor/ggml/src/ggml-sycl/concat.hpp +20 -0
  1657. data/vendor/ggml/src/ggml-sycl/conv.cpp +101 -0
  1658. data/vendor/ggml/src/ggml-sycl/conv.hpp +20 -0
  1659. data/vendor/ggml/src/ggml-sycl/convert.cpp +825 -0
  1660. data/vendor/ggml/src/ggml-sycl/convert.hpp +64 -0
  1661. data/vendor/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  1662. data/vendor/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  1663. data/vendor/ggml/src/ggml-sycl/cpy.cpp +602 -0
  1664. data/vendor/ggml/src/ggml-sycl/cpy.hpp +223 -0
  1665. data/vendor/ggml/src/ggml-sycl/cumsum.cpp +148 -0
  1666. data/vendor/ggml/src/ggml-sycl/cumsum.hpp +5 -0
  1667. data/vendor/ggml/src/ggml-sycl/dequantize.hpp +975 -0
  1668. data/vendor/ggml/src/ggml-sycl/diag.cpp +67 -0
  1669. data/vendor/ggml/src/ggml-sycl/diag.hpp +5 -0
  1670. data/vendor/ggml/src/ggml-sycl/dmmv.cpp +1579 -0
  1671. data/vendor/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  1672. data/vendor/ggml/src/ggml-sycl/dpct/helper.hpp +3774 -0
  1673. data/vendor/ggml/src/ggml-sycl/element_wise.cpp +1124 -0
  1674. data/vendor/ggml/src/ggml-sycl/element_wise.hpp +94 -0
  1675. data/vendor/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
  1676. data/vendor/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
  1677. data/vendor/ggml/src/ggml-sycl/fattn-common.hpp +1181 -0
  1678. data/vendor/ggml/src/ggml-sycl/fattn-tile.cpp +59 -0
  1679. data/vendor/ggml/src/ggml-sycl/fattn-tile.hpp +1246 -0
  1680. data/vendor/ggml/src/ggml-sycl/fattn-vec.hpp +674 -0
  1681. data/vendor/ggml/src/ggml-sycl/fattn.cpp +227 -0
  1682. data/vendor/ggml/src/ggml-sycl/fattn.hpp +22 -0
  1683. data/vendor/ggml/src/ggml-sycl/fill.cpp +55 -0
  1684. data/vendor/ggml/src/ggml-sycl/fill.hpp +5 -0
  1685. data/vendor/ggml/src/ggml-sycl/gated_delta_net.cpp +307 -0
  1686. data/vendor/ggml/src/ggml-sycl/gated_delta_net.hpp +9 -0
  1687. data/vendor/ggml/src/ggml-sycl/gemm.hpp +93 -0
  1688. data/vendor/ggml/src/ggml-sycl/getrows.cpp +219 -0
  1689. data/vendor/ggml/src/ggml-sycl/getrows.hpp +20 -0
  1690. data/vendor/ggml/src/ggml-sycl/ggml-sycl.cpp +5520 -0
  1691. data/vendor/ggml/src/ggml-sycl/gla.cpp +106 -0
  1692. data/vendor/ggml/src/ggml-sycl/gla.hpp +8 -0
  1693. data/vendor/ggml/src/ggml-sycl/im2col.cpp +400 -0
  1694. data/vendor/ggml/src/ggml-sycl/im2col.hpp +23 -0
  1695. data/vendor/ggml/src/ggml-sycl/mmq.cpp +3030 -0
  1696. data/vendor/ggml/src/ggml-sycl/mmq.hpp +33 -0
  1697. data/vendor/ggml/src/ggml-sycl/mmvq.cpp +1380 -0
  1698. data/vendor/ggml/src/ggml-sycl/mmvq.hpp +43 -0
  1699. data/vendor/ggml/src/ggml-sycl/norm.cpp +656 -0
  1700. data/vendor/ggml/src/ggml-sycl/norm.hpp +28 -0
  1701. data/vendor/ggml/src/ggml-sycl/outprod.cpp +47 -0
  1702. data/vendor/ggml/src/ggml-sycl/outprod.hpp +10 -0
  1703. data/vendor/ggml/src/ggml-sycl/pad.cpp +97 -0
  1704. data/vendor/ggml/src/ggml-sycl/pad.hpp +24 -0
  1705. data/vendor/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
  1706. data/vendor/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
  1707. data/vendor/ggml/src/ggml-sycl/presets.hpp +79 -0
  1708. data/vendor/ggml/src/ggml-sycl/quantize.hpp +133 -0
  1709. data/vendor/ggml/src/ggml-sycl/quants.hpp +156 -0
  1710. data/vendor/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  1711. data/vendor/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  1712. data/vendor/ggml/src/ggml-sycl/roll.cpp +122 -0
  1713. data/vendor/ggml/src/ggml-sycl/roll.hpp +20 -0
  1714. data/vendor/ggml/src/ggml-sycl/rope.cpp +641 -0
  1715. data/vendor/ggml/src/ggml-sycl/rope.hpp +26 -0
  1716. data/vendor/ggml/src/ggml-sycl/set.cpp +73 -0
  1717. data/vendor/ggml/src/ggml-sycl/set.hpp +5 -0
  1718. data/vendor/ggml/src/ggml-sycl/set_rows.cpp +240 -0
  1719. data/vendor/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  1720. data/vendor/ggml/src/ggml-sycl/softmax.cpp +426 -0
  1721. data/vendor/ggml/src/ggml-sycl/softmax.hpp +24 -0
  1722. data/vendor/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
  1723. data/vendor/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
  1724. data/vendor/ggml/src/ggml-sycl/ssm_conv.cpp +132 -0
  1725. data/vendor/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  1726. data/vendor/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
  1727. data/vendor/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
  1728. data/vendor/ggml/src/ggml-sycl/sycl_hw.cpp +67 -0
  1729. data/vendor/ggml/src/ggml-sycl/sycl_hw.hpp +38 -0
  1730. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  1731. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  1732. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  1733. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  1734. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
  1735. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  1736. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  1737. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  1738. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  1739. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  1740. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +8 -0
  1741. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +8 -0
  1742. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +8 -0
  1743. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +8 -0
  1744. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +8 -0
  1745. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +8 -0
  1746. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +8 -0
  1747. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +8 -0
  1748. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +8 -0
  1749. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +8 -0
  1750. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +8 -0
  1751. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +8 -0
  1752. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +8 -0
  1753. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +8 -0
  1754. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +8 -0
  1755. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +8 -0
  1756. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +8 -0
  1757. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +8 -0
  1758. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +8 -0
  1759. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +8 -0
  1760. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +8 -0
  1761. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +8 -0
  1762. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +8 -0
  1763. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +8 -0
  1764. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +8 -0
  1765. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +8 -0
  1766. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +8 -0
  1767. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +8 -0
  1768. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +8 -0
  1769. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +8 -0
  1770. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +8 -0
  1771. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +8 -0
  1772. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +8 -0
  1773. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +8 -0
  1774. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +8 -0
  1775. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +8 -0
  1776. data/vendor/ggml/src/ggml-sycl/tsembd.cpp +73 -0
  1777. data/vendor/ggml/src/ggml-sycl/tsembd.hpp +20 -0
  1778. data/vendor/ggml/src/ggml-sycl/type.hpp +112 -0
  1779. data/vendor/ggml/src/ggml-sycl/upscale.cpp +410 -0
  1780. data/vendor/ggml/src/ggml-sycl/upscale.hpp +9 -0
  1781. data/vendor/ggml/src/ggml-sycl/vecdotq.hpp +1508 -0
  1782. data/vendor/ggml/src/ggml-sycl/wkv.cpp +293 -0
  1783. data/vendor/ggml/src/ggml-sycl/wkv.hpp +10 -0
  1784. data/vendor/ggml/src/ggml-threading.cpp +12 -0
  1785. data/vendor/ggml/src/ggml-threading.h +14 -0
  1786. data/vendor/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  1787. data/vendor/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  1788. data/vendor/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  1789. data/vendor/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  1790. data/vendor/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  1791. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  1792. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  1793. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  1794. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  1795. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  1796. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  1797. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  1798. data/vendor/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  1799. data/vendor/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  1800. data/vendor/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  1801. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  1802. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  1803. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  1804. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  1805. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  1806. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  1807. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +123 -0
  1808. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +160 -0
  1809. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  1810. data/vendor/ggml/src/ggml-virtgpu/ggml-backend.cpp +71 -0
  1811. data/vendor/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  1812. data/vendor/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  1813. data/vendor/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  1814. data/vendor/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  1815. data/vendor/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  1816. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  1817. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  1818. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  1819. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  1820. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  1821. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  1822. data/vendor/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +99 -0
  1823. data/vendor/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  1824. data/vendor/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  1825. data/vendor/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  1826. data/vendor/ggml/src/ggml-virtgpu/virtgpu.cpp +545 -0
  1827. data/vendor/ggml/src/ggml-virtgpu/virtgpu.h +115 -0
  1828. data/vendor/ggml/src/ggml-vulkan/CMakeLists.txt +220 -0
  1829. data/vendor/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in +15 -0
  1830. data/vendor/ggml/src/ggml-vulkan/ggml-vulkan.cpp +17208 -0
  1831. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +31 -0
  1832. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  1833. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +37 -0
  1834. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +69 -0
  1835. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  1836. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  1837. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  1838. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +60 -0
  1839. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +86 -0
  1840. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  1841. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  1842. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +17 -0
  1843. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +41 -0
  1844. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +49 -0
  1845. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +105 -0
  1846. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +347 -0
  1847. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  1848. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +23 -0
  1849. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +51 -0
  1850. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +320 -0
  1851. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  1852. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +17 -0
  1853. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +31 -0
  1854. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  1855. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
  1856. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  1857. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  1858. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +20 -0
  1859. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +653 -0
  1860. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +768 -0
  1861. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.glsl +13 -0
  1862. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +42 -0
  1863. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +35 -0
  1864. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +44 -0
  1865. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +43 -0
  1866. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +49 -0
  1867. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +40 -0
  1868. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +51 -0
  1869. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +32 -0
  1870. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +34 -0
  1871. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  1872. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
  1873. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
  1874. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +34 -0
  1875. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +42 -0
  1876. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +30 -0
  1877. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +32 -0
  1878. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +68 -0
  1879. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +34 -0
  1880. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +35 -0
  1881. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +70 -0
  1882. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +33 -0
  1883. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +31 -0
  1884. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +28 -0
  1885. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +34 -0
  1886. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +27 -0
  1887. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  1888. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +20 -0
  1889. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/bfloat16.comp +7 -0
  1890. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat.comp +7 -0
  1891. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2.comp +7 -0
  1892. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/integer_dot.comp +7 -0
  1893. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  1894. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +756 -0
  1895. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +255 -0
  1896. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +626 -0
  1897. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +427 -0
  1898. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +123 -0
  1899. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  1900. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
  1901. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +121 -0
  1902. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  1903. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +190 -0
  1904. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  1905. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  1906. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  1907. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +25 -0
  1908. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  1909. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +23 -0
  1910. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +65 -0
  1911. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +11 -0
  1912. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.glsl +83 -0
  1913. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +42 -0
  1914. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +51 -0
  1915. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +28 -0
  1916. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +39 -0
  1917. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +66 -0
  1918. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  1919. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  1920. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +93 -0
  1921. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +124 -0
  1922. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +44 -0
  1923. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +22 -0
  1924. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +17 -0
  1925. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +27 -0
  1926. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +48 -0
  1927. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +169 -0
  1928. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +230 -0
  1929. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
  1930. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +132 -0
  1931. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +95 -0
  1932. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +90 -0
  1933. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +105 -0
  1934. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +87 -0
  1935. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +90 -0
  1936. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +88 -0
  1937. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +124 -0
  1938. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +156 -0
  1939. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +128 -0
  1940. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +132 -0
  1941. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +134 -0
  1942. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +165 -0
  1943. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +130 -0
  1944. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +143 -0
  1945. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +503 -0
  1946. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +464 -0
  1947. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +624 -0
  1948. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +600 -0
  1949. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +74 -0
  1950. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +311 -0
  1951. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
  1952. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +93 -0
  1953. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +194 -0
  1954. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  1955. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +44 -0
  1956. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +42 -0
  1957. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  1958. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +64 -0
  1959. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +74 -0
  1960. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +127 -0
  1961. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  1962. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +21 -0
  1963. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +26 -0
  1964. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +37 -0
  1965. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +150 -0
  1966. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +55 -0
  1967. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  1968. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  1969. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +207 -0
  1970. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +19 -0
  1971. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +17 -0
  1972. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +17 -0
  1973. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +17 -0
  1974. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +31 -0
  1975. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +17 -0
  1976. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  1977. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +24 -0
  1978. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  1979. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +20 -0
  1980. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +22 -0
  1981. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +26 -0
  1982. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +17 -0
  1983. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +195 -0
  1984. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +54 -0
  1985. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
  1986. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
  1987. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
  1988. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
  1989. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  1990. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
  1991. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  1992. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +17 -0
  1993. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +50 -0
  1994. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
  1995. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  1996. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +29 -0
  1997. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +47 -0
  1998. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
  1999. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  2000. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  2001. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +20 -0
  2002. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +42 -0
  2003. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
  2004. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
  2005. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
  2006. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +42 -0
  2007. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  2008. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +1846 -0
  2009. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +178 -0
  2010. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/utils.glsl +25 -0
  2011. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +1183 -0
  2012. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp +87 -0
  2013. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp +91 -0
  2014. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  2015. data/vendor/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  2016. data/vendor/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +3231 -0
  2017. data/vendor/ggml/src/ggml-webgpu/ggml-webgpu.cpp +4461 -0
  2018. data/vendor/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  2019. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
  2020. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  2021. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  2022. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  2023. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +139 -0
  2024. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +905 -0
  2025. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
  2026. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
  2027. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +81 -0
  2028. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  2029. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +89 -0
  2030. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +706 -0
  2031. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +351 -0
  2032. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
  2033. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
  2034. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +720 -0
  2035. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +132 -0
  2036. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +773 -0
  2037. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
  2038. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
  2039. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  2040. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +747 -0
  2041. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +1210 -0
  2042. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
  2043. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +55 -0
  2044. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
  2045. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +149 -0
  2046. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +200 -0
  2047. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +133 -0
  2048. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1433 -0
  2049. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  2050. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  2051. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
  2052. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/rope.wgsl +224 -0
  2053. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
  2054. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
  2055. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
  2056. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
  2057. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.wgsl +245 -0
  2058. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
  2059. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
  2060. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
  2061. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  2062. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +210 -0
  2063. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
  2064. data/vendor/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  2065. data/vendor/ggml/src/ggml-zdnn/common.hpp +59 -0
  2066. data/vendor/ggml/src/ggml-zdnn/ggml-zdnn.cpp +637 -0
  2067. data/vendor/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  2068. data/vendor/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  2069. data/vendor/ggml/src/ggml-zdnn/utils.cpp +79 -0
  2070. data/vendor/ggml/src/ggml-zdnn/utils.hpp +19 -0
  2071. data/vendor/ggml/src/ggml-zendnn/CMakeLists.txt +91 -0
  2072. data/vendor/ggml/src/ggml-zendnn/ggml-zendnn.cpp +669 -0
  2073. data/vendor/ggml/src/ggml.c +7777 -0
  2074. data/vendor/ggml/src/ggml.cpp +26 -0
  2075. data/vendor/ggml/src/gguf.cpp +1556 -0
  2076. data/vendor/ggml/tests/CMakeLists.txt +356 -0
  2077. data/vendor/ggml/tests/test-arange.cpp +100 -0
  2078. data/vendor/ggml/tests/test-backend-ops.cpp +9786 -0
  2079. data/vendor/ggml/tests/test-cont.c +170 -0
  2080. data/vendor/ggml/tests/test-conv-transpose-1d.cpp +691 -0
  2081. data/vendor/ggml/tests/test-conv-transpose.c +248 -0
  2082. data/vendor/ggml/tests/test-conv1d-dw-c1.cpp +243 -0
  2083. data/vendor/ggml/tests/test-conv1d-dw-c2.cpp +243 -0
  2084. data/vendor/ggml/tests/test-conv1d.cpp +289 -0
  2085. data/vendor/ggml/tests/test-conv2d-dw.cpp +153 -0
  2086. data/vendor/ggml/tests/test-conv2d.cpp +391 -0
  2087. data/vendor/ggml/tests/test-customop.c +300 -0
  2088. data/vendor/ggml/tests/test-dup.c +111 -0
  2089. data/vendor/ggml/tests/test-interpolate.cpp +166 -0
  2090. data/vendor/ggml/tests/test-opt.cpp +1003 -0
  2091. data/vendor/ggml/tests/test-pad-reflect-1d.cpp +213 -0
  2092. data/vendor/ggml/tests/test-pool.c +274 -0
  2093. data/vendor/ggml/tests/test-quantize-fns.cpp +196 -0
  2094. data/vendor/ggml/tests/test-quantize-perf.cpp +356 -0
  2095. data/vendor/ggml/tests/test-rel-pos.c +87 -0
  2096. data/vendor/ggml/tests/test-roll.cpp +128 -0
  2097. data/vendor/ggml/tests/test-timestep_embedding.cpp +180 -0
  2098. data/vendor-patches/0001-cuda-buffer_from_ptr.patch +253 -0
  2099. data/vendor-patches/0002-cuda-buffer_from_ptr-reuse-iface.patch +117 -0
  2100. data/vendor-patches/0003-cuda-buffer_from_ptr-copy-mode.patch +128 -0
  2101. data/vendor-patches/0004-cuda-cpy-strided.patch +61 -0
  2102. data/vendor-patches/0005-concat-backward.patch +36 -0
  2103. data/vendor-patches/0006-getrows-back-large-vocab.patch +69 -0
  2104. data/vendor-patches/0007-gpt2-backward-kernels.patch +438 -0
  2105. data/vendor-patches/0008-mul-mat-backward-mixed-precision.patch +50 -0
  2106. data/vendor-patches/0009-sched-unsupported-node-diagnostic.patch +26 -0
  2107. metadata +2161 -0
@@ -0,0 +1,4176 @@
1
+ #pragma once
2
+
3
+ #include "common.cuh"
4
+ #include "vecdotq.cuh"
5
+ #include "mma.cuh"
6
+
7
+ #include <climits>
8
+ #include <cstdint>
9
+
10
+ using namespace ggml_cuda_mma;
11
+
12
+ #define MMQ_DP4A_MAX_BATCH_SIZE 64 // Max. batch size to use for dp4a MMQ kernels when FP16 tensor cores are available.
13
+ #define MMQ_ITER_K 256
14
+ #define MMQ_ITER_K_FP4 512
15
+ #define MMQ_NWARPS 8
16
+
17
// Function-pointer signatures of the per-quantization-type MMQ stages.

// Tile loader: same parameter list as the load_tiles_* templates in this file.
using load_tiles_mmq_t = void (*)(
    const char * __restrict__ x, int * x_tile, const int kbx0, const int i_max, const int stride);

// Dot-product stage: consumes int-packed x and y data and accumulates into sum; k00 is an offset along K.
using vec_dot_mmq_t = void (*)(
    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00);

// Write-back stage: stores the accumulated sums to dst, bounded by i_max / j_max.
using mmq_write_back_t = void (*)(
    const float * __restrict__ sum, const int32_t * __restrict__ get_rows_to_sorted,
    float * __restrict__ dst, const int stride, const int i_max, const int j_max);
21
+
22
+ enum mmq_q8_1_ds_layout { // layout tag returned by mmq_get_q8_1_ds_layout() for a given x type
23
+ MMQ_Q8_1_DS_LAYOUT_D4, // presumably the block_q8_1_mmq::d4 member (name match) - confirm at the quantization call site
24
+ MMQ_Q8_1_DS_LAYOUT_DS4, // presumably the block_q8_1_mmq::ds4 member (name match) - confirm
25
+ MMQ_Q8_1_DS_LAYOUT_D2S6, // presumably the block_q8_1_mmq::d2s6 member (name match) - confirm
26
+ };
27
+
28
+ struct block_q8_1_mmq {
29
+ // The y float data is converted to a data layout that can simply be copied to shared memory as a contiguous block.
30
+ // The y float data is first grouped into blocks of 128 values.
31
+ // These blocks are then treated as individual data values and transposed.
32
+ //
33
+ // To avoid shared memory bank conflicts each block is padded with 16 bytes.
34
+ // This padding is also used to store the block scales/partial sums.
35
+ // Multiplying the quantized data by its scale recovers the unquantized values.
36
+ // The partial sums are obtained by summing up a subgroup of the contained values (prior to quantization)
37
+ // and are only needed for performance reasons.
38
+ //
39
+ // Which of the union members below is populated depends on the x data type.
40
+ union { // the three members alias the same 16 bytes
41
+ float d4[4]; // 1 32 bit scale per 32 values, stored as d0,d1,d2,d3
42
+ half2 ds4[4]; // 1 16 bit scale + 1 16 bit partial sum per 32 values, stored as d0,s0,d1,s1,d2,s2,d3,s3
43
+ half d2s6[8]; // 1 16 bit scale per 64 values + 1 16 bit partial sum per 16 values for the first 96 values,
44
+ // i.e. 2 scales followed by 6 partial sums (presumably d0,d1,s0,...,s5 - confirm against the quantization kernel)
45
+ };
46
+ int8_t qs[4*QK8_1]; // 128 values quantized to 8 bit each
47
+ };
48
+
49
+ // Block layout used for the FP4 data types (currently only used for Blackwell).
50
+ // mxfp4 has block size 32; each 32 bit element of d4 contains 2 e8m0 scales in the lower 16 bits.
51
+ // nvfp4 has block size 16; each 32 bit element of d4 contains 4 ue4m3 scales.
52
+ struct block_fp4_mmq {
53
+ uint32_t d4[4]; // packed block scales, 16 bytes in total
54
+ int8_t qs[4 * 32]; // 256 FP4 values packed as 4-bit pairs (2 per byte)
55
+ };
56
+
57
+ static_assert(sizeof(block_q8_1_mmq) == 4*QK8_1 + 4*sizeof(half2), "Unexpected block_q8_1_mmq size");
58
+ static_assert(sizeof(block_q8_1_mmq) == 4*sizeof(block_q8_1), "Unexpected block_q8_1_mmq size");
59
+ static_assert(sizeof(block_fp4_mmq) == sizeof(block_q8_1_mmq), "Unexpected block_fp4_mmq size");
60
+
61
// Returns the scale/partial-sum layout tag to use for the y data when the x data has type type_x.
// Types without an entry abort via GGML_ABORT.
static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) {
    switch (type_x) {
        // Types mapped to the D4 layout.
        case GGML_TYPE_Q1_0:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_MXFP4:
        case GGML_TYPE_NVFP4:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ2_S:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ4_NL:
            return MMQ_Q8_1_DS_LAYOUT_D4;

        // Types mapped to the DS4 layout.
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_IQ1_S:
            return MMQ_Q8_1_DS_LAYOUT_DS4;

        // Q2_K is the only type mapped to the D2S6 layout.
        case GGML_TYPE_Q2_K:
            return MMQ_Q8_1_DS_LAYOUT_D2S6;

        default:
            GGML_ABORT("fatal error");
            break;
    }
}
102
+
103
+ struct tile_x_sizes { // sizes of the regions of a dp4a x tile, as built by the MMQ_DP4A_TXS_* macros
104
+ int qs; // size of the leading int region; load_tiles_q1_0 places its float scales at x_qs + qs
105
+ int dm; // size of the second region - presumably block scales/minimums; units to be confirmed
106
+ int sc; // size of the third region; 0 in the Q4_0/Q4_1/Q8_0/Q8_0_16/Q8_1/Q2_K macro entries
107
+ };
108
+
109
// Host-side upper bound for the mmq_x tile dimension on a device with compute capability cc.
//   - 128 when Turing MMA or AMD WMMA is available,
//   - otherwise, on NVIDIA with a compiled arch of at least Volta: 128 if GGML_CUDA_FORCE_MMQ is defined,
//     else MMQ_DP4A_MAX_BATCH_SIZE,
//   - 64 in all remaining cases.
static int get_mmq_x_max_host(const int cc) {
    if (turing_mma_available(cc) || amd_wmma_available(cc)) {
        return 128;
    }

    const bool nvidia_volta_or_newer =
        GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
    if (!nvidia_volta_or_newer) {
        return 64;
    }

#ifdef GGML_CUDA_FORCE_MMQ
    return 128;
#else
    return MMQ_DP4A_MAX_BATCH_SIZE;
#endif // GGML_CUDA_FORCE_MMQ
}
118
+
119
// Device-side (compile-time) counterpart of get_mmq_x_max_host().
// The result is selected purely by preprocessor state of the current compilation pass.
static constexpr __device__ int get_mmq_x_max_device() {
#if defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    return 128;
#elif defined(GGML_USE_HIP)
    return 64;
#elif __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA && defined(GGML_CUDA_FORCE_MMQ)
    return 128;
#elif __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
    return MMQ_DP4A_MAX_BATCH_SIZE;
#else
    return 64;
#endif // defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
}
141
+
142
// Host-side value of the mmq_y tile dimension for compute capability cc:
// AMD -> 64 on RDNA1 and 128 otherwise; NVIDIA with a compiled arch of at least Volta -> 128; everything else -> 64.
static int get_mmq_y_host(const int cc) {
    if (GGML_CUDA_CC_IS_AMD(cc)) {
        return GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128;
    }

    const bool nvidia_volta_or_newer =
        GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
    return nvidia_volta_or_newer ? 128 : 64;
}
146
+
147
// Number of K values processed per iteration for the given x type.
// Only when BLACKWELL_MMA_AVAILABLE is defined do NVFP4/MXFP4 use MMQ_ITER_K_FP4; everything else uses MMQ_ITER_K.
static constexpr __device__ int get_iter_k([[maybe_unused]] const ggml_type type) {
#if defined(BLACKWELL_MMA_AVAILABLE)
    return (type == GGML_TYPE_NVFP4 || type == GGML_TYPE_MXFP4) ? MMQ_ITER_K_FP4 : MMQ_ITER_K;
#else
    return MMQ_ITER_K;
#endif // defined(BLACKWELL_MMA_AVAILABLE)
}
155
+
156
// Device-side (compile-time) counterpart of get_mmq_y_host():
// HIP -> 64 on RDNA1 and 128 otherwise; CUDA -> 128 from Volta on and 64 below.
static constexpr __device__ int get_mmq_y_device() {
#if defined(GGML_USE_HIP) && defined(RDNA1)
    return 64;
#elif defined(GGML_USE_HIP)
    return 128;
#elif __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
    return 128;
#else
    return 64;
#endif // defined(GGML_USE_HIP)
}
171
+
172
+ // Decouple shared memory tile sizes from WARP_SIZE to allow for different warp sizes.
173
+ // The K dimension of the tiles holds either
174
+ // 1*MMQ_TILE_NE_K==32 (always for TILE_Y_K) or 2*MMQ_TILE_NE_K==64 (typically for TILE_X_K)
175
+ // 32 bit elements of quantized data (scales are not included in this count).
176
+ // In other words, the size of the quantized data in the K dimension is a multiple of MMQ_TILE_NE_K.
177
+ // The final tile size in K direction is padded to avoid shared memory bank conflicts,
178
+ // in terms of 32 bit elements that means K % 2 == 1 for dp4a or K % 8 == 4 for mma.
179
+ #define MMQ_TILE_NE_K 32
180
+
181
+ #define MMQ_DP4A_TXS_Q4_0 tile_x_sizes{mmq_y*MMQ_TILE_NE_K + mmq_y, mmq_y*MMQ_TILE_NE_K/QI4_0 + mmq_y/QI4_0, 0}
182
+ #define MMQ_DP4A_TXS_Q4_1 tile_x_sizes{mmq_y*MMQ_TILE_NE_K + mmq_y, mmq_y*MMQ_TILE_NE_K/QI4_1 + mmq_y/QI4_1, 0}
183
+ #define MMQ_DP4A_TXS_Q8_0 tile_x_sizes{mmq_y*MMQ_TILE_NE_K*2 + mmq_y, mmq_y*MMQ_TILE_NE_K*2/QI8_0 + mmq_y/(QI8_0/2), 0}
184
+ #define MMQ_DP4A_TXS_Q8_0_16 tile_x_sizes{mmq_y*MMQ_TILE_NE_K*2 + mmq_y, mmq_y*MMQ_TILE_NE_K*4/QI8_0 + mmq_y/(QI8_0/4), 0}
185
+ #define MMQ_DP4A_TXS_Q8_1 tile_x_sizes{mmq_y*MMQ_TILE_NE_K*2 + mmq_y, mmq_y*MMQ_TILE_NE_K*2/QI8_1 + mmq_y/(QI8_1/2), 0}
186
+ #define MMQ_DP4A_TXS_Q2_K tile_x_sizes{mmq_y*MMQ_TILE_NE_K*2 + mmq_y, mmq_y*MMQ_TILE_NE_K + mmq_y, 0}
187
+ #define MMQ_DP4A_TXS_Q3_K tile_x_sizes{mmq_y*MMQ_TILE_NE_K*2 + mmq_y, mmq_y, mmq_y*MMQ_TILE_NE_K/8 + mmq_y/8}
188
+ #define MMQ_DP4A_TXS_Q4_K tile_x_sizes{mmq_y*MMQ_TILE_NE_K + mmq_y, mmq_y*MMQ_TILE_NE_K/QI4_K, mmq_y*MMQ_TILE_NE_K/8 + mmq_y/8}
189
+ #define MMQ_DP4A_TXS_Q5_K tile_x_sizes{mmq_y*MMQ_TILE_NE_K*2 + mmq_y, mmq_y*MMQ_TILE_NE_K/QI5_K + mmq_y/QI5_K, mmq_y*MMQ_TILE_NE_K/8 + mmq_y/8}
190
+ #define MMQ_DP4A_TXS_Q6_K tile_x_sizes{mmq_y*MMQ_TILE_NE_K*2 + mmq_y, mmq_y*MMQ_TILE_NE_K/QI6_K + mmq_y/QI6_K, mmq_y*MMQ_TILE_NE_K/8 + mmq_y/8}
191
+
192
// Returns the dp4a x tile region sizes for the given x type and tile height mmq_y.
// The MMQ_DP4A_TXS_* macros expand to tile_x_sizes initializers that read the local name mmq_y.
// Types without an entry yield an all-zero tile_x_sizes.
static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml_type type, int mmq_y) {
    switch (type) {
        // Types that have their own tile description.
        case GGML_TYPE_Q4_0: return MMQ_DP4A_TXS_Q4_0;
        case GGML_TYPE_Q4_1: return MMQ_DP4A_TXS_Q4_1;
        case GGML_TYPE_Q2_K: return MMQ_DP4A_TXS_Q2_K;
        case GGML_TYPE_Q3_K: return MMQ_DP4A_TXS_Q3_K;
        case GGML_TYPE_Q4_K: return MMQ_DP4A_TXS_Q4_K;
        case GGML_TYPE_Q5_K: return MMQ_DP4A_TXS_Q5_K;
        case GGML_TYPE_Q6_K: return MMQ_DP4A_TXS_Q6_K;

        // Types that reuse the Q8_0 tile description.
        case GGML_TYPE_Q1_0:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ1_S:
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ4_NL:
            return MMQ_DP4A_TXS_Q8_0;

        // Types that reuse the Q8_0_16 tile description.
        case GGML_TYPE_NVFP4:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ2_S:
            return MMQ_DP4A_TXS_Q8_0_16;

        // Types that reuse the Q8_1 tile description.
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_MXFP4:
            return MMQ_DP4A_TXS_Q8_1;

        default:
            return tile_x_sizes{0, 0, 0};
    }
}
218
+
219
+ #define MMQ_MMA_TILE_X_K_Q8_0 (2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0 + 4)
220
+ #define MMQ_MMA_TILE_X_K_FP4 (2*MMQ_TILE_NE_K + 8 + 4) // MXFP4 and NVFP4 Blackwell
221
+ #define MMQ_MMA_TILE_X_K_NVFP4 (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/2 + 4) // NVFP4 Generic
222
+ #define MMQ_MMA_TILE_X_K_Q8_1 (2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0 + 4)
223
+ #define MMQ_MMA_TILE_X_K_Q2_K (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K + 4)
224
+ #define MMQ_MMA_TILE_X_K_Q3_K (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/2 + 4)
225
+ #define MMQ_MMA_TILE_X_K_Q6_K (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/QI6_K + MMQ_TILE_NE_K/8 + 7)
226
+
227
+ static_assert(MMQ_MMA_TILE_X_K_Q8_0 % 8 == 4, "Wrong padding.");
228
+ static_assert(MMQ_MMA_TILE_X_K_Q8_1 % 8 == 4, "Wrong padding.");
229
+ static_assert(MMQ_MMA_TILE_X_K_Q2_K % 8 == 4, "Wrong padding.");
230
+ static_assert(MMQ_MMA_TILE_X_K_Q3_K % 8 == 4, "Wrong padding.");
231
+ static_assert(MMQ_MMA_TILE_X_K_Q6_K % 8 == 4, "Wrong padding.");
232
+ static_assert(MMQ_MMA_TILE_X_K_FP4 % 8 == 4, "Wrong padding.");
233
+ static_assert(MMQ_MMA_TILE_X_K_FP4 == MMQ_MMA_TILE_X_K_Q8_1, "Wrong tile size for MXFP4");
234
+ static_assert(MMQ_MMA_TILE_X_K_NVFP4 % 8 == 4, "Wrong padding.");
235
+
236
+
237
// Returns the padded K extent (in 32 bit elements) of one MMA x tile row for the given x type,
// or 0 for types without an entry.
static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
    switch (type) {
        // Types using the Q8_0 row size.
        case GGML_TYPE_Q1_0:
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ1_S:
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ4_NL:
            return MMQ_MMA_TILE_X_K_Q8_0;

        // Types using the Q8_1 row size (the Q8_1 and FP4 row sizes are equal, so MXFP4 is listed here).
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_MXFP4:
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
            return MMQ_MMA_TILE_X_K_Q8_1;

        // NVFP4 uses the FP4 row size when Blackwell MMA is available and its generic row size otherwise.
        case GGML_TYPE_NVFP4:
#if defined(BLACKWELL_MMA_AVAILABLE)
            return MMQ_MMA_TILE_X_K_FP4;
#else
            return MMQ_MMA_TILE_X_K_NVFP4;
#endif // defined(BLACKWELL_MMA_AVAILABLE)

        case GGML_TYPE_Q2_K:
            return MMQ_MMA_TILE_X_K_Q2_K;

        // Types using the Q3_K row size.
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ2_S:
            return MMQ_MMA_TILE_X_K_Q3_K;

        case GGML_TYPE_Q6_K:
            return MMQ_MMA_TILE_X_K_Q6_K;

        default:
            return 0;
    }
}
268
+
269
+ // block_q8_1_mmq has (128 8-bit ints == 32 32-bit ints + 4 32-bit scales)
270
+ #define MMQ_TILE_Y_K (MMQ_TILE_NE_K + MMQ_TILE_NE_K / QI8_1)
271
+ #define MMQ_TILE_Y_FP4_K MMQ_TILE_Y_K
272
+
273
// Host-side granularity for a given tile width mmq_x on a device with compute capability cc:
//   AMD MFMA/WMMA -> 32 if mmq_x >= 128, else 16
//   Turing MMA    -> 16 if mmq_x >= 48,  else 8
//   otherwise     -> 8
static int mmq_get_granularity_host(const int mmq_x, const int cc) {
    if (amd_mfma_available(cc) || amd_wmma_available(cc)) {
        return mmq_x >= 128 ? 32 : 16;
    }
    return (turing_mma_available(cc) && mmq_x >= 48) ? 16 : 8;
}
282
+
283
// Device-side (compile-time) counterpart of mmq_get_granularity_host().
// The branch is fixed by the MMA feature macros of the current compilation pass;
// mmq_x is unused when no MMA variant is available.
static constexpr __device__ int mmq_get_granularity_device([[maybe_unused]] const int mmq_x) {
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    return mmq_x >= 128 ? 32 : 16;
#elif defined(TURING_MMA_AVAILABLE)
    return mmq_x >= 48 ? 16 : 8;
#else
    return 8;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
}
296
+
297
// Host-side number of warps per MMQ thread block.
// HIP builds return 8 when AMD MFMA is available; in every other case the result is 256/warp_size.
// cc is only inspected in HIP builds.
static int mmq_get_nwarps_host([[maybe_unused]] const int cc, const int warp_size) {
#if defined(GGML_USE_HIP)
    if (amd_mfma_available(cc)) {
        return 8;
    }
#endif // defined(GGML_USE_HIP)
    return 256/warp_size;
}
306
+
307
// Device-side (compile-time) number of warps per MMQ thread block:
// 8 when AMD MFMA or WMMA is available, otherwise 256 divided by the physical warp size.
static constexpr __device__ int mmq_get_nwarps_device() {
#if !(defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE))
    return 256/ggml_cuda_get_physical_warp_size();
#else
    return 8;
#endif // !(defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE))
}
314
+
315
+ // ------------------------------------------------------------
316
+
317
// Fills the x tile with one MMQ_ITER_K-wide slice of Q1_0 data for rows [0, mmq_y).
//
// Every stored bit is expanded to a signed byte (+1 when the bit is set, -1 when it is clear) and four such
// bytes are packed per int; the result is written with the Q8_0 tile strides. The block scale d is written
// once per QK8_1 values.
//
//   x       base pointer to the block_q1_0 data
//   x_tile  destination tile: packed ints first, float scales after them
//   kbx0    block offset of this slice, added to the row offset
//   i_max   largest row index that may be read; only applied when need_check is true
//   stride  row pitch of x, in blocks
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_q1_0(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
    constexpr int nwarps    = mmq_get_nwarps_device();
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();

    int * x_qs = (int *) x_tile;
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    // MMA layout: packed data and scales of a row share one row of MMQ_MMA_TILE_X_K_Q8_0 elements.
    constexpr int qs_row_stride = MMQ_MMA_TILE_X_K_Q8_0;
    float * x_df = (float *) (x_qs + 2*MMQ_TILE_NE_K);
#else
    // dp4a layout: the scales start after the whole packed-data region of the Q8_0 tile description.
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q8_0, mmq_y);
    constexpr int qs_row_stride = 2*MMQ_TILE_NE_K + 1;
    float * x_df = (float *) (x_qs + txs.qs);
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)

    constexpr int blocks_per_iter         = MMQ_ITER_K / QK1_0;
    constexpr int threads_per_row         = blocks_per_iter * QI1_0;
    constexpr int nrows                   = warp_size / threads_per_row;
    constexpr int scale_entries_per_block = QK1_0 / QK8_1;
    constexpr int scale_entries_per_row   = blocks_per_iter * scale_entries_per_block;

    // Position of this thread within its row: which block, and which 32 bit group inside that block.
    const int lane_in_row = threadIdx.x % threads_per_row;
    const int kbx         = lane_in_row / QI1_0;
    const int kqsx        = lane_in_row % QI1_0;

    // First destination element (within a row) of the 8 ints produced by this thread.
    const int dst_offset = kbx*(scale_entries_per_block*QI8_0) + kqsx*QI8_0;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) {
        int i = i0 + threadIdx.y*nrows + threadIdx.x/threads_per_row;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q1_0 * bxi = (const block_q1_0 *) x + kbx0 + i*stride + kbx;

        // Assemble the 32 bits handled by this thread, lowest byte first.
        const int byte_base = 4*kqsx;
        int bits32 = 0;
#pragma unroll
        for (int b = 0; b < 4; ++b) {
            bits32 |= bxi->qs[byte_base + b] << (8*b);
        }

        // Each group of 4 bits becomes one int holding 4 signed bytes.
#pragma unroll
        for (int j = 0; j < 8; ++j) {
            const int nibble = (bits32 >> (4*j)) & 0x0F;

            int packed = 0;
#pragma unroll
            for (int b = 0; b < 4; ++b) {
                const int v = (nibble & (1 << b)) ? 1 : -1;
                packed |= (v & 0xFF) << (8*b);
            }

            x_qs[i*qs_row_stride + dst_offset + j] = packed;
        }
    }

    // Scale pass: one float per QK8_1 values, all entries of a block carrying that block's d.
    const int ksx         = threadIdx.x % scale_entries_per_row;
    const int scale_block = ksx / scale_entries_per_block;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + threadIdx.y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q1_0 * bxi = (const block_q1_0 *) x + kbx0 + i*stride + scale_block;

#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
        const int df_index = i*MMQ_MMA_TILE_X_K_Q8_0 + ksx;
#else
        const int df_index = i*(2*MMQ_TILE_NE_K/QI8_0) + i/(QI8_0/2) + ksx;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
        x_df[df_index] = bxi->d;
    }
}
397
+
398
// Loads a tile of Q4_0 blocks from x into the tile buffer x_tile.
//
// The tile buffer holds a region of quant ints followed by a region of float block scales.
// When an MMA path is available (MFMA / Turing MMA / WMMA) every packed source int is expanded
// into two ints (low nibbles, high nibbles) with 8 subtracted from each byte, using a row stride
// of MMQ_MMA_TILE_X_K_Q8_0. Otherwise the packed ints are stored unchanged with a row stride of
// MMQ_TILE_NE_K + 1.
//
//   x      - source data, reinterpreted as an array of block_q4_0
//   x_tile - destination tile buffer
//   kbx0   - block offset added to every row of x
//   i_max  - upper clamp for the row index, applied only when need_check is true
//   stride - distance between consecutive rows of x, in blocks
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
    constexpr int nwarps    = mmq_get_nwarps_device();
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();

    int * const quants = (int *) x_tile;
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    float * const scales = (float *) (quants + 2*MMQ_TILE_NE_K);
#else
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_0, mmq_y);
    float * const scales = (float *) (quants + txs.qs);
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)

    // Pass 1: quants. Each thread reads one packed int of one block.
    constexpr int threads_per_row = MMQ_ITER_K / (4 * QR4_0);
    constexpr int nrows           = warp_size / threads_per_row;

    const int lane_in_row = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x;
    const int block_idx   = lane_in_row / QI4_0; // block within the row
    const int int_idx     = lane_in_row % QI4_0; // int within the block

#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nrows*nwarps) {
        int row = row0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row);

        if (need_check) {
            row = min(row, i_max);
        }

        const block_q4_0 * src    = (const block_q4_0 *) x + kbx0 + row*stride + block_idx;
        const int          packed = get_int_b2(src->qs, int_idx);

#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
        int * const dst = quants + row*MMQ_MMA_TILE_X_K_Q8_0 + block_idx*(2*QI4_0) + int_idx;
        dst[0]     = __vsubss4((packed >> 0) & 0x0F0F0F0F, 0x08080808);
        dst[QI4_0] = __vsubss4((packed >> 4) & 0x0F0F0F0F, 0x08080808);
#else
        quants[row*(MMQ_TILE_NE_K + 1) + lane_in_row] = packed;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    }

    // Pass 2: block scales. Each thread copies the d field of one block.
    constexpr int blocks_per_tile_x_row = MMQ_TILE_NE_K / QI4_0;
    constexpr int rows_per_warp         = warp_size / blocks_per_tile_x_row;
    const int     scale_idx             = threadIdx.x % blocks_per_tile_x_row;

#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps * rows_per_warp) {
        int row = row0 + threadIdx.y * rows_per_warp + threadIdx.x / blocks_per_tile_x_row;

        if (need_check) {
            row = min(row, i_max);
        }

        const block_q4_0 * src = (const block_q4_0 *) x + kbx0 + row*stride + scale_idx;

#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
        scales[row*MMQ_MMA_TILE_X_K_Q8_0 + scale_idx] = src->d;
#else
        scales[row*(MMQ_TILE_NE_K/QI4_0) + row/QI4_0 + scale_idx] = src->d;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    }
}
458
+
459
// Accumulates partial dot products between a Q4_0 x tile (dp4a layout) and a y tile into sum.
//
//   x   - x tile: quant ints followed by float scales at offset txs.qs
//   y   - y tile: half2 scale/sum pairs at the start of each row, quant ints starting 4 ints in
//   sum - per-thread accumulators, indexed by (j0/nwarps, i0/warp_size)
//   k00 - starting k offset inside the x tile
template <int mmq_x, int mmq_y>
static __device__ __forceinline__ void vec_dot_q4_0_q8_1_dp4a(
    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
    constexpr int nwarps    = mmq_get_nwarps_device();
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();

    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_0, mmq_y);
    const int   * x_quants = (const int   *) x;
    const float * x_scales = (const float *) x_quants + txs.qs;
    const int   * y_quants = (const int   *) y + 4;
    const half2 * y_scales = (const half2 *) y;

    constexpr int max_cpy  = ggml_cuda_get_max_cpy_bytes();
    constexpr int mcpy_int = max_cpy / sizeof(int);
    static_assert(VDR_Q4_0_Q8_1_MMQ == 4, "bad VDR_Q4_0_Q8_1_MMQ");

// #pragma unroll
    for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QR4_0*VDR_Q4_0_Q8_1_MMQ) {
        const int k0   = k00 + k01;
        // Offset of the y quants that pair with the low nibbles of this k step.
        const int kyqs = QI8_1 * ((k01/2) / (QI8_1/2)) + (k01/2) % (QI8_1/2);

#pragma unroll
        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
            const int   col   = j0 + threadIdx.y;
            const int * y_src = y_quants + col*MMQ_TILE_Y_K + kyqs;

#pragma unroll
            for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
                const int row = i0 + threadIdx.x;

                // lo pairs with the low nibbles, hi (QI4_0 ints further) with the high nibbles.
                int lo[4], hi[4];

#pragma unroll
                for (int l0 = 0; l0 < 4 / mcpy_int; ++l0) {
                    ggml_cuda_memcpy_1<max_cpy>(lo + l0 * mcpy_int, y_src + l0 * mcpy_int);
                    ggml_cuda_memcpy_1<max_cpy>(hi + l0 * mcpy_int, y_src + QI4_0 + l0 * mcpy_int);
                }

                // Interleave: even slots from lo, odd slots from hi.
                int u[2*VDR_Q4_0_Q8_1_MMQ];
#pragma unroll
                for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
                    u[2*l + 0] = lo[l];
                    u[2*l + 1] = hi[l];
                }

                sum[j0/nwarps*mmq_y/warp_size + i0/warp_size] += vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>(
                    &x_quants[row*(MMQ_TILE_NE_K + 1) + k0/QR4_0], u,
                    x_scales[row*(MMQ_TILE_NE_K/QI4_0) + row/QI4_0 + k0/(QR4_0*QI4_0)],
                    y_scales[col*MMQ_TILE_Y_K + k01/QI8_1]);
            }
        }
    }
}
508
+
509
// Loads a tile of Q4_1 blocks from x into the tile buffer x_tile.
//
// The tile buffer holds a region of quant ints followed by a region of half2 dm values.
// When an MMA path is available (MFMA / Turing MMA / WMMA) every packed source int is split into
// two ints (low nibbles, high nibbles) with a row stride of MMQ_MMA_TILE_X_K_Q8_1. Otherwise the
// packed ints are stored unchanged with a row stride of MMQ_TILE_NE_K + 1.
//
//   x      - source data, reinterpreted as an array of block_q4_1
//   x_tile - destination tile buffer
//   kbx0   - block offset added to every row of x
//   i_max  - upper clamp for the row index, applied only when need_check is true
//   stride - distance between consecutive rows of x, in blocks
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
    constexpr int nwarps    = mmq_get_nwarps_device();
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();

    int * const quants = (int *) x_tile;
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    half2 * const scale_min = (half2 *) (quants + 2*MMQ_TILE_NE_K);
#else
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_1, mmq_y);
    half2 * const scale_min = (half2 *) (quants + txs.qs);
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)

    // Pass 1: quants. Each thread reads one packed int of one block.
    constexpr int threads_per_row = MMQ_ITER_K / (4 * QR4_1);
    constexpr int nrows           = warp_size / threads_per_row;

    const int lane_in_row = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x;
    const int block_idx   = lane_in_row / QI4_1; // block within the row
    const int int_idx     = lane_in_row % QI4_1; // int within the block

#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nrows*nwarps) {
        int row = row0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row);

        if (need_check) {
            row = min(row, i_max);
        }

        const block_q4_1 * src    = (const block_q4_1 *) x + kbx0 + row*stride + block_idx;
        const int          packed = get_int_b4(src->qs, int_idx);

#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
        int * const dst = quants + row*MMQ_MMA_TILE_X_K_Q8_1 + block_idx*(2*QI4_1) + int_idx;
        dst[0]     = (packed >> 0) & 0x0F0F0F0F;
        dst[QI4_1] = (packed >> 4) & 0x0F0F0F0F;
#else
        quants[row*(MMQ_TILE_NE_K + 1) + lane_in_row] = packed;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    }

    // Pass 2: per-block dm values. Each thread copies the dm field of one block.
    constexpr int blocks_per_tile_x_row = MMQ_TILE_NE_K / QI4_1;
    constexpr int rows_per_warp         = warp_size / blocks_per_tile_x_row;
    const int     dm_idx                = threadIdx.x % blocks_per_tile_x_row;

#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps * rows_per_warp) {
        int row = row0 + threadIdx.y * rows_per_warp + threadIdx.x / blocks_per_tile_x_row;

        if (need_check) {
            row = min(row, i_max);
        }

        const block_q4_1 * src = (const block_q4_1 *) x + kbx0 + row*stride + dm_idx;

#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
        scale_min[row*MMQ_MMA_TILE_X_K_Q8_1 + dm_idx] = src->dm;
#else
        scale_min[row*(MMQ_TILE_NE_K/QI4_1) + row/QI4_1 + dm_idx] = src->dm;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    }
}
569
+
570
// Accumulates partial dot products between a Q4_1 x tile (dp4a layout) and a y tile into sum.
//
//   x   - x tile: quant ints followed by half2 dm values at offset txs.qs
//   y   - y tile: half2 scale/sum pairs at the start of each row, quant ints starting 4 ints in
//   sum - per-thread accumulators, indexed by (j0/nwarps, i0/warp_size)
//   k00 - starting k offset inside the x tile
template <int mmq_x, int mmq_y>
static __device__ __forceinline__ void vec_dot_q4_1_q8_1_dp4a(
    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
    constexpr int nwarps    = mmq_get_nwarps_device();
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();

    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_1, mmq_y);
    const int   * x_qs = (const int   *) x;
    const half2 * x_dm = (const half2 *) x_qs + txs.qs;
    const int   * y_qs = (const int   *) y + 4;
    const half2 * y_ds = (const half2 *) y;

// #pragma unroll
    for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QR4_1*VDR_Q4_1_Q8_1_MMQ) {
        const int k0 = k00 + k01;

#pragma unroll
        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
            const int j = j0 + threadIdx.y;

#pragma unroll
            for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
                const int i = i0 + threadIdx.x;
                // Offset of the y quants that pair with the low nibbles of this k step.
                const int kyqs = QI8_1 * ((k01/2) / (QI8_1/2)) + (k01/2) % (QI8_1/2);

                int u[2*VDR_Q4_1_Q8_1_MMQ];

                constexpr int max_cpy  = ggml_cuda_get_max_cpy_bytes();
                constexpr int mcpy_int = max_cpy / sizeof(int);
                // The fixed-size tmp arrays and the u[0..7] shuffle below require the Q4_1
                // constant to be 4, so that is the constant that has to be checked here.
                static_assert(VDR_Q4_1_Q8_1_MMQ == 4, "bad VDR_Q4_1_Q8_1_MMQ");

                // tmp0 pairs with the low nibbles, tmp1 (QI4_1 ints further) with the high nibbles.
                int tmp0[4], tmp1[4];

#pragma unroll
                for (int l0 = 0; l0 < 4 / mcpy_int; ++l0) {
                    ggml_cuda_memcpy_1<max_cpy>(tmp0 + l0 * mcpy_int, &y_qs[j*MMQ_TILE_Y_K + kyqs         + l0 * mcpy_int]);
                    ggml_cuda_memcpy_1<max_cpy>(tmp1 + l0 * mcpy_int, &y_qs[j*MMQ_TILE_Y_K + kyqs + QI4_1 + l0 * mcpy_int]);
                }

                // Interleave: even slots from tmp0, odd slots from tmp1.
                u[0]=tmp0[0]; u[2]=tmp0[1]; u[4]=tmp0[2]; u[6]=tmp0[3];
                u[1]=tmp1[0]; u[3]=tmp1[1]; u[5]=tmp1[2]; u[7]=tmp1[3];

                sum[j0/nwarps*mmq_y/warp_size + i0/warp_size] += vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
                    (&x_qs[i*(MMQ_TILE_NE_K + 1) + k0/QR4_1], u,
                     x_dm[i*(MMQ_TILE_NE_K/QI4_1) + i/QI4_1 + k0/(QR4_1*QI4_1)], y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]);
            }
        }
    }
}
619
+
620
// Loads a tile of Q5_0 blocks from x into the tile buffer x_tile.
//
// Each packed source int (4 bits per value) is combined with the matching bits of qh to form
// 5-bit values, one per byte, from which 16 is subtracted. The low-nibble and high-nibble halves
// are written QI5_0 ints apart. Float block scales follow the quant region.
//
//   x      - source data, reinterpreted as an array of block_q5_0
//   x_tile - destination tile buffer
//   kbx0   - block offset added to every row of x
//   i_max  - upper clamp for the row index, applied only when need_check is true
//   stride - distance between consecutive rows of x, in blocks
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
    constexpr int nwarps    = mmq_get_nwarps_device();
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();

    int * const quants = (int *) x_tile;
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    float * const scales = (float *) (quants + MMQ_TILE_NE_K*2);
    constexpr int qs_row_stride = MMQ_MMA_TILE_X_K_Q8_0;
#else
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_0, mmq_y);
    float * const scales = (float *) (quants + txs.qs);
    constexpr int qs_row_stride = 2*MMQ_TILE_NE_K + 1;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)

    // Pass 1: quants. Each thread expands one packed int of one block.
    constexpr int threads_per_row = MMQ_ITER_K / (4 * QR5_0);
    constexpr int nrows           = warp_size / threads_per_row;

    const int lane_in_row = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x;
    const int block_idx   = lane_in_row / QI5_0; // block within the row
    const int int_idx     = lane_in_row % QI5_0; // int within the block

#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nrows*nwarps) {
        int row = row0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row);

        if (need_check) {
            row = min(row, i_max);
        }

        const block_q5_0 * src = (const block_q5_0 *) x + kbx0 + row*stride + block_idx;

        const int ql = get_int_b2(src->qs, int_idx);
        const int qh = get_int_b2(src->qh, 0) >> (4 * int_idx);

        // Low nibbles plus qh bits 0..3, each moved to bit 4 of its byte.
        int lo = (ql >> 0) & 0x0F0F0F0F;
        lo |= (qh <<  4) & 0x00000010; // 0 -> 4
        lo |= (qh << 11) & 0x00001000; // 1 -> 12
        lo |= (qh << 18) & 0x00100000; // 2 -> 20
        lo |= (qh << 25) & 0x10000000; // 3 -> 28
        lo = __vsubss4(lo, 0x10101010); // subtract 16

        // High nibbles plus qh bits 16..19, each moved to bit 4 of its byte.
        int hi = (ql >> 4) & 0x0F0F0F0F;
        hi |= (qh >> 12) & 0x00000010; // 16 -> 4
        hi |= (qh >>  5) & 0x00001000; // 17 -> 12
        hi |= (qh <<  2) & 0x00100000; // 18 -> 20
        hi |= (qh <<  9) & 0x10000000; // 19 -> 28
        hi = __vsubss4(hi, 0x10101010); // subtract 16

        int * const dst = quants + row*qs_row_stride + block_idx*(2*QI5_0) + int_idx;
        dst[0]     = lo;
        dst[QI5_0] = hi;
    }

    // Pass 2: block scales. Each thread copies the d field of one block.
    constexpr int blocks_per_tile_x_row = MMQ_TILE_NE_K / QI5_0;
    constexpr int rows_per_warp         = warp_size / blocks_per_tile_x_row;
    const int     scale_idx             = threadIdx.x % blocks_per_tile_x_row;

#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps * rows_per_warp) {
        int row = row0 + threadIdx.y * rows_per_warp + threadIdx.x / blocks_per_tile_x_row;

        if (need_check) {
            row = min(row, i_max);
        }

        const block_q5_0 * src = (const block_q5_0 *) x + kbx0 + row*stride + scale_idx;

#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
        scales[row*MMQ_MMA_TILE_X_K_Q8_0 + scale_idx] = src->d;
#else
        scales[row*(MMQ_TILE_NE_K/QI5_0) + row/QI5_0 + scale_idx] = src->d;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    }
}
697
+
698
// Loads a tile of Q5_1 blocks from x into the tile buffer x_tile.
//
// Each packed source int (4 bits per value) is combined with the matching bits of qh to form
// 5-bit values, one per byte (no offset is subtracted). The low-nibble and high-nibble halves are
// written QI5_1 ints apart. half2 dm values follow the quant region.
//
//   x      - source data, reinterpreted as an array of block_q5_1
//   x_tile - destination tile buffer
//   kbx0   - block offset added to every row of x
//   i_max  - upper clamp for the row index, applied only when need_check is true
//   stride - distance between consecutive rows of x, in blocks
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
    constexpr int nwarps    = mmq_get_nwarps_device();
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();

    int * const quants = (int *) x_tile;
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    half2 * const scale_min = (half2 *) (quants + 2*MMQ_TILE_NE_K);
    constexpr int qs_row_stride = MMQ_MMA_TILE_X_K_Q8_1;
#else
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_1, mmq_y);
    half2 * const scale_min = (half2 *) (quants + txs.qs);
    constexpr int qs_row_stride = 2*MMQ_TILE_NE_K + 1;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)

    // Pass 1: quants. Each thread expands one packed int of one block.
    constexpr int threads_per_row = MMQ_ITER_K / (4 * QR5_1);
    constexpr int nrows           = warp_size / threads_per_row;

    const int lane_in_row = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x;
    const int block_idx   = lane_in_row / QI5_1; // block within the row
    const int int_idx     = lane_in_row % QI5_1; // int within the block

#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nrows*nwarps) {
        int row = row0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row);

        if (need_check) {
            row = min(row, i_max);
        }

        const block_q5_1 * src = (const block_q5_1 *) x + kbx0 + row*stride + block_idx;

        const int ql = get_int_b4(src->qs, int_idx);
        const int qh = get_int_b4(src->qh, 0) >> (4 * int_idx);

        // Low nibbles plus qh bits 0..3, each moved to bit 4 of its byte.
        int lo = (ql >> 0) & 0x0F0F0F0F;
        lo |= (qh <<  4) & 0x00000010; // 0 -> 4
        lo |= (qh << 11) & 0x00001000; // 1 -> 12
        lo |= (qh << 18) & 0x00100000; // 2 -> 20
        lo |= (qh << 25) & 0x10000000; // 3 -> 28

        // High nibbles plus qh bits 16..19, each moved to bit 4 of its byte.
        int hi = (ql >> 4) & 0x0F0F0F0F;
        hi |= (qh >> 12) & 0x00000010; // 16 -> 4
        hi |= (qh >>  5) & 0x00001000; // 17 -> 12
        hi |= (qh <<  2) & 0x00100000; // 18 -> 20
        hi |= (qh <<  9) & 0x10000000; // 19 -> 28

        int * const dst = quants + row*qs_row_stride + block_idx*(2*QI5_1) + int_idx;
        dst[0]     = lo;
        dst[QI5_1] = hi;
    }

    // Pass 2: per-block dm values. Each thread copies the dm field of one block.
    constexpr int blocks_per_tile_x_row = MMQ_TILE_NE_K / QI5_1;
    constexpr int rows_per_warp         = warp_size / blocks_per_tile_x_row;
    const int     dm_idx                = threadIdx.x % blocks_per_tile_x_row;

#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps * rows_per_warp) {
        int row = row0 + threadIdx.y * rows_per_warp + threadIdx.x / blocks_per_tile_x_row;

        if (need_check) {
            row = min(row, i_max);
        }

        const block_q5_1 * src = (const block_q5_1 *) x + kbx0 + row*stride + dm_idx;

#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
        scale_min[row*MMQ_MMA_TILE_X_K_Q8_1 + dm_idx] = src->dm;
#else
        scale_min[row*(MMQ_TILE_NE_K/QI5_1) + row/QI5_1 + dm_idx] = src->dm;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    }
}
773
+
774
// Loads a tile of Q8_0 blocks from x into the tile buffer x_tile.
//
// Every thread copies two ints per row iteration: one from its own block and one from the block
// MMQ_TILE_NE_K/QI8_0 blocks further on, stored MMQ_TILE_NE_K ints apart. Float block scales
// follow the quant region.
//
//   x      - source data, reinterpreted as an array of block_q8_0
//   x_tile - destination tile buffer
//   kbx0   - block offset added to every row of x
//   i_max  - upper clamp for the row index, applied only when need_check is true
//   stride - distance between consecutive rows of x, in blocks
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
    constexpr int nwarps    = mmq_get_nwarps_device();
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();

    int * const quants = (int *) x_tile;
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    float * const scales = (float *) (x_tile + 2*MMQ_TILE_NE_K);
    constexpr int qs_row_stride = MMQ_MMA_TILE_X_K_Q8_0;
#else
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q8_0, mmq_y);
    float * const scales = (float *) (quants + txs.qs);
    constexpr int qs_row_stride = 2*MMQ_TILE_NE_K + 1;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)

    // MMQ_ITER_K / (4 * QR8_0) == 64 threads per row would be required, but an NVIDIA warp only
    // has 32 threads, so 32 threads are used and each one copies two ints.
    constexpr int threads_per_row = 32;
    constexpr int nrows           = warp_size / threads_per_row;

    const int lane_in_row = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x;
    const int block_idx   = lane_in_row / QI8_0; // block within the row
    const int int_idx     = lane_in_row % QI8_0; // int within the block

#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nrows*nwarps) {
        int row = row0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row);

        if (need_check) {
            row = min(row, i_max);
        }

        const block_q8_0 * src = (const block_q8_0 *) x + kbx0 + row*stride + block_idx;

        int * const dst = quants + row*qs_row_stride + lane_in_row;
        dst[0]             = get_int_b2(src[0].qs,                   int_idx);
        dst[MMQ_TILE_NE_K] = get_int_b2(src[MMQ_TILE_NE_K/QI8_0].qs, int_idx);
    }

    // Block scales: each thread copies the d field of one block.
    constexpr int blocks_per_tile_x_row = 2*MMQ_TILE_NE_K / QI8_0;
    constexpr int rows_per_warp         = warp_size / blocks_per_tile_x_row;
    const int     scale_idx             = threadIdx.x % blocks_per_tile_x_row;

#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps * rows_per_warp) {
        int row = row0 + threadIdx.y * rows_per_warp + threadIdx.x / blocks_per_tile_x_row;

        if (need_check) {
            row = min(row, i_max);
        }

        const block_q8_0 * src = (const block_q8_0 *) x + kbx0 + row*stride + scale_idx;

#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
        scales[row*MMQ_MMA_TILE_X_K_Q8_0 + scale_idx] = src->d;
#else
        scales[row*(2*MMQ_TILE_NE_K/QI8_0) + row/(QI8_0/2) + scale_idx] = src->d;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    }
}
835
+
836
// Loads a tile of MXFP4 blocks from x into the tile buffer x_tile.
//
// Each packed source int is translated through the kvalues_mxfp4 lookup table into two ints,
// which are written QI_MXFP4 ints apart. The per-block E8M0 exponent is converted to float and
// multiplied by 0.5f before being stored in the scale region that follows the quants.
//
//   x      - source data, reinterpreted as an array of block_mxfp4
//   x_tile - destination tile buffer
//   kbx0   - block offset added to every row of x
//   i_max  - upper clamp for the row index, applied only when need_check is true
//   stride - distance between consecutive rows of x, in blocks
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_mxfp4(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
    constexpr int nwarps    = mmq_get_nwarps_device();
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();

    int * const quants = (int *) x_tile;
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    float * const scales = (float *) (quants + MMQ_TILE_NE_K*2);
    constexpr int qs_row_stride = MMQ_MMA_TILE_X_K_Q8_1;
#else
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_MXFP4, mmq_y);
    float * const scales = (float *) (quants + txs.qs);
    constexpr int qs_row_stride = 2*MMQ_TILE_NE_K + 1;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)

    // Pass 1: quants. Each thread decodes one packed int of one block.
    constexpr int threads_per_row = MMQ_ITER_K / (4 * QR_MXFP4);
    constexpr int nrows           = warp_size / threads_per_row;

    const int lane_in_row = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x;
    const int block_idx   = lane_in_row / QI_MXFP4; // block within the row
    const int int_idx     = lane_in_row % QI_MXFP4; // int within the block

#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nrows*nwarps) {
        int row = row0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row);

        if (need_check) {
            row = min(row, i_max);
        }

        const block_mxfp4 * src = (const block_mxfp4 *) x + kbx0 + row*stride + block_idx;

        const int  packed  = get_int_b1(src->qs, int_idx);
        const int2 decoded = get_int_from_table_16(packed, kvalues_mxfp4);
        const int  col     = block_idx * (2 * QI_MXFP4) + int_idx;

        int * const dst = quants + row*qs_row_stride + col;
        dst[0]        = decoded.x;
        dst[QI_MXFP4] = decoded.y;
    }

    // Pass 2: block scales. Each thread converts the exponent of one block.
    constexpr int blocks_per_tile_x_row = MMQ_TILE_NE_K / QI_MXFP4;
    constexpr int rows_per_warp         = warp_size / blocks_per_tile_x_row;
    const int     scale_idx             = threadIdx.x % blocks_per_tile_x_row;

#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps * rows_per_warp) {
        int row = row0 + threadIdx.y * rows_per_warp + threadIdx.x / blocks_per_tile_x_row;

        if (need_check) {
            row = min(row, i_max);
        }

        const block_mxfp4 * src = (const block_mxfp4 *) x + kbx0 + row*stride + scale_idx;

#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
        scales[row*MMQ_MMA_TILE_X_K_Q8_1 + scale_idx] = ggml_cuda_e8m0_to_fp32(src->e)*0.5f;
#else
        scales[row*(MMQ_TILE_NE_K/QI_MXFP4) + row/QI_MXFP4 + scale_idx] = ggml_cuda_e8m0_to_fp32(src->e)*0.5f;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    }
}
900
+
901
// Loads a tile of MXFP4 blocks into x_tile without decoding the FP4 values.
//
// Each thread handles one block: its 16 bytes of qs are copied verbatim into the quant region,
// and threads with an even block index additionally pack their own E8M0 exponent and that of the
// following block into one uint32 of the scale region (own exponent in bits 0..7, next in 8..15).
// Both regions use a row stride of MMQ_MMA_TILE_X_K_FP4.
//
//   x      - source data, reinterpreted as an array of block_mxfp4
//   x_tile - destination tile buffer
//   kbx0   - block offset added to every row of x
//   i_max  - upper clamp for the row index, applied only when need_check is true
//   stride - distance between consecutive rows of x, in blocks
template <int mmq_y, bool need_check>
static __device__ __forceinline__ void load_tiles_mxfp4_fp4(const char * __restrict__ x,
                                                            int * __restrict__ x_tile,
                                                            const int kbx0,
                                                            const int i_max,
                                                            const int stride) {
    constexpr int nwarps    = mmq_get_nwarps_device();
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
    constexpr int iter_k    = get_iter_k(GGML_TYPE_MXFP4);

    constexpr int threads_per_row = iter_k / QK_MXFP4;  // each thread processes 1 block
    constexpr int rows_per_warp   = warp_size / threads_per_row;

    int *      quants      = (int *) x_tile;
    uint32_t * scale_pairs = (uint32_t *) (quants + 2 * MMQ_TILE_NE_K);

    const int lane        = threadIdx.x;
    const int block_idx   = lane % threads_per_row;
    const int row_in_warp = lane / threads_per_row;

#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += rows_per_warp * nwarps) {
        int row = row0 + threadIdx.y * rows_per_warp + row_in_warp;

        if constexpr (need_check) {
            row = min(row, i_max);
        }

        const block_mxfp4 * src = (const block_mxfp4 *) x + kbx0 + row * stride + block_idx;

        // quantize_mxfp4_mmq permutes nibbles to match the quantized format
        const int col = block_idx * 4;
        memcpy(quants + row * MMQ_MMA_TILE_X_K_FP4 + col, src->qs, 16);

        // Load E8M0 scales: pack 2 consecutive scales into one uint32
        if (block_idx % 2 == 0) {
            uint32_t packed = src->e;
            packed |= ((src + 1)->e << 8);
            scale_pairs[row * MMQ_MMA_TILE_X_K_FP4 + block_idx / 2] = packed;
        }
    }
}
944
+
945
+ #ifdef BLACKWELL_MMA_AVAILABLE
946
// Loads a tile of NVFP4 blocks into x_tile without decoding the FP4 values.
//
// Each thread handles one block: 2 * (QK_NVFP4 / QK_NVFP4_SUB) uint32 words of qs are copied
// verbatim, and the first 4 bytes of the block's d array are stored as one uint32 scale word.
// Rows are MMQ_MMA_TILE_X_K_FP4 words apart; a block's quants start at word 8 * kbx of its row
// and its scale word sits at word 64 + kbx.
//
//   x      - source data, reinterpreted as an array of block_nvfp4
//   x_tile - destination tile buffer
//   kbx0   - block offset added to every row of x
//   i_max  - upper clamp for the row index, applied only when need_check is true
//   stride - distance between consecutive rows of x, in blocks
template <int mmq_y, bool need_check>
static __device__ __forceinline__ void load_tiles_nvfp4_nvfp4(const char * __restrict__ x,
                                                              int * __restrict__ x_tile,
                                                              const int kbx0,
                                                              const int i_max,
                                                              const int stride) {
    constexpr int nwarps          = mmq_get_nwarps_device();
    constexpr int warp_size       = ggml_cuda_get_physical_warp_size();
    constexpr int iter_k          = get_iter_k(GGML_TYPE_NVFP4);
    constexpr int threads_per_row = iter_k / QK_NVFP4;  // each thread processes 1 block
    constexpr int rows_per_warp   = warp_size / threads_per_row;
    constexpr int words_per_block = 2 * (QK_NVFP4 / QK_NVFP4_SUB);

    const int lane        = threadIdx.x;
    const int block_idx   = lane % threads_per_row;
    const int row_in_warp = lane / threads_per_row;

    uint32_t * const tile_words = (uint32_t *) x_tile;
    // NOTE(review): 64 presumably equals 2 * MMQ_TILE_NE_K, the offset at which
    // vec_dot_fp4_fp4_mma reads its scales -- confirm against the MMQ_TILE_NE_K definition.
    uint32_t * const scale_words = tile_words + 64 + block_idx;

    const block_nvfp4 * const first_row_block = (const block_nvfp4 *) x + kbx0 + block_idx;

#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += rows_per_warp * nwarps) {
        int row = row0 + threadIdx.y * rows_per_warp + row_in_warp;

        if constexpr (need_check) {
            row = min(row, i_max);
        }

        const block_nvfp4 * src      = first_row_block + row * stride;
        const int           row_base = row * MMQ_MMA_TILE_X_K_FP4;

        const uint32_t * src_words = reinterpret_cast<const uint32_t *>(src->qs);
        uint32_t *       dst_words = tile_words + row_base + 8 * block_idx;

#pragma unroll
        for (int w = 0; w < words_per_block; ++w) {
            dst_words[w] = src_words[w];
        }

        scale_words[row_base] = get_int_b4(src->d, 0);
    }
}
990
+
991
// Shared MMA kernel for MXFP4 and NVFP4 on Blackwell.
// Both quantizations encode values as e2m1 (FP4) and produce one uint32 scale per
// m16n8k64 MMA call; only the PTX kind (scale_vec::2X ue8m0 vs scale_vec::4X ue4m3)
// and the per-type stride constant differ.
//
//   x   - x tile: FP4 quant words followed by uint32 scale words at offset 2 * MMQ_TILE_NE_K
//   y   - y tile: uint32 scale words at the start of each row, quant words starting 4 ints in
//   sum - per-thread accumulators, tile_C::ne floats per output tile
//   k00 - starting k offset inside the x tile
template <int mmq_x, int mmq_y, ggml_type type>
static __device__ __forceinline__ void vec_dot_fp4_fp4_mma(const int * __restrict__ x,
                                                           const int * __restrict__ y,
                                                           float * __restrict__ sum,
                                                           const int k00) {
    static_assert(type == GGML_TYPE_MXFP4 || type == GGML_TYPE_NVFP4,
                  "vec_dot_fp4_fp4_mma: type must be MXFP4 or NVFP4");

    typedef tile<16, 8, int>   tile_A;
    typedef tile<8, 8, int>    tile_B;
    typedef tile<16, 8, float> tile_C;

    constexpr int stride        = MMQ_MMA_TILE_X_K_FP4;
    constexpr int granularity   = mmq_get_granularity_device(mmq_x);
    constexpr int rows_per_warp = 2 * granularity;
    constexpr int ntx           = rows_per_warp / tile_C::I;
    constexpr int nfrags        = MMQ_TILE_NE_K / tile_A::J;

    // Shift y to the column group handled by this warp.
    y += (threadIdx.y % ntx) * (tile_C::J * MMQ_TILE_Y_K);

    const int *      x_quants = (const int *) x;
    const uint32_t * x_scales = (const uint32_t *) (x_quants + 2 * MMQ_TILE_NE_K);
    const int *      y_quants = (const int *) y + 4;
    const uint32_t * y_scales = (const uint32_t *) y;

    // 2 threads per quad supply the packed scale register to the block_scale MMA,
    // see https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-block-scaling
    const int scale_row_A = threadIdx.x / 4 + (threadIdx.x % 2) * 8;
    const int scale_row_B = threadIdx.x / 4;
    const int first_row   = (threadIdx.y / ntx) * rows_per_warp;

    tile_A   A[ntx][nfrags];
    uint32_t scaleA[ntx][nfrags];

    // Load all A fragments and their scales once; they are reused for every column group.
#pragma unroll
    for (int n = 0; n < ntx; ++n) {
        const int tile_row = first_row + n * tile_A::I;
#pragma unroll
        for (int frag = 0; frag < nfrags; ++frag) {
            const int k0 = k00 + frag * tile_A::J;
            load_ldmatrix(A[n][frag], x_quants + tile_row * stride + k0, stride);
            scaleA[n][frag] = x_scales[(tile_row + scale_row_A) * stride + k0 / tile_A::J];
        }
    }

#pragma unroll
    for (int j0 = 0; j0 < mmq_x; j0 += ntx * tile_C::J) {
        tile_B   B[nfrags];
        uint32_t scaleB[nfrags];

#pragma unroll
        for (int frag = 0; frag < nfrags; ++frag) {
            const int k0 = frag * tile_B::J;
            load_generic(B[frag], y_quants + j0 * MMQ_TILE_Y_K + k0, MMQ_TILE_Y_K);
            scaleB[frag] = y_scales[(j0 + scale_row_B) * MMQ_TILE_Y_K + frag];
        }

#pragma unroll
        for (int n = 0; n < ntx; ++n) {
            float * const acc = sum + (j0 / tile_C::J + n) * tile_C::ne;
#pragma unroll
            for (int frag = 0; frag < nfrags; ++frag) {
                tile_C C = {};
                mma_block_scaled_fp4<type>(C, A[n][frag], B[frag], scaleA[n][frag], scaleB[frag]);
#pragma unroll
                for (int l = 0; l < tile_C::ne; ++l) {
                    acc[l] += C.x[l];
                }
            }
        }
    }
}
1065
+ #endif // BLACKWELL_MMA_AVAILABLE
1066
+
1067
+
1068
// Loads a tile of NVFP4 blocks from x into the tile buffer x_tile, decoding the FP4 values.
//
// Each thread handles one block. For every sub-block, two packed uint32 words are translated
// through the kvalues_mxfp4 lookup table into four ints (stored in the order q0.x, q1.x, q0.y,
// q1.y) and the sub-block's UE4M3 scale is converted to float and stored in the scale region.
//
//   x      - source data, reinterpreted as an array of block_nvfp4
//   x_tile - destination tile buffer
//   kb0    - block offset added to every row of x
//   i_max  - upper clamp for the row index, applied only when need_check is true
//   stride - distance between consecutive rows of x, in blocks
template <int mmq_y, bool need_check>
static __device__ __forceinline__ void load_tiles_nvfp4(const char * __restrict__ x,
                                                        int * __restrict__ x_tile,
                                                        const int kb0,
                                                        const int i_max,
                                                        const int stride) {
    constexpr int nwarps    = mmq_get_nwarps_device();
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();

    int * const quants = (int *) x_tile;
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    float * const scales = (float *) (quants + MMQ_TILE_NE_K*2);
#else
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_NVFP4, mmq_y);
    float * const scales = (float *) (quants + txs.qs);
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)

    constexpr int threads_per_row = MMQ_ITER_K / QK_NVFP4;
    constexpr int rows_per_warp   = warp_size / threads_per_row;

    const int block_idx   = threadIdx.x % threads_per_row;
    const int row_in_warp = threadIdx.x / threads_per_row;

    // Per-block column offsets inside a tile row: 16 quant ints and 4 scales per block.
    const int quant_col = 16 * block_idx;
    const int scale_col = 4 * block_idx;

#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += rows_per_warp * nwarps) {
        int row = row0 + threadIdx.y * rows_per_warp + row_in_warp;

        if constexpr (need_check) {
            row = min(row, i_max);
        }

        const block_nvfp4 * src = (const block_nvfp4 *) x + kb0 + row * stride + block_idx;
        const uint32_t * __restrict__ src_words = reinterpret_cast<const uint32_t *>(src->qs);

#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
        int   * const qs_out = quants + row * MMQ_MMA_TILE_X_K_NVFP4 + quant_col;
        float * const df_out = scales + row * MMQ_MMA_TILE_X_K_NVFP4 + scale_col;
#else
        int   * const qs_out = quants + row * (2 * MMQ_TILE_NE_K + 1) + quant_col;
        float * const df_out = scales + row * (2 * MMQ_TILE_NE_K * 2 / QI_NVFP4) + row / (QK_NVFP4_SUB / QI_NVFP4) + scale_col;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)

#pragma unroll
        for (int sub = 0; sub < QK_NVFP4 / QK_NVFP4_SUB; ++sub) {
            const int2 q0 = get_int_from_table_16(src_words[2 * sub + 0], kvalues_mxfp4);
            const int2 q1 = get_int_from_table_16(src_words[2 * sub + 1], kvalues_mxfp4);

            qs_out[4 * sub + 0] = q0.x;
            qs_out[4 * sub + 1] = q1.x;
            qs_out[4 * sub + 2] = q0.y;
            qs_out[4 * sub + 3] = q1.y;
            df_out[sub]         = ggml_cuda_ue4m3_to_fp32(src->d[sub]);
        }
    }
}
1125
+ // For every (j, i) pair this thread indexes via threadIdx.y / threadIdx.x, adds the result of vec_dot_q8_0_q8_1_impl on the x and y tiles into sum.
1126
+ template <int mmq_x, int mmq_y>
1127
+ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_dp4a(
1128
+ const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
1129
+ constexpr int nwarps = mmq_get_nwarps_device();
1130
+ constexpr int warp_size = ggml_cuda_get_physical_warp_size();
1131
+
1132
+ constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q8_0, mmq_y);
1133
+ const int * x_qs = (const int *) x;
1134
+ const float * x_df = (const float *) x_qs + txs.qs; // float view of x, starting txs.qs elements past x_qs (presumably the per-block scales)
1135
+ const int * y_qs = (const int *) y + 4; // int data of y is read starting 4 ints in
1136
+ const float * y_df = (const float *) y; // float view of y from its start
1137
+
1138
+ // #pragma unroll
1139
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += VDR_Q8_0_Q8_1_MMQ) {
1140
+ const int k0 = k00 + k01; // x is indexed with the k00 offset applied, y_qs with k0 % MMQ_TILE_NE_K
1141
+
1142
+ #pragma unroll
1143
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
1144
+ const int j = j0 + threadIdx.y;
1145
+
1146
+ #pragma unroll
1147
+ for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
1148
+ const int i = i0 + threadIdx.x;
1149
+ // One accumulator slot in sum per (j0, i0) step of the two enclosing loops.
1150
+ sum[j0/nwarps*mmq_y/warp_size + i0/warp_size] += vec_dot_q8_0_q8_1_impl<float, VDR_Q8_0_Q8_1_MMQ>
1151
+ (&x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k0 % MMQ_TILE_NE_K],
1152
+ x_df[i*(2*MMQ_TILE_NE_K/QI8_0) + i/(QI8_0/2) + k0/QI8_0], y_df[j*MMQ_TILE_Y_K + (k0/QI8_1) % (MMQ_TILE_NE_K/QI8_1)]);
1153
+ }
1154
+ }
1155
+ }
1156
+ }
1157
+ // Tile variant: multiplies int tiles loaded from x and y with mma() and adds C.x[l]*dA*dB into sum. ds_layout selects how the y scale is read (float for D4, otherwise the low half of a half2).
1158
+ template <int mmq_x, int mmq_y, mmq_q8_1_ds_layout ds_layout>
1159
+ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
1160
+ const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
1161
+ #if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
1162
+ constexpr data_layout input_layout = get_input_data_layout();
1163
+ typedef tile<16, 8, int, input_layout> tile_A;
1164
+ typedef tile<16, 8, int, input_layout> tile_B;
1165
+ typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
1166
+
1167
+ constexpr int granularity = mmq_get_granularity_device(mmq_x);
1168
+ constexpr int rows_per_warp = granularity;
1169
+ constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.
1170
+
1171
+ y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K); // advance y by (threadIdx.y % ntx) groups of tile_C::J rows of MMQ_TILE_Y_K ints
1172
+
1173
+ const int * x_qs = (const int *) x;
1174
+ const float * x_df = (const float *) x_qs + 2*MMQ_TILE_NE_K; // float view of x, 2*MMQ_TILE_NE_K elements past x_qs
1175
+ const int * y_qs = (const int *) y + 4;
1176
+ const float * y_df = (const float *) y;
1177
+ const half2 * y_ds = (const half2 *) y; // same memory as y_df, viewed as half2 for the non-D4 layout
1178
+
1179
+ const int i0 = (threadIdx.y / ntx) * rows_per_warp; // first x row used by this threadIdx.y group
1180
+
1181
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_0) {
1182
+ const int k0 = k00 + k01;
1183
+
1184
+ tile_A A[ntx];
1185
+ #pragma unroll
1186
+ for (int n = 0; n < ntx; ++n) {
1187
+ load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_0 + k0, MMQ_MMA_TILE_X_K_Q8_0);
1188
+ }
1189
+
1190
+ #pragma unroll
1191
+ for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
1192
+ tile_B B;
1193
+ load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
1194
+
1195
+ float dB;
1196
+ const int j = j0 + tile_C::get_j(0); // a single y scale per thread is read in this path
1197
+ if (ds_layout == MMQ_Q8_1_DS_LAYOUT_D4) {
1198
+ dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
1199
+ } else {
1200
+ dB = __low2float(y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]);
1201
+ }
1202
+
1203
+ #pragma unroll
1204
+ for (int n = 0; n < ntx; ++n) {
1205
+ tile_C C;
1206
+ mma(C, A[n], B);
1207
+
1208
+ #pragma unroll
1209
+ for (int l = 0; l < tile_C::ne; ++l) {
1210
+ const int i = i0 + n*tile_A::I + tile_C::get_i(l);
1211
+ const float dA = x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + k0/QI8_0];
1212
+ sum[(j0/tile_C::J + n)*tile_C::ne + l] += C.x[l]*dA*dB;
1213
+ }
1214
+ }
1215
+ }
1216
+ }
1217
+ #else
1218
+ typedef tile<16, 8, int> tile_A;
1219
+ typedef tile< 8, 8, int> tile_B;
1220
+ typedef tile<16, 8, int> tile_C;
1221
+
1222
+ constexpr int granularity = mmq_get_granularity_device(mmq_x);
1223
+ constexpr int rows_per_warp = 2 * granularity;
1224
+ constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.
1225
+
1226
+ y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
1227
+
1228
+ const int * x_qs = (const int *) x;
1229
+ const float * x_df = (const float *) x_qs + 2*MMQ_TILE_NE_K;
1230
+ const int * y_qs = (const int *) y + 4;
1231
+ const float * y_df = (const float *) y;
1232
+ const half2 * y_ds = (const half2 *) y;
1233
+
1234
+ tile_A A[ntx][MMQ_TILE_NE_K/QI8_0]; // one A tile per x minitile and per QI8_0-wide k step, all loaded before the j0 loop
1235
+ float dA[ntx][tile_C::ne/2][MMQ_TILE_NE_K/QI8_0]; // x scales matching A, read for rows get_i(2*l)
1236
+
1237
+ const int i0 = (threadIdx.y/ntx)*rows_per_warp;
1238
+
1239
+ #pragma unroll
1240
+ for (int n = 0; n < ntx; ++n) {
1241
+ #pragma unroll
1242
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_0) {
1243
+ const int k0 = k00 + k01;
1244
+
1245
+ load_ldmatrix(A[n][k01/QI8_0], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_0 + k0, MMQ_MMA_TILE_X_K_Q8_0);
1246
+ }
1247
+
1248
+ #pragma unroll
1249
+ for (int l = 0; l < tile_C::ne/2; ++l) {
1250
+ const int i = i0 + n*tile_A::I + tile_C::get_i(2*l);
1251
+
1252
+ #pragma unroll
1253
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_0) {
1254
+ const int k0 = k00 + k01;
1255
+
1256
+ dA[n][l][k01/QI8_0] = x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + k0/QI8_0];
1257
+ }
1258
+ }
1259
+ }
1260
+
1261
+ #pragma unroll
1262
+ for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
1263
+ #pragma unroll
1264
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_0) {
1265
+ tile_B B;
1266
+ float dB[tile_C::ne/2];
1267
+
1268
+ load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K); // faster than load_ldmatrix
1269
+
1270
+ #pragma unroll
1271
+ for (int l = 0; l < tile_C::ne/2; ++l) {
1272
+ const int j = j0 + tile_C::get_j(l);
1273
+
1274
+ if (ds_layout == MMQ_Q8_1_DS_LAYOUT_D4) {
1275
+ dB[l] = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
1276
+ } else {
1277
+ dB[l] = __low2float(y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]);
1278
+ }
1279
+ }
1280
+
1281
+ #pragma unroll
1282
+ for (int n = 0; n < ntx; ++n) {
1283
+ tile_C C;
1284
+ mma(C, A[n][k01/QI8_0], B);
1285
+
1286
+ #pragma unroll
1287
+ for (int l = 0; l < tile_C::ne; ++l) {
1288
+ sum[(j0/tile_C::J + n)*tile_C::ne + l] += C.x[l]*dA[n][l/2][k01/QI8_0]*dB[l%2]; // dA was stored for get_i(2*l) -> index l/2; dB was stored for get_j(l) -> index l%2
1289
+ }
1290
+ }
1291
+ }
1292
+ }
1293
+ #endif // defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
1294
+ }
1295
+
1296
+ // Same loop structure as the q8_0 dp4a variant, but x and y both carry half2 values (x_dm, y_ds) that are passed to vec_dot_q8_1_q8_1_impl.
1297
+ template <int mmq_x, int mmq_y>
1298
+ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a(
1299
+ const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
1300
+ constexpr int nwarps = mmq_get_nwarps_device();
1301
+ constexpr int warp_size = ggml_cuda_get_physical_warp_size();
1302
+
1303
+ constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_1, mmq_y); // tile sizes are taken from the Q5_1 type here
1304
+ const int * x_qs = (const int *) x;
1305
+ const half2 * x_dm = (const half2 *) x_qs + txs.qs; // half2 view of x, txs.qs elements past x_qs
1306
+ const int * y_qs = (const int *) y + 4;
1307
+ const half2 * y_ds = (const half2 *) y;
1308
+
1309
+ // #pragma unroll
1310
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += VDR_Q8_0_Q8_1_MMQ) {
1311
+ const int k0 = k00 + k01; // x is indexed with k0, y with k01
1312
+
1313
+ #pragma unroll
1314
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
1315
+ const int j = j0 + threadIdx.y;
1316
+
1317
+ #pragma unroll
1318
+ for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
1319
+ const int i = i0 + threadIdx.x;
1320
+ // One accumulator slot in sum per (j0, i0) step of the two enclosing loops.
1321
+ sum[j0/nwarps*mmq_y/warp_size + i0/warp_size] += vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
1322
+ (&x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01],
1323
+ x_dm[i*(MMQ_TILE_NE_K/QI5_1) + i/QI5_1 + k0/QI8_1], y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]);
1324
+ }
1325
+ }
1326
+ }
1327
+ }
1328
+ // Tile variant for half2-scaled x and y: adds dmA.x*dsB.x*C.x[l] + dmA.y*dsB.y into sum for every C element.
1329
+ template <int mmq_x, int mmq_y>
1330
+ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma(
1331
+ const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
1332
+ #if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
1333
+ constexpr data_layout input_layout = get_input_data_layout();
1334
+ typedef tile<16, 8, int, input_layout> tile_A;
1335
+ typedef tile<16, 8, int, input_layout> tile_B;
1336
+ typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
1337
+
1338
+ constexpr int granularity = mmq_get_granularity_device(mmq_x);
1339
+ constexpr int rows_per_warp = granularity;
1340
+ constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.
1341
+
1342
+ y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K); // advance y by (threadIdx.y % ntx) groups of tile_C::J rows of MMQ_TILE_Y_K ints
1343
+
1344
+ const int * x_qs = (const int *) x;
1345
+ const half2 * x_dm = (const half2 *) x_qs + 2*MMQ_TILE_NE_K; // half2 view of x, 2*MMQ_TILE_NE_K elements past x_qs
1346
+ const int * y_qs = (const int *) y + 4;
1347
+ const half2 * y_dm = (const half2 *) y;
1348
+
1349
+ const int i0 = (threadIdx.y / ntx) * rows_per_warp; // first x row used by this threadIdx.y group
1350
+
1351
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_1) {
1352
+ const int k0 = k00 + k01;
1353
+
1354
+ tile_A A[ntx];
1355
+ #pragma unroll
1356
+ for (int n = 0; n < ntx; ++n) {
1357
+ load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_1 + k0, MMQ_MMA_TILE_X_K_Q8_1);
1358
+ }
1359
+
1360
+ #pragma unroll
1361
+ for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
1362
+ tile_B B;
1363
+ load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
1364
+
1365
+ const int j = j0 + tile_C::get_j(0); // a single y half2 per thread is read in this path
1366
+ const float2 dsB = __half22float2(y_dm[j*MMQ_TILE_Y_K + k01/QI8_1]);
1367
+
1368
+ #pragma unroll
1369
+ for (int n = 0; n < ntx; ++n) {
1370
+ tile_C C;
1371
+ mma(C, A[n], B);
1372
+
1373
+ #pragma unroll
1374
+ for (int l = 0; l < tile_C::ne; ++l) {
1375
+ const int i = i0 + n*tile_A::I + tile_C::get_i(l);
1376
+ float2 dmA = __half22float2(x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + k0/QI8_1]);
1377
+ sum[(j0/tile_C::J + n)*tile_C::ne + l] += dmA.x*dsB.x*C.x[l]; // integer product scaled by the .x components
1378
+ sum[(j0/tile_C::J + n)*tile_C::ne + l] += dmA.y*dsB.y; // additive term from the .y components, independent of C
1379
+ }
1380
+ }
1381
+ }
1382
+ }
1383
+ #else
1384
+ typedef tile<16, 8, int> tile_A;
1385
+ typedef tile< 8, 8, int> tile_B;
1386
+ typedef tile<16, 8, int> tile_C;
1387
+
1388
+ constexpr int granularity = mmq_get_granularity_device(mmq_x);
1389
+ constexpr int rows_per_warp = 2 * granularity;
1390
+ constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.
1391
+
1392
+ y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
1393
+
1394
+ const int * x_qs = (const int *) x;
1395
+ const half2 * x_dm = (const half2 *) x_qs + 2*MMQ_TILE_NE_K;
1396
+ const int * y_qs = (const int *) y + 4;
1397
+ const half2 * y_dm = (const half2 *) y;
1398
+
1399
+ tile_A A[ntx][MMQ_TILE_NE_K/QI8_1]; // one A tile per x minitile and per QI8_1-wide k step, all loaded before the j0 loop
1400
+ float2 dmA[ntx][tile_C::ne/2][MMQ_TILE_NE_K/QI8_1]; // x half2 values converted to float2, read for rows get_i(2*l)
1401
+
1402
+ const int i0 = (threadIdx.y/ntx)*rows_per_warp;
1403
+
1404
+ #pragma unroll
1405
+ for (int n = 0; n < ntx; ++n) {
1406
+ #pragma unroll
1407
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_1) {
1408
+ const int k0 = k00 + k01;
1409
+
1410
+ load_ldmatrix(A[n][k01/QI8_1], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_1 + k0, MMQ_MMA_TILE_X_K_Q8_1);
1411
+ }
1412
+
1413
+ #pragma unroll
1414
+ for (int l = 0; l < tile_C::ne/2; ++l) {
1415
+ const int i = i0 + n*tile_A::I + tile_C::get_i(2*l);
1416
+
1417
+ #pragma unroll
1418
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_1) {
1419
+ const int k0 = k00 + k01;
1420
+
1421
+ dmA[n][l][k01/QI8_1] = __half22float2(x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + k0/QI8_1]);
1422
+ }
1423
+ }
1424
+ }
1425
+
1426
+ #pragma unroll
1427
+ for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
1428
+ #pragma unroll
1429
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_1) {
1430
+ tile_B B;
1431
+ float2 dsB[tile_C::ne/2];
1432
+
1433
+ load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K); // faster than load_ldmatrix
1434
+
1435
+ #pragma unroll
1436
+ for (int l = 0; l < tile_C::ne/2; ++l) {
1437
+ const int j = j0 + tile_C::get_j(l);
1438
+
1439
+ dsB[l] = __half22float2(y_dm[j*MMQ_TILE_Y_K + k01/QI8_1]);
1440
+ }
1441
+
1442
+ #pragma unroll
1443
+ for (int n = 0; n < ntx; ++n) {
1444
+ tile_C C;
1445
+ mma(C, A[n][k01/QI8_1], B);
1446
+
1447
+ #pragma unroll
1448
+ for (int l = 0; l < tile_C::ne; ++l) {
1449
+ sum[(j0/tile_C::J + n)*tile_C::ne + l] += dmA[n][l/2][k01/QI8_1].x*dsB[l%2].x*C.x[l]; // dmA stored for get_i(2*l) -> index l/2; dsB stored for get_j(l) -> index l%2
1450
+ sum[(j0/tile_C::J + n)*tile_C::ne + l] += dmA[n][l/2][k01/QI8_1].y*dsB[l%2].y;
1451
+ }
1452
+ }
1453
+ }
1454
+ }
1455
+ #endif // defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
1456
+ }
1457
+ // dp4a variant that steps k01 by QI8_0 and hands vec_dot_q8_0_16_q8_1_impl a pointer into x_df (not a single value) plus one y float.
1458
+ // Used for NVFP4, Q3_K, IQ2_S, and IQ2_XS
1459
+ template <int mmq_x, int mmq_y>
1460
+ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_dp4a(
1461
+ const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
1462
+ constexpr int nwarps = mmq_get_nwarps_device();
1463
+ constexpr int warp_size = ggml_cuda_get_physical_warp_size();
1464
+
1465
+ constexpr tile_x_sizes txs = MMQ_DP4A_TXS_Q8_0_16; // fixed tile-size constant rather than a per-type lookup
1466
+ const int * x_qs = (const int *) x;
1467
+ const float * x_df = (const float *) x_qs + txs.qs; // float view of x, txs.qs elements past x_qs
1468
+ const int * y_qs = (const int *) y + 4;
1469
+ const float * y_df = (const float *) y;
1470
+
1471
+ // #pragma unroll
1472
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_0) {
1473
+ const int k0 = k00 + k01; // x is indexed with k0, y with k01
1474
+
1475
+ #pragma unroll
1476
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
1477
+ const int j = j0 + threadIdx.y;
1478
+
1479
+ #pragma unroll
1480
+ for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
1481
+ const int i = i0 + threadIdx.x;
1482
+ // One accumulator slot in sum per (j0, i0) step of the two enclosing loops.
1483
+ sum[j0/nwarps*mmq_y/warp_size + i0/warp_size] += vec_dot_q8_0_16_q8_1_impl<QI8_0>(
1484
+ &x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0],
1485
+ &y_qs[j*MMQ_TILE_Y_K + k01],
1486
+ &x_df[i*(2*MMQ_TILE_NE_K*2/QI8_0) + i/(QI8_0/4) + k0/(QI8_0/2)],
1487
+ y_df[j*MMQ_TILE_Y_K + k01/QI8_1]);
1488
+ }
1489
+ }
1490
+ }
1491
+ }
1492
+ // Tile variant with an x float every 4 ints of k: three compile-time paths (AMD MFMA/WMMA, Turing MMA, or NO_DEVICE_CODE when neither is defined).
1493
+ // Used for Q3_K, IQ2_S, and IQ2_XS:
1494
+ template <int mmq_x, int mmq_y>
1495
+ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
1496
+ const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
1497
+ #if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
1498
+ constexpr data_layout input_layout = get_input_data_layout();
1499
+ typedef tile<16, 4, int, input_layout> tile_A;
1500
+ typedef tile<16, 4, int, input_layout> tile_B;
1501
+ typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
1502
+
1503
+ constexpr int granularity = mmq_get_granularity_device(mmq_x);
1504
+ constexpr int rows_per_warp = granularity;
1505
+ constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.
1506
+
1507
+ y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K); // advance y by (threadIdx.y % ntx) groups of tile_C::J rows of MMQ_TILE_Y_K ints
1508
+
1509
+ const int * x_qs = (const int *) x;
1510
+ const float * x_df = (const float *) x_qs + MMQ_TILE_NE_K*2; // float view of x, MMQ_TILE_NE_K*2 elements past x_qs
1511
+ const int * y_qs = (const int *) y + 4;
1512
+ const float * y_df = (const float *) y;
1513
+
1514
+ const int i0 = (threadIdx.y / ntx) * rows_per_warp; // first x row used by this threadIdx.y group
1515
+
1516
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) {
1517
+ const int k0 = k00 + k01;
1518
+
1519
+ tile_A A[ntx];
1520
+ #pragma unroll
1521
+ for (int n = 0; n < ntx; ++n) {
1522
+ load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q3_K + k0, MMQ_MMA_TILE_X_K_Q3_K);
1523
+ }
1524
+
1525
+ #pragma unroll
1526
+ for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
1527
+ tile_B B;
1528
+ load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
1529
+
1530
+ const int j = j0 + tile_C::get_j(0); // a single y scale per thread is read in this path
1531
+ const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
1532
+
1533
+ #pragma unroll
1534
+ for (int n = 0; n < ntx; ++n) {
1535
+ tile_C C;
1536
+ mma(C, A[n], B);
1537
+
1538
+ #pragma unroll
1539
+ for (int l = 0; l < tile_C::ne; ++l) {
1540
+ const int i = i0 + n*tile_C::I + tile_C::get_i(l);
1541
+ sum[(j0/tile_C::J + n)*tile_C::ne + l] += C.x[l] * x_df[i*MMQ_MMA_TILE_X_K_Q3_K + k0/4] * dB; // x float is indexed per 4 ints of k (k0/4)
1542
+ }
1543
+ }
1544
+ }
1545
+ }
1546
+ #elif defined(TURING_MMA_AVAILABLE)
1547
+
1548
+ typedef tile<16, 4, int> tile_A;
1549
+ typedef tile<16, 8, int> tile_A_8;
1550
+ typedef tile< 8, 4, int> tile_B;
1551
+ typedef tile<16, 8, int> tile_C;
1552
+
1553
+ constexpr int granularity = mmq_get_granularity_device(mmq_x);
1554
+ constexpr int rows_per_warp = 2 * granularity;
1555
+ constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.
1556
+
1557
+ y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
1558
+
1559
+ const int * x_qs = (const int *) x;
1560
+ const float * x_df = (const float *) x_qs + MMQ_TILE_NE_K*2;
1561
+ const int * y_qs = (const int *) y + 4;
1562
+ const float * y_df = (const float *) y;
1563
+
1564
+ const int i0 = (threadIdx.y / ntx) * (ntx*tile_A::I);
1565
+
1566
+ tile_A A[ntx][8]; // 16x4 tiles; written through the tile_A_8 cast below (presumably two adjacent 16x4 tiles per 16x8 load)
1567
+ float dA[ntx][tile_C::ne/2][8]; // x floats for rows get_i(2*l), one per 4 ints of k
1568
+
1569
+ #pragma unroll
1570
+ for (int n = 0; n < ntx; ++n) {
1571
+ #pragma unroll
1572
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 8) {
1573
+ const int k0 = k00 + k01;
1574
+
1575
+ load_ldmatrix(((tile_A_8 *) A[n])[k01/8], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q3_K + k0, MMQ_MMA_TILE_X_K_Q3_K);
1576
+ }
1577
+
1578
+ #pragma unroll
1579
+ for (int l = 0; l < tile_C::ne/2; ++l) {
1580
+ const int i = i0 + n*tile_C::I + tile_C::get_i(2*l);
1581
+
1582
+ #pragma unroll
1583
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) {
1584
+ const int k0 = k00 + k01;
1585
+
1586
+ dA[n][l][k01/4] = x_df[i*MMQ_MMA_TILE_X_K_Q3_K + k0/4];
1587
+ }
1588
+ }
1589
+ }
1590
+
1591
+ #pragma unroll
1592
+ for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
1593
+ #pragma unroll
1594
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QR3_K*VDR_Q3_K_Q8_1_MMQ) {
1595
+ tile_B B[2];
1596
+ float dB[tile_C::ne/2];
1597
+
1598
+ // Here load_generic is faster than load_ldmatrix.
1599
+ load_generic(B[0], y_qs + j0*MMQ_TILE_Y_K + (k01 + 0), MMQ_TILE_Y_K);
1600
+ load_generic(B[1], y_qs + j0*MMQ_TILE_Y_K + (k01 + tile_B::J), MMQ_TILE_Y_K);
1601
+
1602
+ #pragma unroll
1603
+ for (int l = 0; l < tile_C::ne/2; ++l) {
1604
+ const int j = j0 + tile_C::get_j(l);
1605
+
1606
+ dB[l] = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
1607
+ }
1608
+
1609
+ #pragma unroll
1610
+ for (int n = 0; n < ntx; ++n) {
1611
+ tile_C C[2];
1612
+ mma(C[0], A[n][k01/4 + 0], B[0]);
1613
+ mma(C[1], A[n][k01/4 + 1], B[1]);
1614
+
1615
+ #pragma unroll
1616
+ for (int l = 0; l < tile_C::ne; ++l) {
1617
+ sum[(j0/tile_C::J + n)*tile_C::ne + l] += dB[l%2]*(C[0].x[l]*dA[n][l/2][k01/4 + 0] + C[1].x[l]*dA[n][l/2][k01/4 + 1]); // each of the two products has its own x float; both share the y float dB
1618
+ }
1619
+ }
1620
+ }
1621
+ }
1622
+ #else
1623
+ GGML_UNUSED_VARS(x, y, sum, k00);
1624
+ NO_DEVICE_CODE;
1625
+ #endif // AMD_MFMA_AVAILABLE || AMD_WMMA_AVAILABLE
1626
+ }
1627
+ // Reads block_q2_K data from x and writes the unpacked 2-bit values (x_qs) and the nibble-scaled half2 dm values (x_dm) into x_tile; the tile row stride depends on which MMA macros are defined.
1628
+ template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
1629
+ const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
1630
+ constexpr int nwarps = mmq_get_nwarps_device();
1631
+
1632
+ #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
1633
+ int * x_qs = (int *) x_tile;
1634
+ half2 * x_dm = (half2 *) (x_qs + 2*MMQ_TILE_NE_K);
1635
+ #else
1636
+ constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q2_K, mmq_y);
1637
+ int * x_qs = (int *) x_tile;
1638
+ half2 * x_dm = (half2 *) (x_qs + txs.qs);
1639
+ #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
1640
+
1641
+ constexpr int threads_per_row = MMQ_ITER_K / (4 * QR2_K);
1642
+ constexpr int nrows = ggml_cuda_get_physical_warp_size() / threads_per_row; // rows covered by one warp-sized group of threadIdx.x values
1643
+ const int kqsx = threadIdx.x % threads_per_row; // this thread's int index within its row
1644
+
1645
+ #pragma unroll
1646
+ for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) {
1647
+ int i = i0 + threadIdx.y*nrows + threadIdx.x/threads_per_row;
1648
+
1649
+ if (need_check) {
1650
+ i = min(i, i_max); // clamp the row index to i_max
1651
+ }
1652
+
1653
+ const block_q2_K * bxi = (const block_q2_K *) x + kbx0 + i*stride;
1654
+
1655
+ const int x_ql_0 = get_int_b2(bxi->qs, kqsx);
1656
+
1657
+ #pragma unroll
1658
+ for (int l = 0; l < QR2_K; ++l) {
1659
+ const int k = (kqsx/8)*32 + l*8 + kqsx % 8; // destination int index within the tile row
1660
+
1661
+ const int x_qs_k = (x_ql_0 >> (2*l)) & 0x03030303; // keep bits 2*l and 2*l+1 of every byte
1662
+
1663
+ #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
1664
+ x_qs[i*MMQ_MMA_TILE_X_K_Q2_K + k] = x_qs_k;
1665
+ #else
1666
+ x_qs[i*(2*MMQ_TILE_NE_K + 1) + k] = x_qs_k;
1667
+ #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
1668
+ }
1669
+
1670
+ const int sc_m = bxi->scales[kqsx]; // low nibble multiplies dm.x and high nibble multiplies dm.y below
1671
+ #ifdef FAST_FP16_AVAILABLE
1672
+ const half2 x_dm_ik = __hmul2(bxi->dm, make_half2(sc_m & 0x0F, sc_m >> 4));
1673
+ #else
1674
+ const float2 bxi_dmf = __half22float2(bxi->dm);
1675
+ const half2 x_dm_ik = make_half2(bxi_dmf.x*(sc_m & 0x0F), bxi_dmf.y*(sc_m >> 4));
1676
+ #endif // FAST_FP16_AVAILABLE
1677
+
1678
+ #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
1679
+ x_dm[i*MMQ_MMA_TILE_X_K_Q2_K + kqsx] = x_dm_ik;
1680
+ #else
1681
+ x_dm[i*(MMQ_TILE_NE_K + 1) + kqsx] = x_dm_ik;
1682
+ #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
1683
+ }
1684
+ }
1685
+ // dp4a variant for Q2_K: accumulates vec_dot_q2_K_q8_1_impl_mmq results into sum, using ns = 2 for the first half of the k01 range and ns = 1 for the second half.
1686
+ template <int mmq_x, int mmq_y>
1687
+ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a(
1688
+ const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
1689
+ constexpr int nwarps = mmq_get_nwarps_device();
1690
+ constexpr int warp_size = ggml_cuda_get_physical_warp_size();
1691
+
1692
+ constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q2_K, mmq_y);
1693
+ const int * x_qs = (const int *) x;
1694
+ const half2 * x_dm = (const half2 *) x_qs + txs.qs; // half2 view of x, txs.qs elements past x_qs
1695
+ const int * y_qs = (const int *) y + 4;
1696
+ const half2 * y_ds = (const half2 *) y;
1697
+
1698
+ float2 y_df[mmq_x/nwarps]; // first half2 of every y row this thread uses, converted to float2 once
1699
+ #pragma unroll
1700
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
1701
+ const int j = j0 + threadIdx.y;
1702
+
1703
+ y_df[j0/nwarps] = __half22float2(y_ds[j*MMQ_TILE_Y_K]);
1704
+ }
1705
+
1706
+ #pragma unroll
1707
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K/2; k01 += QR2_K*VDR_Q2_K_Q8_1_MMQ) {
1708
+ const int k0 = k00 + k01;
1709
+
1710
+ #pragma unroll
1711
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
1712
+ const int j = j0 + threadIdx.y;
1713
+
1714
+ #pragma unroll
1715
+ for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
1716
+ const int i = i0 + threadIdx.x;
1717
+ // The loop bound keeps k01 < MMQ_TILE_NE_K/2 here, so the ternary below always selects y_df[...].x.
1718
+ constexpr int ns = 2;
1719
+ sum[j0/nwarps*mmq_y/warp_size + i0/warp_size] += vec_dot_q2_K_q8_1_impl_mmq<ns>(
1720
+ &x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01],
1721
+ &x_dm[i*(MMQ_TILE_NE_K + 1) + k0/4], k01 < MMQ_TILE_NE_K/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y,
1722
+ &y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]);
1723
+ }
1724
+ }
1725
+ }
1726
+
1727
+ // Some compilers fail to unroll the loop over k01 if there is a conditional statement for ns in the inner loop.
1728
+ // As a workaround 2 separate loops are used instead.
1729
+ #pragma unroll
1730
+ for (int k01 = MMQ_TILE_NE_K/2; k01 < MMQ_TILE_NE_K; k01 += QR2_K*VDR_Q2_K_Q8_1_MMQ) {
1731
+ const int k0 = k00 + k01;
1732
+
1733
+ #pragma unroll
1734
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
1735
+ const int j = j0 + threadIdx.y;
1736
+
1737
+ #pragma unroll
1738
+ for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
1739
+ const int i = i0 + threadIdx.x;
1740
+ // k01 starts at MMQ_TILE_NE_K/2 in this loop, so the ternary below always selects y_df[...].y.
1741
+ constexpr int ns = 1;
1742
+ sum[j0/nwarps*mmq_y/warp_size + i0/warp_size] += vec_dot_q2_K_q8_1_impl_mmq<ns>(
1743
+ &x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01],
1744
+ &x_dm[i*(MMQ_TILE_NE_K + 1) + k0/4], k01 < MMQ_TILE_NE_K/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y,
1745
+ &y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]);
1746
+ }
1747
+ }
1748
+ }
1749
+ }
1750
+ // Tile variant for Q2_K: sum gets (Cd*dm.x) scaled by a y float, minus a dm.y term that comes from y_ds values for k01 < 3/4 of the tile and from an mma against all-ones bytes (Cm) for the last quarter.
1751
+ template <int mmq_x, int mmq_y>
1752
+ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
1753
+ const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
1754
+ #if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
1755
+ constexpr data_layout input_layout = get_input_data_layout();
1756
+ typedef tile<16, 4, int, input_layout> tile_A;
1757
+ typedef tile<16, 4, int, input_layout> tile_B;
1758
+ typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
1759
+
1760
+ constexpr int granularity = mmq_get_granularity_device(mmq_x);
1761
+ constexpr int rows_per_warp = granularity;
1762
+ constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.
1763
+
1764
+ y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K); // advance y by (threadIdx.y % ntx) groups of tile_C::J rows of MMQ_TILE_Y_K ints
1765
+
1766
+ const int * x_qs = (const int *) x;
1767
+ const half2 * x_dm = (const half2 *) x_qs + MMQ_TILE_NE_K*2; // half2 view of x, MMQ_TILE_NE_K*2 elements past x_qs
1768
+ const int * y_qs = (const int *) y + 4;
1769
+ const half2 * y_ds = (const half2 *) y;
1770
+
1771
+ const int i0 = (threadIdx.y / ntx) * rows_per_warp; // first x row used by this threadIdx.y group
1772
+
1773
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) {
1774
+ const int k0 = k00 + k01;
1775
+
1776
+ tile_A A[ntx];
1777
+ #pragma unroll
1778
+ for (int n = 0; n < ntx; ++n) {
1779
+ load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q2_K + k0, MMQ_MMA_TILE_X_K_Q2_K);
1780
+ }
1781
+
1782
+ #pragma unroll
1783
+ for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
1784
+ tile_B B;
1785
+ load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
1786
+
1787
+ const int j = j0 + tile_C::get_j(0);
1788
+ const float dB = (k01 < MMQ_TILE_NE_K/2) ? __half22float2(y_ds[j*MMQ_TILE_Y_K]).x : __half22float2(y_ds[j*MMQ_TILE_Y_K]).y;
1789
+ const float sB = (k01 >= MMQ_TILE_NE_K * 3/4) ? 0
1790
+ : (((k01/4)%2) ? __half22float2(y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]).y
1791
+ : __half22float2(y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]).x);
1792
+ // sB is 0 for the last quarter of k01; there the dm.y term is built from Cm (B multiplied by all-ones bytes) instead.
1793
+ tile_C Cm;
1794
+ if (k01 >= MMQ_TILE_NE_K * 3/4) {
1795
+ tile_A A1;
1796
+ #pragma unroll
1797
+ for (int l = 0; l < tile_A::ne; ++l) {
1798
+ A1.x[l] = 0x01010101;
1799
+ }
1800
+ mma(Cm, A1, B);
1801
+ }
1802
+
1803
+ #pragma unroll
1804
+ for (int n = 0; n < ntx; ++n) {
1805
+ tile_C Cd;
1806
+ mma(Cd, A[n], B);
1807
+
1808
+ #pragma unroll
1809
+ for (int l = 0; l < tile_C::ne; ++l) {
1810
+ const int i = i0 + n*tile_C::I + tile_C::get_i(l);
1811
+ const float2 dm = __half22float2(x_dm[i*MMQ_MMA_TILE_X_K_Q2_K + k0/4]);
1812
+ float tmp = Cd.x[l]*dm.x;
1813
+ if (k01 >= MMQ_TILE_NE_K * 3/4) {
1814
+ tmp -= Cm.x[l]*dm.y; // Cm is only written, and only read, under this same condition
1815
+ }
1816
+ sum[(j0/tile_C::J + n)*tile_C::ne + l] += tmp*dB;
1817
+ sum[(j0/tile_C::J + n)*tile_C::ne + l] -= dm.y*sB;
1818
+ }
1819
+ }
1820
+ }
1821
+ }
1822
+ #elif defined(TURING_MMA_AVAILABLE)
1823
+
1824
+ typedef tile<16, 4, int> tile_A;
1825
+ typedef tile<16, 8, int> tile_A_8;
1826
+ typedef tile< 8, 4, int> tile_B;
1827
+ typedef tile<16, 8, int> tile_C;
1828
+
1829
+ constexpr int granularity = mmq_get_granularity_device(mmq_x);
1830
+ constexpr int rows_per_warp = 2 * granularity;
1831
+ constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.
1832
+
1833
+ y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
1834
+
1835
+ const int * x_qs = (const int *) x;
1836
+ const half2 * x_dm = (const half2 *) x_qs + MMQ_TILE_NE_K*2;
1837
+ const int * y_qs = (const int *) y + 4;
1838
+ const half2 * y_ds = (const half2 *) y;
1839
+
1840
+ const int i0 = (threadIdx.y / ntx) * (ntx*tile_A::I);
1841
+
1842
+ tile_A A[ntx][8]; // 16x4 tiles; written through the tile_A_8 cast below (presumably two adjacent 16x4 tiles per 16x8 load)
1843
+ float dA[ntx][tile_C::ne/2][8]; // dm.x of the x half2 values, for rows get_i(2*l)
1844
+ float mA[ntx][tile_C::ne/2][8]; // dm.y of the same half2 values
1845
+
1846
+ #pragma unroll
1847
+ for (int n = 0; n < ntx; ++n) {
1848
+ #pragma unroll
1849
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_1) {
1850
+ const int k0 = k00 + k01;
1851
+
1852
+ load_ldmatrix(((tile_A_8 *) A[n])[k01/QI8_1], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q2_K + k0, MMQ_MMA_TILE_X_K_Q2_K);
1853
+ }
1854
+ }
1855
+
1856
+ #pragma unroll
1857
+ for (int n = 0; n < ntx; ++n) {
1858
+ #pragma unroll
1859
+ for (int l = 0; l < tile_C::ne/2; ++l) {
1860
+ const int i = i0 + n*tile_C::I + tile_C::get_i(2*l);
1861
+
1862
+ #pragma unroll
1863
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_1/2) {
1864
+ const int k0 = k00 + k01;
1865
+
1866
+ const float2 dm = __half22float2(x_dm[i*MMQ_MMA_TILE_X_K_Q2_K + k0/(QI8_1/2)]);
1867
+
1868
+ dA[n][l][k01/(QI8_1/2)] = dm.x;
1869
+ mA[n][l][k01/(QI8_1/2)] = dm.y;
1870
+ }
1871
+ }
1872
+ }
1873
+
1874
+ #pragma unroll
1875
+ for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
1876
+ float2 dB[tile_C::ne/2]; // first half2 of each y row get_j(l); .x is used for the first half of k01, .y for the second
1877
+
1878
+ #pragma unroll
1879
+ for (int l = 0; l < tile_C::ne/2; ++l) {
1880
+ const int j = j0 + tile_C::get_j(l);
1881
+
1882
+ dB[l] = __half22float2(y_ds[j*MMQ_TILE_Y_K]);
1883
+ }
1884
+
1885
+ #pragma unroll
1886
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_1) {
1887
+ tile_B B[2];
1888
+
1889
+ // Here load_generic is faster than load_ldmatrix.
1890
+ load_generic(B[0], y_qs + j0*MMQ_TILE_Y_K + (k01 + 0), MMQ_TILE_Y_K);
1891
+ load_generic(B[1], y_qs + j0*MMQ_TILE_Y_K + (k01 + tile_B::J), MMQ_TILE_Y_K);
1892
+
1893
+ tile_C Cm[2];
1894
+ if (k01 >= MMQ_TILE_NE_K * 3/4) {
1895
+ tile_A A1;
1896
+ A1.x[0] = 0x01010101;
1897
+ A1.x[1] = 0x01010101;
1898
+ mma(Cm[0], A1, B[0]);
1899
+ mma(Cm[1], A1, B[1]);
1900
+ }
1901
+
1902
+ #pragma unroll
1903
+ for (int n = 0; n < ntx; ++n) {
1904
+ tile_C Cd[2];
1905
+
1906
+ mma(Cd[0], A[n][k01/4 + 0], B[0]);
1907
+ mma(Cd[1], A[n][k01/4 + 1], B[1]);
1908
+
1909
+ #pragma unroll
1910
+ for (int l = 0; l < tile_C::ne; ++l) {
1911
+ float tmp = Cd[0].x[l]*dA[n][l/2][k01/4 + 0] + Cd[1].x[l]*dA[n][l/2][k01/4 + 1];
1912
+ if (k01 >= MMQ_TILE_NE_K * 3/4) {
1913
+ tmp -= Cm[0].x[l]*mA[n][l/2][k01/4 + 0] + Cm[1].x[l]*mA[n][l/2][k01/4 + 1]; // Cm is only written, and only read, under this same condition
1914
+ }
1915
+ sum[(j0/tile_C::J + n)*tile_C::ne + l] += tmp*(k01 < MMQ_TILE_NE_K/2 ? dB[l%2].x : dB[l%2].y);
1916
+ }
1917
+ }
1918
+ }
1919
+ // For the first three quarters of k01 the mA term is subtracted using the half2 values stored after the first one in each y row.
1920
+ #pragma unroll
1921
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K * 3/4; k01 += QI8_1) {
1922
+ float2 sB[tile_C::ne/2];
1923
+
1924
+ #pragma unroll
1925
+ for (int l = 0; l < tile_C::ne/2; ++l) {
1926
+ const int j = j0 + tile_C::get_j(l);
1927
+
1928
+ sB[l] = __half22float2(y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]);
1929
+ }
1930
+
1931
+ #pragma unroll
1932
+ for (int n = 0; n < ntx; ++n) {
1933
+ #pragma unroll
1934
+ for (int l = 0; l < tile_C::ne; ++l) {
1935
+ sum[(j0/tile_C::J + n)*tile_C::ne + l] -= mA[n][l/2][k01/4 + 0]*sB[l%2].x;
1936
+ sum[(j0/tile_C::J + n)*tile_C::ne + l] -= mA[n][l/2][k01/4 + 1]*sB[l%2].y;
1937
+ }
1938
+ }
1939
+ }
1940
+ }
1941
+ #else
1942
+ GGML_UNUSED_VARS(x, y, sum, k00);
1943
+ NO_DEVICE_CODE;
1944
+ #endif // AMD_MFMA_AVAILABLE || AMD_WMMA_AVAILABLE
1945
+ }
1946
+
1947
// load_tiles_q3_K: unpacks one tile of Q3_K blocks from `x` into the tile buffer `x_tile`.
//   x      - raw block data, reinterpreted as an array of block_q3_K
//   x_tile - destination buffer, carved into x_qs / x_df (plus x_sc on the non-MMA path)
//   kbx0   - block offset added to every row
//   i_max  - rows are clamped to this index when need_check is set
//   stride - distance between consecutive rows, counted in blocks
// Pass 1 writes the quants: every output byte is (2 bits from qs | 1 bit from hmask) - 4.
// Pass 2 rebuilds the packed scales as (low 4 bits | high 2 bits) - 32; MMA builds store d*scale as
// floats in x_df, other builds store the packed scales in x_sc.
// Pass 3 (non-MMA builds only) stores the per-block d in x_df.
+ template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
1948
+ const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
1949
+ constexpr int nwarps = mmq_get_nwarps_device();
1950
+ constexpr int warp_size = ggml_cuda_get_physical_warp_size();
1951
+
1952
+ #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
1953
+ int * x_qs = (int *) x_tile;
1954
+ float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2);
1955
+ #else
1956
+ constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q3_K, mmq_y);
1957
+ int * x_qs = (int *) x_tile;
1958
+ float * x_df = (float *) (x_qs + txs.qs);
1959
+ int * x_sc = (int *) (x_df + txs.dm);
1960
+ #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
1961
+
1962
+ constexpr int threads_per_row = MMQ_ITER_K / (4 * QR3_K);
1963
+ constexpr int nrows = warp_size / threads_per_row;
1964
+ const int kqsx = threadIdx.x % threads_per_row;
1965
+
1966
+ #pragma unroll
1967
+ for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) {
1968
+ int i = i0 + threadIdx.y*nrows + threadIdx.x/threads_per_row;
1969
+
1970
+ if (need_check) {
1971
+ i = min(i, i_max);
1972
+ }
1973
+
1974
+ const block_q3_K * bxi = (const block_q3_K *) x + kbx0 + i*stride;
1975
+
1976
+ const int x_ql_0 = get_int_b2(bxi->qs, kqsx);
1977
+ const int x_qh_0 = get_int_b2(bxi->hmask, kqsx % (QI3_K/2)) >> (4 * (kqsx / (QI3_K/2)));
1978
+
1979
+ #pragma unroll
1980
+ for (int l = 0; l < QR3_K; ++l) {
1981
+ const int k = (kqsx/8)*32 + l*8 + kqsx % 8;
1982
+
1983
+ const int x_ql_k = (x_ql_0 >> (2*l)) & 0x03030303; // two low bits of each byte
1984
+ const int x_qh_k = ((x_qh_0 >> l) << 2) & 0x04040404; // one mask bit moved to bit 2 of each byte
1985
+
1986
+ const int x_qs_k = __vsubss4(x_ql_k | x_qh_k, 0x04040404); // per-byte subtraction of 4
1987
+
1988
+ #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
1989
+ x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + k] = x_qs_k;
1990
+ #else
1991
+ x_qs[i*(2*MMQ_TILE_NE_K + 1) + k] = x_qs_k;
1992
+ #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
1993
+ }
1994
+ }
1995
+
1996
+ constexpr int rows_per_warp = warp_size / 4;
1997
+ #pragma unroll
1998
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps*rows_per_warp) {
1999
+ int i = i0 + threadIdx.y*rows_per_warp + threadIdx.x/4;
2000
+
2001
+ if (need_check) {
2002
+ i = min(i, i_max);
2003
+ }
2004
+
2005
+ const block_q3_K * bxi = (const block_q3_K *) x + kbx0 + i*stride;
2006
+
2007
+ const int ksc = threadIdx.x % 4;
2008
+
2009
+ const int ksc_low = ksc % (QI3_K/8);
2010
+ const int shift_low = 4 * (ksc / (QI3_K/8));
2011
+ const int sc_low = (get_int_b2(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F; // low 4 bits of four scales
2012
+
2013
+ const int ksc_high = QI3_K/8;
2014
+ const int shift_high = 2 * ksc;
2015
+ const int sc_high = ((get_int_b2(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030; // high 2 bits, moved to bits 4-5
2016
+
2017
+ const int sc = __vsubss4(sc_low | sc_high, 0x20202020); // per-byte subtraction of 32
2018
+
2019
+ #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2020
+ const int8_t * sc8 = (const int8_t *) &sc;
2021
+ const float d = bxi->d;
2022
+
2023
+ #pragma unroll
2024
+ for (int l = 0; l < int(sizeof(int)); ++l) {
2025
+ x_df[i*MMQ_MMA_TILE_X_K_Q3_K + sizeof(int)*ksc + l] = d*sc8[l];
2026
+ }
2027
+ #else
2028
+ x_sc[i*(MMQ_TILE_NE_K/8) + i/8 + ksc] = sc;
2029
+ #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2030
+ }
2031
+
2032
+ #if !(defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE))
2033
+ #pragma unroll
2034
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps*warp_size) {
2035
+ int i = (i0 + threadIdx.y*warp_size + threadIdx.x) % mmq_y;
2036
+
2037
+ if (need_check) {
2038
+ i = min(i, i_max);
2039
+ }
2040
+
2041
+ const block_q3_K * bxi = (const block_q3_K *) x + kbx0 + i*stride;
2042
+
2043
+ x_df[i] = bxi->d;
2044
+ }
2045
+ #endif // !(defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE))
2046
+ }
2047
+
2048
// vec_dot_q3_K_q8_1_dp4a: adds into `sum` the partial dot products between the Q3_K tile `x` and the
// q8_1 tile `y`, starting at k-offset k00 inside the x tile.
// x is carved into x_qs / x_df / x_sc with the same sizes and row strides that the non-MMA branch of
// load_tiles_q3_K uses; in every y row the quantized ints start 4 ints in and the row is also read as floats.
// Each thread covers rows i = i0 + threadIdx.x and columns j = j0 + threadIdx.y.
+ template <int mmq_x, int mmq_y>
2049
+ static __device__ __forceinline__ void vec_dot_q3_K_q8_1_dp4a(
2050
+ const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
2051
+ constexpr int nwarps = mmq_get_nwarps_device();
2052
+ constexpr int warp_size = ggml_cuda_get_physical_warp_size();
2053
+
2054
+ constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q3_K, mmq_y);
2055
+ const int * x_qs = (const int *) x;
2056
+ const float * x_df = (const float *) x_qs + txs.qs;
2057
+ const int * x_sc = (const int *) x_df + txs.dm;
2058
+ const int * y_qs = (const int *) y + 4;
2059
+ const float * y_df = (const float *) y;
2060
+
2061
+ // #pragma unroll
2062
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QR3_K*VDR_Q3_K_Q8_1_MMQ) {
2063
+ const int k0 = k00 + k01; // k00 offsets the x tile only; y is indexed with k01
2064
+
2065
+ #pragma unroll
2066
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
2067
+ const int j = j0 + threadIdx.y;
2068
+
2069
+ #pragma unroll
2070
+ for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
2071
+ const int i = i0 + threadIdx.x;
2072
+
2073
+ const int8_t * scales = ((const int8_t *) (x_sc + i*(MMQ_TILE_NE_K/8) + i/8)) + k0/4; // byte-wise view of the packed scales of row i
2074
+
2075
+ sum[j0/nwarps*mmq_y/warp_size + i0/warp_size] += vec_dot_q3_K_q8_1_impl_mmq(
2076
+ &x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01], scales,
2077
+ x_df[i], y_df[j*MMQ_TILE_Y_K + k01/QI8_1]);
2078
+ }
2079
+ }
2080
+ }
2081
+ }
2082
+
2083
// unpack_scales_q45_K: returns one int holding four bytes selected by ksc (0..3, see the table below).
// Every returned byte combines a 4-bit field (mask 0x0F) with a 2-bit field placed at bits 4-5 (mask 0x30),
// both read from the packed words in `scales`.
+ static __device__ __forceinline__ int unpack_scales_q45_K(const int * scales, const int ksc) {
2084
+ // scale arrangement after the following two lines:
2085
+ // - ksc == 0: sc0, sc1, sc2, sc3
2086
+ // - ksc == 1: sc4, sc5, sc6, sc7
2087
+ // - ksc == 2: m0, m1, m2, m3
2088
+ // - ksc == 3: m4, m5, m6, m7
2089
+ return ((scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F) | // lower 4 bits
2090
+ ((scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030); // upper 2 bits
2091
+ }
2092
+
2093
// load_tiles_q4_K: unpacks one tile of Q4_K blocks from `x` into the tile buffer `x_tile`.
//   x      - raw block data, reinterpreted as an array of block_q4_K
//   x_tile - destination buffer, carved into x_qs / x_dm (plus x_sc on the non-MMA path)
//   kbx0   - block offset added to every row
//   i_max  - rows are clamped to this index when need_check is set
//   stride - distance between consecutive rows, counted in blocks
// Quants: MMA builds split every packed int into its low and high nibbles and store them 8 ints apart;
// other builds store the packed int unchanged.
// Scales: MMA builds unpack scales and mins with unpack_scales_q45_K and store dm*(scale, min) as half2,
// with the second component of dm negated; other builds store dm in x_dm and the unpacked scales in x_sc.
+ template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
2094
+ const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
2095
+ constexpr int nwarps = mmq_get_nwarps_device();
2096
+ constexpr int warp_size = ggml_cuda_get_physical_warp_size();
2097
+
2098
+ #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2099
+ int * x_qs = (int *) x_tile;
2100
+ half2 * x_dm = (half2 *) (x_qs + 2*MMQ_TILE_NE_K);
2101
+ #else
2102
+ constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_K, mmq_y);
2103
+ int * x_qs = (int *) x_tile;
2104
+ half2 * x_dm = (half2 *) (x_qs + txs.qs);
2105
+ int * x_sc = (int *) (x_dm + txs.dm);
2106
+ #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2107
+
2108
+ constexpr int threads_per_row = MMQ_ITER_K / (4 * QR4_K);
2109
+ constexpr int nrows = warp_size / threads_per_row;
2110
+ const int txi = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x;
2111
+
2112
+ #pragma unroll
2113
+ for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) {
2114
+ int i = i0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row);
2115
+
2116
+ if (need_check) {
2117
+ i = min(i, i_max);
2118
+ }
2119
+
2120
+ const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride;
2121
+ const int qs0 = get_int_b4(bxi->qs, txi);
2122
+
2123
+ #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2124
+ x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 16*(txi/8) + txi % 8 + 0] = (qs0 >> 0) & 0x0F0F0F0F; // low nibbles
2125
+ x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 16*(txi/8) + txi % 8 + 8] = (qs0 >> 4) & 0x0F0F0F0F; // high nibbles
2126
+ #else
2127
+ x_qs[i*(MMQ_TILE_NE_K + 1) + txi] = qs0;
2128
+ #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2129
+ }
2130
+
2131
+ #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2132
+ constexpr int rows_per_warp = warp_size / 2;
2133
+ #pragma unroll
2134
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps*rows_per_warp) {
2135
+ #if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2136
+ // Need if on AMD instead of % because warp_size == 64
2137
+ // This causes double work and throughput loss (MI300X)
2138
+ // H100 loses about 100 t/s with 'if' condition over '%'
2139
+ int i = i0 + threadIdx.y*rows_per_warp + threadIdx.x/2;
2140
+ if (i < mmq_y) {
2141
+ #else
2142
+ int i = (i0 + threadIdx.y*rows_per_warp + threadIdx.x/2) % mmq_y;
2143
+ {
2144
+ #endif // defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2145
+ if (need_check) {
2146
+ i = min(i, i_max);
2147
+ }
2148
+
2149
+ const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride;
2150
+
2151
+ const int * scales = (const int *) bxi->scales;
2152
+ const int ksc = threadIdx.x % 2;
2153
+
2154
+ const int sc32 = unpack_scales_q45_K(scales, ksc + 0);
2155
+ const int m32 = unpack_scales_q45_K(scales, ksc + 2);
2156
+
2157
+ const uint8_t * sc8 = (const uint8_t *) &sc32;
2158
+ const uint8_t * m8 = (const uint8_t *) &m32;
2159
+
2160
+ const half2 dm = bxi->dm * make_half2(1.0f, -1.0f);
2161
+
2162
+ #pragma unroll
2163
+ for (int l = 0; l < int(sizeof(int)); ++l) { // signed bound, same form as the q3_K and q5_K loaders
2164
+ x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + sizeof(int)*ksc + l] = dm*make_half2(sc8[l], m8[l]);
2165
+ }
2166
+ }
2167
+ }
2168
+ #else
2169
+ #pragma unroll
2170
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps*warp_size) {
2171
+ int i = (i0 + threadIdx.y*warp_size + threadIdx.x) % mmq_y;
2172
+
2173
+ if (need_check) {
2174
+ i = min(i, i_max);
2175
+ }
2176
+
2177
+ const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride;
2178
+
2179
+ x_dm[i] = bxi->dm;
2180
+ }
2181
+ constexpr int rows_per_warp = warp_size / 4;
2182
+ #pragma unroll
2183
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps*rows_per_warp) {
2184
+ int i = (i0 + threadIdx.y*rows_per_warp + threadIdx.x/(MMQ_TILE_NE_K/8)) % mmq_y;
2185
+
2186
+ if (need_check) {
2187
+ i = min(i, i_max);
2188
+ }
2189
+
2190
+ const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride + (threadIdx.x % (MMQ_TILE_NE_K/8)) / (QI4_K/8);
2191
+
2192
+ const int * scales = (const int *) bxi->scales;
2193
+
2194
+ const int ksc = threadIdx.x % (MMQ_TILE_NE_K/8);
2195
+ const int scales8 = unpack_scales_q45_K(scales, ksc);
2196
+
2197
+ x_sc[i*(MMQ_TILE_NE_K/8) + i/8 + ksc] = scales8;
2198
+ }
2199
+ #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2200
+ }
2201
+
2202
// vec_dot_q4_K_q8_1_dp4a: adds into `sum` the partial dot products between the Q4_K tile `x` and the
// q8_1 tile `y`, starting at k-offset k00 inside the x tile.
// x is carved into x_qs / x_dm / x_sc with the same sizes and row strides that the non-MMA branch of
// load_tiles_q4_K uses; in every y row the quantized ints start 4 ints in and the row is also read as half2.
// Each thread covers rows i = i0 + threadIdx.x and columns j = j0 + threadIdx.y.
+ template <int mmq_x, int mmq_y>
2203
+ static __device__ __forceinline__ void vec_dot_q4_K_q8_1_dp4a(
2204
+ const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
2205
+ constexpr int nwarps = mmq_get_nwarps_device();
2206
+ constexpr int warp_size = ggml_cuda_get_physical_warp_size();
2207
+
2208
+ constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_K, mmq_y);
2209
+ const int * x_qs = (const int *) x;
2210
+ const half2 * x_dm = (const half2 *) x_qs + txs.qs;
2211
+ const int * x_sc = (const int *) x_dm + txs.dm;
2212
+ const int * y_qs = (const int *) y + 4;
2213
+ const half2 * y_ds = (const half2 *) y;
2214
+
2215
+ // #pragma unroll
2216
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QR4_K*VDR_Q4_K_Q8_1_MMQ) {
2217
+ const int k0 = k00 + k01; // k00 offsets the x tile only; y is indexed with k01
2218
+
2219
+ #pragma unroll
2220
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
2221
+ const int j = j0 + threadIdx.y;
2222
+
2223
+ #pragma unroll
2224
+ for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
2225
+ const int i = i0 + threadIdx.x;
2226
+
2227
+ const uint8_t * sc = (const uint8_t *) &x_sc[i * (MMQ_TILE_NE_K/8) + i/8 + k0/32] + 2*(k01/16); // the second pointer passed below is 8 bytes further (sc+8)
2228
+
2229
+ sum[j0/nwarps*mmq_y/warp_size + i0/warp_size] += vec_dot_q4_K_q8_1_impl_mmq(
2230
+ &x_qs[i*(MMQ_TILE_NE_K + 1) + k0/2], &y_qs[j*MMQ_TILE_Y_K + k01], sc, sc+8,
2231
+ x_dm[i], &y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]);
2232
+ }
2233
+ }
2234
+ }
2235
+ }
2236
+
2237
// load_tiles_q5_K: unpacks one tile of Q5_K blocks from `x` into the tile buffer `x_tile`.
//   x      - raw block data, reinterpreted as an array of block_q5_K
//   x_tile - destination buffer, carved into x_qs / x_dm (plus x_sc on the non-MMA path)
//   kbx0   - block offset added to every row
//   i_max  - rows are clamped to this index when need_check is set
//   stride - distance between consecutive rows, counted in blocks
// Quants: every output byte is a 4-bit nibble from qs with one bit from qh placed at bit 4; the low-nibble
// and high-nibble results are stored QI5_K/4 ints apart.
// Scales: MMA builds unpack scales and mins with unpack_scales_q45_K and store dm*(scale, min) as half2,
// with the second component of dm negated; other builds store dm in x_dm and the unpacked scales in x_sc.
+ template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
2238
+ const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
2239
+ constexpr int nwarps = mmq_get_nwarps_device();
2240
+ constexpr int warp_size = ggml_cuda_get_physical_warp_size();
2241
+
2242
+ #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2243
+ int * x_qs = (int *) x_tile;
2244
+ half2 * x_dm = (half2 *) (x_qs + MMQ_TILE_NE_K*2);
2245
+ #else
2246
+ constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_K, mmq_y);
2247
+ int * x_qs = (int *) x_tile;
2248
+ half2 * x_dm = (half2 *) (x_qs + txs.qs);
2249
+ int * x_sc = (int *) (x_dm + txs.dm);
2250
+ #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2251
+
2252
+ constexpr int threads_per_row = MMQ_ITER_K / (4 * QR5_K);
2253
+ constexpr int nrows = warp_size / threads_per_row;
2254
+ const int txi = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x;
2255
+
2256
+ #pragma unroll
2257
+ for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) {
2258
+ int i = i0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row);
2259
+
2260
+ if (need_check) {
2261
+ i = min(i, i_max);
2262
+ }
2263
+
2264
+ const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride;
2265
+ const int ky = QR5_K*txi;
2266
+
2267
+ const int ql = get_int_b4(bxi->qs, txi);
2268
+ const int ql0 = (ql >> 0) & 0x0F0F0F0F; // low nibbles
2269
+ const int ql1 = (ql >> 4) & 0x0F0F0F0F; // high nibbles
2270
+
2271
+ const int qh = get_int_b4(bxi->qh, txi % (QI5_K/4));
2272
+ const int qh0 = ((qh >> (2 * (txi / (QI5_K/4)) + 0)) << 4) & 0x10101010; // one qh bit moved to bit 4 of each byte
2273
+ const int qh1 = ((qh >> (2 * (txi / (QI5_K/4)) + 1)) << 4) & 0x10101010;
2274
+
2275
+ const int kq0 = ky - ky % (QI5_K/2) + txi % (QI5_K/4) + 0;
2276
+ const int kq1 = ky - ky % (QI5_K/2) + txi % (QI5_K/4) + QI5_K/4;
2277
+
2278
+ #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2279
+ x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + kq0] = ql0 | qh0;
2280
+ x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + kq1] = ql1 | qh1;
2281
+ #else
2282
+ x_qs[i*(2*MMQ_TILE_NE_K + 1) + kq0] = ql0 | qh0;
2283
+ x_qs[i*(2*MMQ_TILE_NE_K + 1) + kq1] = ql1 | qh1;
2284
+ #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2285
+ }
2286
+
2287
+ #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2288
+ constexpr int rows_per_warp = warp_size / 2;
2289
+ #pragma unroll
2290
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps*rows_per_warp) {
2291
+ #if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2292
+ // Need if on AMD instead of % because warp_size == 64
2293
+ // This causes double work and throughput loss (MI300X)
2294
+ // H100 loses about 100 t/s with 'if' condition over '%'
2295
+ int i = i0 + threadIdx.y*rows_per_warp + threadIdx.x/2;
2296
+ if (i < mmq_y) {
2297
+ #else
2298
+ int i = (i0 + threadIdx.y*rows_per_warp + threadIdx.x/2) % mmq_y;
2299
+ {
2300
+ #endif // defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2301
+ if (need_check) {
2302
+ i = min(i, i_max);
2303
+ }
2304
+
2305
+ const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride;
2306
+
2307
+ const int * scales = (const int *) bxi->scales;
2308
+ const int ksc = threadIdx.x % 2;
2309
+
2310
+ const int sc32 = unpack_scales_q45_K(scales, ksc + 0);
2311
+ const int m32 = unpack_scales_q45_K(scales, ksc + 2);
2312
+
2313
+ const uint8_t * sc8 = (const uint8_t *) &sc32;
2314
+ const uint8_t * m8 = (const uint8_t *) &m32;
2315
+
2316
+ const half2 dm = bxi->dm * make_half2(1.0f, -1.0f);
2317
+
2318
+ #pragma unroll
2319
+ for (int l = 0; l < int(sizeof(int)); ++l) {
2320
+ x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + sizeof(int)*ksc + l] = dm*make_half2(sc8[l], m8[l]);
2321
+ }
2322
+ }
2323
+ }
2324
+ #else
2325
+ #pragma unroll
2326
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps*warp_size) {
2327
+ int i = (i0 + threadIdx.y*warp_size + threadIdx.x) % mmq_y;
2328
+
2329
+ if (need_check) {
2330
+ i = min(i, i_max);
2331
+ }
2332
+
2333
+ const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride;
2334
+
2335
+ x_dm[i] = bxi->dm;
2336
+ }
2337
+
2338
+ constexpr int rows_per_warp = warp_size / 4;
2339
+ #pragma unroll
2340
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps*rows_per_warp) {
2341
+ int i = (i0 + threadIdx.y*rows_per_warp + threadIdx.x/(MMQ_TILE_NE_K/8)) % mmq_y;
2342
+
2343
+ if (need_check) {
2344
+ i = min(i, i_max);
2345
+ }
2346
+
2347
+ const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride;
2348
+
2349
+ const int * scales = (const int *) bxi->scales;
2350
+
2351
+ const int ksc = threadIdx.x % (MMQ_TILE_NE_K/8);
2352
+ const int scales8 = unpack_scales_q45_K(scales, ksc);
2353
+
2354
+ x_sc[i*(MMQ_TILE_NE_K/8) + i/8 + ksc] = scales8;
2355
+ }
2356
+ #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2357
+ }
2358
+
2359
// vec_dot_q5_K_q8_1_dp4a: adds into `sum` the partial dot products between the Q5_K tile `x` and the
// q8_1 tile `y`, starting at k-offset k00 inside the x tile.
// x is carved into x_qs / x_dm / x_sc with the same sizes and row strides that the non-MMA branch of
// load_tiles_q5_K uses; in every y row the quantized ints start 4 ints in and the row is also read as half2.
// Each thread covers rows i = i0 + threadIdx.x and columns j = j0 + threadIdx.y.
+ template <int mmq_x, int mmq_y>
2360
+ static __device__ __forceinline__ void vec_dot_q5_K_q8_1_dp4a(
2361
+ const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
2362
+ constexpr int nwarps = mmq_get_nwarps_device();
2363
+ constexpr int warp_size = ggml_cuda_get_physical_warp_size();
2364
+
2365
+ constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_K, mmq_y);
2366
+ const int * x_qs = (const int *) x;
2367
+ const half2 * x_dm = (const half2 *) x_qs + txs.qs;
2368
+ const int * x_sc = (const int *) x_dm + txs.dm;
2369
+ const int * y_qs = (const int *) y + 4;
2370
+ const half2 * y_ds = (const half2 *) y;
2371
+
2372
+ // #pragma unroll
2373
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QR5_K*VDR_Q5_K_Q8_1_MMQ) {
2374
+ const int k0 = k00 + k01; // k00 offsets the x tile only; y is indexed with k01
2375
+
2376
+ #pragma unroll
2377
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
2378
+ const int j = j0 + threadIdx.y;
2379
+
2380
+ #pragma unroll
2381
+ for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
2382
+ const int i = i0 + threadIdx.x;
2383
+
2384
+ const uint8_t * sc = ((const uint8_t *) &x_sc[i * (MMQ_TILE_NE_K/8) + i/8 + k00/32]) + 2*(k01/16); // NOTE(review): indexes with k00/32 where the q4_K variant uses k0/32 - confirm the two are meant to agree
2385
+
2386
+ sum[j0/nwarps*mmq_y/warp_size + i0/warp_size] += vec_dot_q5_K_q8_1_impl_mmq(
2387
+ &x_qs[i*(QR5_K*MMQ_TILE_NE_K + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01], sc, sc+8,
2388
+ x_dm[i], &y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]);
2389
+ }
2390
+ }
2391
+ }
2392
+ }
2393
+
2394
// load_tiles_q6_K: unpacks one tile of Q6_K blocks from `x` into the tile buffer `x_tile`.
//   x      - raw block data, reinterpreted as an array of block_q6_K
//   x_tile - destination buffer, carved into x_qs / x_df / x_sc
//   kbx0   - block offset added to every row
//   i_max  - rows are clamped to this index when need_check is set
//   stride - distance between consecutive rows, counted in blocks
// Pass 1 writes the quants: every output byte is (4-bit nibble from ql | 2 bits from qh at bits 4-5) - 32.
// Pass 2 stores the per-block d in x_df; pass 3 copies the packed scales into x_sc.
// MMA and non-MMA builds differ only in the row stride and offsets used for each destination.
+ template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
2395
+ const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
2396
+ constexpr int nwarps = mmq_get_nwarps_device();
2397
+ constexpr int warp_size = ggml_cuda_get_physical_warp_size();
2398
+
2399
+ #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2400
+ int * x_qs = (int *) x_tile;
2401
+ float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2);
2402
+ int * x_sc = (int *) (x_df + MMQ_TILE_NE_K/QI6_K);
2403
+ #else
2404
+ constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q6_K, mmq_y);
2405
+ int * x_qs = (int *) x_tile;
2406
+ float * x_df = (float *) (x_qs + txs.qs);
2407
+ int * x_sc = (int *) (x_df + txs.dm);
2408
+ #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2409
+
2410
+ constexpr int threads_per_row = MMQ_ITER_K / (4 * QR6_K);
2411
+ constexpr int nrows = warp_size / threads_per_row;
2412
+ const int txi = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x;
2413
+
2414
+ #pragma unroll
2415
+ for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) {
2416
+ int i = i0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row);
2417
+
2418
+ if (need_check) {
2419
+ i = min(i, i_max);
2420
+ }
2421
+
2422
+ const block_q6_K * bxi = (const block_q6_K *) x + kbx0 + i*stride;
2423
+
2424
+ const int ql = get_int_b2(bxi->ql, txi);
2425
+ const int ql0 = (ql >> 0) & 0x0F0F0F0F; // low nibbles
2426
+ const int ql1 = (ql >> 4) & 0x0F0F0F0F; // high nibbles
2427
+
2428
+ const int qh = get_int_b2(bxi->qh, (QI6_K/4) * (txi / (QI6_K/2)) + txi % (QI6_K/4));
2429
+ const int qh0 = ((qh >> ((txi & 0x08) >> 2)) << 4) & 0x30303030; // two qh bits moved to bits 4-5 of each byte
2430
+ const int qh1 = (qh >> ((txi & 0x08) >> 2)) & 0x30303030;
2431
+
2432
+ const int kq0 = 2*txi - txi % (QI6_K/2) + 0;
2433
+ const int kq1 = 2*txi - txi % (QI6_K/2) + QI6_K/2;
2434
+
2435
+ #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2436
+ x_qs[i*MMQ_MMA_TILE_X_K_Q6_K + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
2437
+ x_qs[i*MMQ_MMA_TILE_X_K_Q6_K + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
2438
+ #else
2439
+ x_qs[i*(2*MMQ_TILE_NE_K + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
2440
+ x_qs[i*(2*MMQ_TILE_NE_K + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
2441
+ #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2442
+ }
2443
+
2444
+ #pragma unroll
2445
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps*warp_size) {
2446
+ int i = (i0 + threadIdx.y*warp_size + threadIdx.x) % mmq_y;
2447
+
2448
+ if (need_check) {
2449
+ i = min(i, i_max);
2450
+ }
2451
+
2452
+ const block_q6_K * bxi = (const block_q6_K *) x + kbx0 + i*stride;
2453
+
2454
+ #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2455
+ x_df[i*MMQ_MMA_TILE_X_K_Q6_K] = bxi->d;
2456
+ #else
2457
+ x_df[i*(MMQ_TILE_NE_K/QI6_K) + i/QI6_K] = bxi->d;
2458
+ #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2459
+ }
2460
+
2461
+ constexpr int rows_per_warp = warp_size / 4;
2462
+ #pragma unroll
2463
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps*rows_per_warp) {
2464
+ int i = (i0 + threadIdx.y*rows_per_warp + threadIdx.x/(MMQ_TILE_NE_K/8)) % mmq_y;
2465
+
2466
+ if (need_check) {
2467
+ i = min(i, i_max);
2468
+ }
2469
+
2470
+ const block_q6_K * bxi = (const block_q6_K *) x + kbx0 + i*stride + (threadIdx.x % (MMQ_TILE_NE_K/8)) / 4;
2471
+
2472
+ #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2473
+ x_sc[i*MMQ_MMA_TILE_X_K_Q6_K + threadIdx.x%4] = get_int_b2(bxi->scales, threadIdx.x % (MMQ_TILE_NE_K/8));
2474
+ #else
2475
+ x_sc[i*(MMQ_TILE_NE_K/8) + i/8 + threadIdx.x%(MMQ_TILE_NE_K/8)] = get_int_b2(bxi->scales, threadIdx.x%(QI6_K/8));
2476
+ #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2477
+ }
2478
+ }
2479
+
2480
// vec_dot_q6_K_q8_1_dp4a: adds into `sum` the partial dot products between the Q6_K tile `x` and the
// q8_1 tile `y`, starting at k-offset k00 inside the x tile.
// x is carved into x_qs / x_df / x_sc with the same sizes and row strides that the non-MMA branch of
// load_tiles_q6_K uses; in every y row the quantized ints start 4 ints in and the row is also read as floats.
// Each thread covers rows i = i0 + threadIdx.x and columns j = j0 + threadIdx.y.
+ template <int mmq_x, int mmq_y>
2481
+ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a(
2482
+ const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
2483
+ constexpr int nwarps = mmq_get_nwarps_device();
2484
+ constexpr int warp_size = ggml_cuda_get_physical_warp_size();
2485
+
2486
+ constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q6_K, mmq_y);
2487
+ const int * x_qs = (const int *) x;
2488
+ const float * x_df = (const float *) x_qs + txs.qs;
2489
+ const int * x_sc = (const int *) x_df + txs.dm;
2490
+ const int * y_qs = (const int *) y + 4;
2491
+ const float * y_df = (const float *) y;
2492
+
2493
+ // #pragma unroll
2494
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QR6_K*VDR_Q6_K_Q8_1_MMQ) {
2495
+ const int k0 = k00 + k01; // k00 offsets the x tile only; y is indexed with k01
2496
+
2497
+ #pragma unroll
2498
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
2499
+ const int j = j0 + threadIdx.y;
2500
+
2501
+ #pragma unroll
2502
+ for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
2503
+ const int i = i0 + threadIdx.x;
2504
+
2505
+ const int8_t * sc = ((const int8_t *) &x_sc[i * (MMQ_TILE_NE_K/8) + i/8 + k0/16]); // signed byte view of one packed scale word
2506
+
2507
+ sum[j0/nwarps*mmq_y/warp_size + i0/warp_size] += vec_dot_q6_K_q8_1_impl_mmq(
2508
+ &x_qs[i*(QR6_K*MMQ_TILE_NE_K + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01], sc,
2509
+ x_df[i*(MMQ_TILE_NE_K/QI6_K) + i/QI6_K], &y_df[j*MMQ_TILE_Y_K + k01/QI8_1]);
2510
+ }
2511
+ }
2512
+ }
2513
+ }
2514
+
2515
// vec_dot_q6_K_q8_1_mma: adds into `sum` the products of the Q6_K tile `x` and the q8_1 tile `y`
// using the tile-based mma() helper, starting at k-offset k00 inside the x tile.
// x is read as x_qs, then x_df at MMQ_TILE_NE_K*2 ints, then x_sc (same carving as the MMA branch of
// load_tiles_q6_K); rows use stride MMQ_MMA_TILE_X_K_Q6_K.
// AMD (MFMA/WMMA) branch: for every 4-int k step, loads the A tiles and one B tile, multiplies them into
// an int tile C and scales each element by its int8 scale, the row's x_df value and the column's y_df value.
// Turing branch: preloads all A tiles, scales (scA) and row factors (dA), then for each column group
// accumulates scale-weighted C values in tmp and multiplies by dA once at the end.
// Any other build compiles to NO_DEVICE_CODE.
+ template <int mmq_x, int mmq_y>
2516
+ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
2517
+ const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
2518
+ #if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2519
+ constexpr data_layout input_layout = get_input_data_layout();
2520
+ typedef tile<16, 4, int, input_layout> tile_A;
2521
+ typedef tile<16, 4, int, input_layout> tile_B;
2522
+ typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
2523
+
2524
+ constexpr int granularity = mmq_get_granularity_device(mmq_x);
2525
+ constexpr int rows_per_warp = granularity;
2526
+ constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.
2527
+
2528
+ y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
2529
+
2530
+ const int * x_qs = (const int *) x;
2531
+ const float * x_df = (const float *) x_qs + MMQ_TILE_NE_K*2;
2532
+ const int * x_sc = (const int *) x_df + MMQ_TILE_NE_K/QI6_K;
2533
+ const int * y_qs = (const int *) y + 4;
2534
+ const float * y_df = (const float *) y;
2535
+
2536
+ const int i0 = (threadIdx.y / ntx) * rows_per_warp;
2537
+
2538
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) {
2539
+ const int k0 = k00 + k01;
2540
+
2541
+ tile_A A[ntx];
2542
+ #pragma unroll
2543
+ for (int n = 0; n < ntx; ++n) {
2544
+ load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q6_K + k0, MMQ_MMA_TILE_X_K_Q6_K);
2545
+ }
2546
+
2547
+ #pragma unroll
2548
+ for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
2549
+ tile_B B;
2550
+ load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
2551
+
2552
+ const int j = j0 + tile_C::get_j(0);
2553
+ const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
2554
+
2555
+ #pragma unroll
2556
+ for (int n = 0; n < ntx; ++n) {
2557
+ tile_C C;
2558
+ mma(C, A[n], B);
2559
+
2560
+ #pragma unroll
2561
+ for (int l = 0; l < tile_C::ne; ++l) {
2562
+ const int i = i0 + n*tile_C::I + tile_C::get_i(l);
2563
+ const int8_t * sc = (const int8_t *) (x_sc + i*MMQ_MMA_TILE_X_K_Q6_K + k00/16);
2564
+ sum[(j0/tile_C::J + n)*tile_C::ne + l] += C.x[l] * sc[k01/4] * x_df[i*MMQ_MMA_TILE_X_K_Q6_K] * dB;
2565
+ }
2566
+ }
2567
+ }
2568
+ }
2569
+ #elif defined(TURING_MMA_AVAILABLE)
2570
+
2571
+ typedef tile<16, 4, int> tile_A;
2572
+ typedef tile< 8, 4, int> tile_B;
2573
+ typedef tile<16, 8, int> tile_C;
2574
+
2575
+ constexpr int granularity = mmq_get_granularity_device(mmq_x);
2576
+ constexpr int rows_per_warp = 2 * granularity;
2577
+ constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.
2578
+
2579
+ y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
2580
+
2581
+ const int * x_qs = (const int *) x;
2582
+ const float * x_df = (const float *) x_qs + MMQ_TILE_NE_K*2;
2583
+ const int * x_sc = (const int *) x_df + MMQ_TILE_NE_K/QI6_K;
2584
+ const int * y_qs = (const int *) y + 4;
2585
+ const float * y_df = (const float *) y;
2586
+
2587
+ const int i0 = (threadIdx.y / ntx) * (ntx*tile_A::I);
2588
+
2589
+ tile_A A[ntx][8];
2590
+ int scA[ntx][tile_C::ne/2][8];
2591
+ float dA[ntx][tile_C::ne/2];
2592
+
2593
+ #pragma unroll
2594
+ for (int n = 0; n < ntx; ++n) {
2595
+ #pragma unroll
2596
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 8) {
2597
+ const int k0 = k00 + k01;
2598
+
2599
+ load_ldmatrix(A[n][k01/4 + 0], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q6_K + (k0 + 0), MMQ_MMA_TILE_X_K_Q6_K);
2600
+ load_ldmatrix(A[n][k01/4 + 1], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q6_K + (k0 + tile_A::J), MMQ_MMA_TILE_X_K_Q6_K);
2601
+ }
2602
+
2603
+ #pragma unroll
2604
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 16) {
2605
+ const int k0 = k00 + k01;
2606
+
2607
+ #pragma unroll
2608
+ for (int l = 0; l < tile_C::ne/2; ++l) {
2609
+ const int i = i0 + n*tile_C::I + tile_C::get_i(2*l);
2610
+
2611
+ const int sc_packed = x_sc[i*MMQ_MMA_TILE_X_K_Q6_K + k0/16];
2612
+ const int8_t * sc = (const int8_t *) &sc_packed;
2613
+
2614
+ #pragma unroll
2615
+ for (int ksc = 0; ksc < int(sizeof(int)); ++ksc) { // signed bound avoids a signed/unsigned comparison
2616
+ scA[n][l][k01/4 + ksc] = sc[ksc];
2617
+ }
2618
+ }
2619
+ }
2620
+
2621
+ #pragma unroll
2622
+ for (int l = 0; l < tile_C::ne/2; ++l) {
2623
+ const int i = i0 + n*tile_C::I + tile_C::get_i(2*l);
2624
+
2625
+ dA[n][l] = x_df[i*MMQ_MMA_TILE_X_K_Q6_K];
2626
+ }
2627
+ }
2628
+
2629
+ #pragma unroll
2630
+ for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
2631
+ float tmp[ntx][tile_C::ne] = {{0.0f}};
2632
+
2633
+ #pragma unroll
2634
+ for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 8) {
2635
+ tile_B B[2];
2636
+ float dB[tile_C::ne/2];
2637
+
2638
+ // Here load_generic is faster than load_ldmatrix.
2639
+ load_generic(B[0], y_qs + j0*MMQ_TILE_Y_K + 0 + k01, MMQ_TILE_Y_K);
2640
+ load_generic(B[1], y_qs + j0*MMQ_TILE_Y_K + tile_B::J + k01, MMQ_TILE_Y_K);
2641
+
2642
+ #pragma unroll
2643
+ for (int l = 0; l < tile_C::ne/2; ++l) {
2644
+ const int j = j0 + tile_C::get_j(l);
2645
+
2646
+ dB[l] = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
2647
+ }
2648
+
2649
+ #pragma unroll
2650
+ for (int n = 0; n < ntx; ++n) {
2651
+ tile_C C[2];
2652
+ mma(C[0], A[n][k01/4 + 0], B[0]);
2653
+ mma(C[1], A[n][k01/4 + 1], B[1]);
2654
+
2655
+ #pragma unroll
2656
+ for (int l = 0; l < tile_C::ne; ++l) {
2657
+ tmp[n][l] += (C[0].x[l]*scA[n][l/2][k01/4 + 0] + C[1].x[l]*scA[n][l/2][k01/4 + 1])*dB[l%2];
2658
+ }
2659
+ }
2660
+ }
2661
+
2662
+ #pragma unroll
2663
+ for (int n = 0; n < ntx; ++n) {
2664
+ #pragma unroll
2665
+ for (int l = 0; l < tile_C::ne; ++l) {
2666
+ sum[(j0/tile_C::J + n)*tile_C::ne + l] += tmp[n][l]*dA[n][l/2];
2667
+ }
2668
+ }
2669
+ }
2670
+ #else
2671
+ GGML_UNUSED_VARS(x, y, sum, k00);
2672
+ NO_DEVICE_CODE;
2673
+ #endif // AMD_MFMA_AVAILABLE || AMD_WMMA_AVAILABLE
2674
+ }
2675
+
2676
// load_tiles_iq4_nl: unpacks one tile of IQ4_NL blocks from `x` into the tile buffer `x_tile`.
//   x      - raw block data, reinterpreted as an array of block_iq4_nl
//   x_tile - destination buffer, carved into x_qs / x_df
//   kbx0   - block offset added to every row
//   i_max  - rows are clamped to this index when need_check is set
//   stride - distance between consecutive rows, counted in blocks
// Pass 1 passes each packed int from qs through get_int_from_table_16 with the kvalues_iq4nl table and
// stores the two resulting ints QI4_NL apart in x_qs.
// Pass 2 converts the per-block d from half to float and stores it in x_df.
+ template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq4_nl(
2677
+ const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
2678
+ constexpr int nwarps = mmq_get_nwarps_device();
2679
+ constexpr int warp_size = ggml_cuda_get_physical_warp_size();
2680
+
2681
+ #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2682
+ int * x_qs = (int *) x_tile;
2683
+ float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2);
2684
+ #else
2685
+ constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ4_NL, mmq_y);
2686
+ int * x_qs = (int *) x_tile;
2687
+ float * x_df = (float *) (x_qs + txs.qs);
2688
+ #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2689
+
2690
+ constexpr int threads_per_row = MMQ_ITER_K / (4 * QR4_NL);
2691
+ constexpr int nrows = warp_size / threads_per_row;
2692
+ const int txi = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x;
2693
+ const int kbx = txi / QI4_NL; // block index within the row handled by this thread
2694
+ const int kqsx = txi % QI4_NL; // int index within that block's qs
2695
+
2696
+ #pragma unroll
2697
+ for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) {
2698
+ int i = i0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row);
2699
+
2700
+ if (need_check) {
2701
+ i = min(i, i_max);
2702
+ }
2703
+
2704
+ const block_iq4_nl * bxi = (const block_iq4_nl *) x + kbx0 + i*stride + kbx;
2705
+
2706
+ const int aux_q4 = get_int_b2(bxi->qs, kqsx);
2707
+ const int2 v = get_int_from_table_16(aux_q4, kvalues_iq4nl);
2708
+ const int k0 = kbx * (2 * QI4_NL) + kqsx;
2709
+
2710
+ #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2711
+ x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + 0] = v.x;
2712
+ x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + QI4_NL] = v.y;
2713
+ #else
2714
+ x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0 + 0] = v.x;
2715
+ x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0 + QI4_NL] = v.y;
2716
+ #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2717
+ }
2718
+
2719
+ constexpr int blocks_per_tile_x_row = MMQ_TILE_NE_K / QI4_NL;
2720
+ constexpr int rows_per_warp = warp_size / blocks_per_tile_x_row;
2721
+ const int kbxd = threadIdx.x % blocks_per_tile_x_row;
2722
+
2723
+ #pragma unroll
2724
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * rows_per_warp) {
2725
+ int i = i0 + threadIdx.y * rows_per_warp + threadIdx.x / blocks_per_tile_x_row;
2726
+
2727
+ if (need_check) {
2728
+ i = min(i, i_max);
2729
+ }
2730
+
2731
+ const block_iq4_nl * bxi = (const block_iq4_nl *) x + kbx0 + i*stride + kbxd;
2732
+
2733
+ #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2734
+ x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kbxd] = __half2float(bxi->d);
2735
+ #else
2736
+ x_df[i*(MMQ_TILE_NE_K/QI4_NL) + i/QI4_NL + kbxd] = __half2float(bxi->d);
2737
+ #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
2738
+ }
2739
+ }
2740
+
2741
// Unpacks IQ2_XXS blocks from x into the tile at x_tile: sign-applied grid values go to the
// int region (x_qs), one float scale per thread goes to the region that follows it (x_df).
//   x       raw block data, reinterpreted as block_iq2_xxs
//   x_tile  destination tile; layout depends on whether an MMA path is compiled in
//   kbx0    block offset added to every row
//   i_max   last valid row; rows are clamped to it when need_check is set
//   stride  distance between consecutive rows of x, in blocks
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_xxs(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
    constexpr int nwarps    = mmq_get_nwarps_device();
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();

    int * x_qs = (int *) x_tile;
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2);
#else
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ2_XXS, mmq_y);
    float * x_df = (float *) (x_qs + txs.qs);
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)

    constexpr int threads_per_row = (MMQ_ITER_K / (4 * QR2_XXS)) / 2;
    constexpr int nrows           = warp_size / threads_per_row;
    const int     col             = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x;

#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps * nrows) {
        int row = row0 + threadIdx.y*nrows + threadIdx.x/threads_per_row;

        if (need_check) {
            // Out-of-range rows re-load the last valid row instead of reading past the data.
            row = min(row, i_max);
        }

        // Destination slots of this thread inside the row; the row pitch differs per code path.
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
        int   * qs_out = x_qs + row*MMQ_MMA_TILE_X_K_Q8_0 + 8*col;
        float * df_out = x_df + row*MMQ_MMA_TILE_X_K_Q8_0 + col;
#else
        int   * qs_out = x_qs + row*(2*MMQ_TILE_NE_K + 1) + 8*col;
        float * df_out = x_df + row*(MMQ_TILE_NE_K/4) + row/4 + col;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)

        const block_iq2_xxs * blk = (const block_iq2_xxs *) x + kbx0 + row*stride;

        // First word: four byte-sized grid indices. Second word: packed signs plus the scale bits.
        const int       idx_packed = get_int_b2(blk->qs, 2*col+0);
        const uint8_t * grid_idx   = (const uint8_t *) &idx_packed;
        const uint32_t  aux32      = get_int_b2(blk->qs, 2*col+1);

#pragma unroll
        for (int l = 0; l < QR2_XXS; ++l) {
            const uint2    grid  = ((const uint2*)iq2xxs_grid)[grid_idx[l]];
            const uint32_t signs = unpack_ksigns(aux32 >> (7 * l));

            // Per-byte 0xFF masks where the sign bit is set; (v ^ m) - m negates exactly those bytes.
            const int neg_lo = __vcmpne4(signs & 0x08040201, 0);
            const int neg_hi = __vcmpne4(signs & 0x80402010, 0);

            qs_out[2*l + 0] = __vsub4(grid.x ^ neg_lo, neg_lo);
            qs_out[2*l + 1] = __vsub4(grid.y ^ neg_hi, neg_hi);
        }

        const int   ls = (aux32 >> 27) | 1; // (scale * 2 + 1)
        const float d  = blk->d;
        df_out[0] = d * ls / 8; // (d * scale + d / 2) / 4
    }
}
2802
+
2803
// Unpacks IQ2_XS blocks from x into the tile at x_tile: sign-applied grid values go to the
// int region (x_qs), two float scales per thread go to the region that follows it (x_df).
//   x       raw block data, reinterpreted as block_iq2_xs
//   x_tile  destination tile; layout depends on whether an MMA path is compiled in
//   kbx0    block offset added to every row
//   i_max   last valid row; rows are clamped to it when need_check is set
//   stride  distance between consecutive rows of x, in blocks
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_xs(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
    constexpr int nwarps    = mmq_get_nwarps_device();
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();

    int * x_qs = (int *) x_tile;
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2);
#else
    constexpr tile_x_sizes txs = MMQ_DP4A_TXS_Q8_0_16;
    float * x_df = (float *) (x_qs + txs.qs);
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)

    constexpr int threads_per_row = (MMQ_ITER_K / (4 * QR2_XS)) / 2;
    constexpr int nrows           = warp_size / threads_per_row;
    const int     col             = threadIdx.x % threads_per_row;

#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps * nrows) {
        int row = row0 + threadIdx.y*nrows + threadIdx.x/threads_per_row;

        if (need_check) {
            // Out-of-range rows re-load the last valid row instead of reading past the data.
            row = min(row, i_max);
        }

        // Destination slots of this thread inside the row; the row pitch differs per code path.
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
        int   * qs_out = x_qs + row*MMQ_MMA_TILE_X_K_Q3_K + 8*col;
        float * df_out = x_df + row*MMQ_MMA_TILE_X_K_Q3_K + 2*col;
#else
        int   * qs_out = x_qs + row*(2*MMQ_TILE_NE_K + 1) + 8*col;
        float * df_out = x_df + row*(2*MMQ_TILE_NE_K*2/QI8_0) + row/(QI8_0/4) + 2*col;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)

        const block_iq2_xs * blk = (const block_iq2_xs *) x + kbx0 + row*stride;

        // Two 32-bit words viewed as 16-bit entries: low 9 bits index the grid, the rest encode signs.
        const int2       packed = make_int2(get_int_b2(blk->qs, 2*col+0), get_int_b2(blk->qs, 2*col+1));
        const uint16_t * entry  = (const uint16_t *) &packed;

#pragma unroll
        for (int l = 0; l < QR2_XS; ++l) {
            const uint2    grid  = ((const uint2*)iq2xs_grid)[entry[l] & 0x1FF];
            const uint32_t signs = unpack_ksigns(entry[l] >> 9);

            // Per-byte 0xFF masks where the sign bit is set; (v ^ m) - m negates exactly those bytes.
            const int neg_lo = __vcmpne4(signs & 0x08040201, 0);
            const int neg_hi = __vcmpne4(signs & 0x80402010, 0);

            qs_out[2*l + 0] = __vsub4(grid.x ^ neg_lo, neg_lo);
            qs_out[2*l + 1] = __vsub4(grid.y ^ neg_hi, neg_hi);
        }

        // One scale byte carries two 4-bit scales (low nibble first).
        const int   ls = blk->scales[col];
        const float d  = blk->d;
        df_out[0] = ((ls & 0x0F)*d + d/2)/4;
        df_out[1] = ((ls >>    4)*d + d/2)/4;
    }
}
2865
+
2866
// Unpacks IQ2_S blocks from x into the tile at x_tile: sign-applied grid values go to the
// int region (x_qs), two float scales per thread go to the region that follows it (x_df).
//   x       raw block data, reinterpreted as block_iq2_s
//   x_tile  destination tile; layout depends on whether an MMA path is compiled in
//   kbx0    block offset added to every row
//   i_max   last valid row; rows are clamped to it when need_check is set
//   stride  distance between consecutive rows of x, in blocks
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_s(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
    constexpr int nwarps    = mmq_get_nwarps_device();
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();

    int * x_qs = (int *) x_tile;
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2);
#else
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ2_S, mmq_y);
    float * x_df = (float *) (x_qs + txs.qs);
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)

    constexpr int threads_per_row = (MMQ_ITER_K / (4 * QR2_S)) / 2;
    constexpr int nrows           = warp_size / threads_per_row;
    const int     col             = threadIdx.x % threads_per_row;

#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps * nrows) {
        int row = row0 + threadIdx.y*nrows + threadIdx.x/threads_per_row;

        if (need_check) {
            // Out-of-range rows re-load the last valid row instead of reading past the data.
            row = min(row, i_max);
        }

        // Destination slots of this thread inside the row; the row pitch differs per code path.
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
        int   * qs_out = x_qs + row*MMQ_MMA_TILE_X_K_Q3_K + 8*col;
        float * df_out = x_df + row*MMQ_MMA_TILE_X_K_Q3_K + 2*col;
#else
        int   * qs_out = x_qs + row*(2*MMQ_TILE_NE_K + 1) + 8*col;
        float * df_out = x_df + row*(2*MMQ_TILE_NE_K*2/QI8_0) + row/(QI8_0/4) + 2*col;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)

        const block_iq2_s * blk = (const block_iq2_s *) x + kbx0 + row*stride;

        // Low 8 bits of each grid index, one byte per entry.
        const int       idx_packed = get_int_b2(blk->qs, col);
        const uint8_t * idx_lo     = (const uint8_t *) &idx_packed;

        // Extra index bits for the entries of this thread.
        const int qh = blk->qh[col];

        // Sign bytes are read from the same qs array, QK_K/32 words further on.
        const int       sign_packed = get_int_b2(blk->qs, QK_K/32 + col);
        const uint8_t * sign_bytes  = (const uint8_t *) &sign_packed;

#pragma unroll
        for (int l = 0; l < QR2_S; ++l) {
            // Bits 8..9 of the grid index come from qh, two bits per entry.
            const int * grid = (const int *)(iq2s_grid + (idx_lo[l] | ((qh << (8-2*l)) & 0x300)));

            // Spread the sign bits over the four bytes, then turn each non-zero byte into 0xFF.
            const int neg_lo = __vcmpne4(((sign_bytes[l] & 0x03) << 7) | ((sign_bytes[l] & 0x0C) << 21), 0x00000000);
            const int neg_hi = __vcmpne4(((sign_bytes[l] & 0x30) << 3) | ((sign_bytes[l] & 0xC0) << 17), 0x00000000);

            // (v ^ m) - m negates the bytes selected by the mask.
            qs_out[2*l + 0] = __vsub4(grid[0] ^ neg_lo, neg_lo);
            qs_out[2*l + 1] = __vsub4(grid[1] ^ neg_hi, neg_hi);
        }

        // One scale byte carries two 4-bit scales (low nibble first).
        const int   ls = blk->scales[col];
        const float d  = blk->d;
        df_out[0] = ((ls & 0x0F)*d + d/2)/4;
        df_out[1] = ((ls >>    4)*d + d/2)/4;
    }
}
2931
+
2932
// Unpacks IQ3_XXS blocks from x into the tile at x_tile: sign-applied grid values go to the
// int region (x_qs), one float scale per thread goes to the region that follows it (x_df).
//   x       raw block data, reinterpreted as block_iq3_xxs
//   x_tile  destination tile; layout depends on whether an MMA path is compiled in
//   kbx0    block offset added to every row
//   i_max   last valid row; rows are clamped to it when need_check is set
//   stride  distance between consecutive rows of x, in blocks
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq3_xxs(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
    constexpr int nwarps    = mmq_get_nwarps_device();
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();

    int * x_qs = (int *) x_tile;
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2);
#else
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ3_XXS, mmq_y);
    float * x_df = (float *) (x_qs + txs.qs);
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)

    constexpr int threads_per_row = (MMQ_ITER_K / (4 * QR3_XXS)) / 2;
    constexpr int nrows           = warp_size / threads_per_row;
    const int     col             = threadIdx.x % threads_per_row;

#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps * nrows) {
        int row = row0 + threadIdx.y*nrows + threadIdx.x/threads_per_row;

        if (need_check) {
            // Out-of-range rows re-load the last valid row instead of reading past the data.
            row = min(row, i_max);
        }

        // Destination slots of this thread inside the row; the row pitch differs per code path.
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
        int   * qs_out = x_qs + row*MMQ_MMA_TILE_X_K_Q8_0 + 8*col;
        float * df_out = x_df + row*MMQ_MMA_TILE_X_K_Q8_0 + col;
#else
        int   * qs_out = x_qs + row*(2*MMQ_TILE_NE_K + 1) + 8*col;
        float * df_out = x_df + row*(MMQ_TILE_NE_K/4) + row/4 + col;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)

        const block_iq3_xxs * blk = (const block_iq3_xxs *) x + kbx0 + row*stride;

        // Eight byte-sized grid indices, followed (QK_K/16 words later) by the packed signs and scale.
        const int2      idx_packed = make_int2(get_int_b2(blk->qs, 2*col+0), get_int_b2(blk->qs, 2*col+1));
        const uint8_t * grid_idx   = (const uint8_t *) &idx_packed;
        const uint32_t  aux32      = get_int_b2(blk->qs, QK_K/16 + col);

#pragma unroll
        for (int l = 0; l < QR3_XXS; ++l) {
            const int2     grid  = make_int2(iq3xxs_grid[grid_idx[2*l+0]], iq3xxs_grid[grid_idx[2*l+1]]);
            const uint32_t signs = unpack_ksigns(aux32 >> (7*l));

            // Per-byte 0xFF masks where the sign bit is set; (v ^ m) - m negates exactly those bytes.
            const int neg_lo = __vcmpne4(signs & 0x08040201, 0);
            const int neg_hi = __vcmpne4(signs & 0x80402010, 0);

            qs_out[2*l + 0] = __vsub4(grid.x ^ neg_lo, neg_lo);
            qs_out[2*l + 1] = __vsub4(grid.y ^ neg_hi, neg_hi);
        }

        // The top four bits of aux32 hold the scale.
        const int   ls = aux32 >> 28;
        const float d  = blk->d;
        df_out[0] = (ls*d + d/2)/2;
    }
}
2993
+
2994
// Unpacks IQ3_S blocks from x into the tile at x_tile: sign-applied grid values go to the
// int region (x_qs), one float scale per thread goes to the region that follows it (x_df).
//   x       raw block data, reinterpreted as block_iq3_s
//   x_tile  destination tile; layout depends on whether an MMA path is compiled in
//   kbx0    block offset added to every row
//   i_max   last valid row; rows are clamped to it when need_check is set
//   stride  distance between consecutive rows of x, in blocks
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq3_s(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
    constexpr int nwarps    = mmq_get_nwarps_device();
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();

    int * x_qs = (int *) x_tile;
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2);
#else
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ3_S, mmq_y);
    float * x_df = (float *) (x_qs + txs.qs);
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)

    constexpr int threads_per_row = (MMQ_ITER_K / (4 * QR3_S)) / 2;
    constexpr int nrows           = warp_size / threads_per_row;
    const int     col             = threadIdx.x % threads_per_row;

#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps * nrows) {
        int row = row0 + threadIdx.y*nrows + threadIdx.x/threads_per_row;

        if (need_check) {
            // Out-of-range rows re-load the last valid row instead of reading past the data.
            row = min(row, i_max);
        }

        // Destination slots of this thread inside the row; the row pitch differs per code path.
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
        int   * qs_out = x_qs + row*MMQ_MMA_TILE_X_K_Q8_0 + 8*col;
        float * df_out = x_df + row*MMQ_MMA_TILE_X_K_Q8_0 + col;
#else
        int   * qs_out = x_qs + row*(2*MMQ_TILE_NE_K + 1) + 8*col;
        float * df_out = x_df + row*(MMQ_TILE_NE_K/4) + row/4 + col;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)

        const block_iq3_s * blk = (const block_iq3_s *) x + kbx0 + row*stride;

        // Low 8 bits of eight grid indices.
        const int2      idx_packed = make_int2(get_int_b2(blk->qs, 2*col+0), get_int_b2(blk->qs, 2*col+1));
        const uint8_t * idx_lo     = (const uint8_t *) &idx_packed;

        // One extra (ninth) index bit per entry.
        const int qh = blk->qh[col];

        const int       sign_packed = get_int_b2(blk->signs, col);
        const uint8_t * sign_bytes  = (const uint8_t *) &sign_packed;

#pragma unroll
        for (int l = 0; l < QR3_S; ++l) {
            // Consecutive qh bits supply bit 8 of the even and odd index respectively.
            const int2 grid = make_int2(
                iq3s_grid[idx_lo[2*l+0] | ((qh << (8 - 2*l)) & 0x100)],
                iq3s_grid[idx_lo[2*l+1] | ((qh << (7 - 2*l)) & 0x100)]);

            // Spread the sign bits over the four bytes, then turn each non-zero byte into 0xFF.
            const int neg_lo = __vcmpne4(((sign_bytes[l] & 0x03) << 7) | ((sign_bytes[l] & 0x0C) << 21), 0x00000000);
            const int neg_hi = __vcmpne4(((sign_bytes[l] & 0x30) << 3) | ((sign_bytes[l] & 0xC0) << 17), 0x00000000);

            // (v ^ m) - m negates the bytes selected by the mask.
            qs_out[2*l+0] = __vsub4(grid.x ^ neg_lo, neg_lo);
            qs_out[2*l+1] = __vsub4(grid.y ^ neg_hi, neg_hi);
        }

        // Two 4-bit scales share a byte; the shift (0 or 4) picks the nibble for this thread.
        const int   ls = 1 + 2*((blk->scales[col/2] >> (((2*col) << 1) & 0x04)) & 0x0F);
        const float d  = blk->d;
        df_out[0] = ls*d;
    }
}
3060
+
3061
// Unpacks IQ1_S blocks from x into the tile at x_tile: nibble-split grid values go to the
// int region (x_qs), one half2 (scale, scale*delta) per thread goes to the region after it (x_ds).
//   x       raw block data, reinterpreted as block_iq1_s
//   x_tile  destination tile; layout depends on whether an MMA path is compiled in
//   kbx0    block offset added to every row
//   i_max   last valid row; rows are clamped to it when need_check is set
//   stride  distance between consecutive rows of x, in blocks
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq1_s(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
    constexpr int nwarps    = mmq_get_nwarps_device();
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();

#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    int   * x_qs = (int   *)  x_tile;
    half2 * x_ds = (half2 *) (x_qs + MMQ_TILE_NE_K*2);
#else
    // Look up the tile layout for the type this loader actually handles (IQ1_S), in line with
    // the sibling loaders, rather than borrowing the entry of another type.
    // NOTE(review): assumes mmq_get_dp4a_tile_x_sizes has an IQ1_S entry -- confirm.
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ1_S, mmq_y);
    int   * x_qs = (int   *)  x_tile;
    half2 * x_ds = (half2 *) (x_qs + txs.qs);
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)

    constexpr int threads_per_row = MMQ_ITER_K / (4 * QR1_S);
    constexpr int nrows = warp_size / threads_per_row;
    const int kqsx = threadIdx.x % threads_per_row;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * nrows) {
        int i = i0 + threadIdx.y*nrows + threadIdx.x/threads_per_row;

        if (need_check) {
            // Out-of-range rows re-load the last valid row instead of reading past the data.
            i = min(i, i_max);
        }

        const block_iq1_s * bxi = (const block_iq1_s *) x + kbx0 + i*stride;

        // Low 8 bits of four grid indices.
        const int       qs_packed = get_int_b2(bxi->qs, kqsx);
        const uint8_t * qs        = (const uint8_t *) &qs_packed;

        // Three extra index bits per entry, plus scale and delta-sign bits in the upper part.
        const int qh = bxi->qh[kqsx];

#pragma unroll
        for (int l = 0; l < QR1_S/2; ++l) {
            const int grid = iq1s_grid_gpu[qs[l] | (((qh >> (3*l)) & 0x07) << 8)];

            // Each grid byte holds two 4-bit values; split them into separate ints.
            const int grid0 = (grid >> 0) & 0x0F0F0F0F;
            const int grid1 = (grid >> 4) & 0x0F0F0F0F;

#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
            x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 8*kqsx + (2*l+0)] = grid0;
            x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 8*kqsx + (2*l+1)] = grid1;
#else
            x_qs[i*(2*MMQ_TILE_NE_K + 1) + 8*kqsx + (2*l+0)] = grid0;
            x_qs[i*(2*MMQ_TILE_NE_K + 1) + 8*kqsx + (2*l+1)] = grid1;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
        }

        // Bits 12..14 of qh give the scale as an odd multiplier (2*s + 1) of the block scale d.
        const float d1q   = __half2float(bxi->d) * (((qh >> 11) & 0x0E) + 1);
        // Bit 15 of qh selects the sign of the IQ1S_DELTA offset around -1.
        const float delta = -1.0f + IQ1S_DELTA - (qh & 0x8000) * (2.0f*IQ1S_DELTA/0x8000);

#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
        x_ds[i*MMQ_MMA_TILE_X_K_Q8_1 + kqsx] = make_half2(d1q, d1q*delta);
#else
        x_ds[i*(MMQ_TILE_NE_K/4) + i/4 + kqsx] = make_half2(d1q, d1q*delta);
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    }
}
3120
+
3121
// Unpacks IQ4_XS blocks from x into the tile at x_tile in two passes: the first writes the
// table-decoded 4-bit values to the int region (x_qs), the second writes per-sub-block float
// scales to the region that follows it (x_df).
//   x       raw block data, reinterpreted as block_iq4_xs
//   x_tile  destination tile; layout depends on whether an MMA path is compiled in
//   kbx0    block offset added to every row
//   i_max   last valid row; rows are clamped to it when need_check is set
//   stride  distance between consecutive rows of x, in blocks
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq4_xs(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
    constexpr int nwarps    = mmq_get_nwarps_device();
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();

    int * x_qs = (int *) x_tile;
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2);
#else
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ4_XS, mmq_y);
    float * x_df = (float *) (x_qs + txs.qs);
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)

    // ---- Pass 1: quantized values ----
    constexpr int threads_per_row = MMQ_ITER_K / (4 * QR4_XS);
    constexpr int nrows           = warp_size / threads_per_row;
    const int     col             = threadIdx.x % threads_per_row;

#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nrows*nwarps) {
        int row = row0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row);

        if (need_check) {
            // Out-of-range rows re-load the last valid row instead of reading past the data.
            row = min(row, i_max);
        }

        const block_iq4_xs * blk = (const block_iq4_xs *) x + kbx0 + row*stride;

        // One packed word of 4-bit indices is expanded to two words of table values.
        const int  packed  = get_int_b4(blk->qs, col);
        const int2 decoded = get_int_from_table_16(packed, kvalues_iq4nl);
        // Groups of four threads write to slots k0..k0+3 and k0+4..k0+7 of an 8-int group.
        const int  k0      = 8 * (col / 4) + col % 4;

#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
        int * qs_out = x_qs + row*MMQ_MMA_TILE_X_K_Q8_0 + k0;
#else
        int * qs_out = x_qs + row*(2*MMQ_TILE_NE_K + 1) + k0;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
        qs_out[0] = decoded.x;
        qs_out[4] = decoded.y;
    }

    // ---- Pass 2: scales, eight per row ----
    constexpr int rows_per_warp = warp_size / 8;
    const int     sub           = threadIdx.x % 8;

#pragma unroll
    for (int row0 = 0; row0 < mmq_y; row0 += nwarps * rows_per_warp) {
        int row = row0 + threadIdx.y * rows_per_warp + threadIdx.x / (MMQ_TILE_NE_K/4);

        if (need_check) {
            row = min(row, i_max);
        }

        const block_iq4_xs * blk = (const block_iq4_xs *) x + kbx0 + row*stride;

        const float d = __half2float(blk->d);

        // 6-bit scale: low 4 bits from a nibble of scales_l, high 2 bits from scales_h.
        const int ls_lo = (blk->scales_l[sub/2] >> (4*(threadIdx.x % 2))) & 0x0F;
        const int ls_hi = ((blk->scales_h >> (2*sub)) & 0x03) << 4;
        const int ls    = ls_lo | ls_hi;

        // The stored scale is biased by 32.
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
        x_df[row*MMQ_MMA_TILE_X_K_Q8_0 + sub] = d * (ls - 32);
#else
        x_df[row*(MMQ_TILE_NE_K/4) + row/4 + sub] = d * (ls - 32);
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    }
}
3185
+
3186
// Writes the per-thread accumulators in sum to dst for the dp4a path.
//   sum      accumulators of this thread, indexed by (column step, row step)
//   ids_dst  maps a tile column j to the destination column index
//   dst      output buffer; element (j, i) lands at ids_dst[j]*stride + i
//   stride   distance between destination columns, in floats
//   i_max    last valid row; only enforced when need_check is set
//   j_max    last valid column; always enforced
template<int mmq_x, int mmq_y, bool need_check>
static __device__ __forceinline__ void mmq_write_back_dp4a(
        const float * __restrict__ sum, const int32_t * __restrict__ ids_dst, float * __restrict__ dst,
        const int stride, const int i_max, const int j_max) {
    constexpr int nwarps       = mmq_get_nwarps_device();
    constexpr int warp_size    = ggml_cuda_get_physical_warp_size();
    constexpr int sums_per_col = mmq_y/warp_size;

#pragma unroll
    for (int col0 = 0; col0 < mmq_x; col0 += nwarps) {
        const int col = col0 + threadIdx.y;

        // Columns only grow with col0, so once one is out of range all later ones are too.
        if (col > j_max) {
            return;
        }

#pragma unroll
        for (int row0 = 0; row0 < mmq_y; row0 += warp_size) {
            const int row = row0 + threadIdx.x;

            if (!need_check || row <= i_max) {
                dst[ids_dst[col]*stride + row] = sum[(col0/nwarps) * sums_per_col + row0/warp_size];
            }
        }
    }
}
3213
+
3214
// Writes the per-thread accumulators in sum to dst for the MMA paths. Each accumulator element
// l of a tile_C fragment is mapped to its (i, j) position through tile_C::get_i / get_j.
//   sum      accumulators of this thread, tile_C::ne values per fragment
//   ids_dst  maps a tile column j to the destination column index
//   dst      output buffer; element (j, i) lands at ids_dst[j]*stride + i
//   stride   distance between destination columns, in floats
//   i_max    last valid row; only enforced when need_check is set
//   j_max    last valid column; always enforced
template<ggml_type type, int mmq_x, int mmq_y, bool need_check>
static __device__ __forceinline__ void mmq_write_back_mma(
        const float * __restrict__ sum, const int * __restrict__ ids_dst, float * __restrict__ dst,
        const int stride, const int i_max, const int j_max) {

    constexpr int granularity = mmq_get_granularity_device(mmq_x);
    constexpr int nwarps      = mmq_get_nwarps_device();

#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    constexpr int tileC_IJ = mmq_get_granularity_device(0);
    using tile_C = tile<tileC_IJ, tileC_IJ, int, DATA_LAYOUT_J_MAJOR>;
    constexpr int rows_per_warp = granularity;
#else
    using tile_C = tile<16, 8, int>;
    constexpr int rows_per_warp = 2 * granularity;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.

#if defined(TURING_MMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    static_assert(nwarps*tile_C::I == mmq_y, "nwarps*tile_C::I != mmq_y");
#else
    GGML_UNUSED(nwarps);
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)

    // First row and first column handled by this warp (threadIdx.y selects the warp).
    const int row_base = (threadIdx.y / ntx) * (ntx*tile_C::I);
    const int col_warp = (threadIdx.y % ntx) * tile_C::J;

#pragma unroll
    for (int col0 = 0; col0 < mmq_x; col0 += ntx*tile_C::J) {
#pragma unroll
        for (int n = 0; n < ntx; ++n) {
#pragma unroll
            for (int l = 0; l < tile_C::ne; ++l) {
                const int col = col0 + col_warp + tile_C::get_j(l);
                const int row = row_base + n*tile_C::I + tile_C::get_i(l);

                const bool col_ok = col <= j_max;
                const bool row_ok = !need_check || row <= i_max;

                if (col_ok && row_ok) {
                    dst[ids_dst[col]*stride + row] = sum[(col0/tile_C::J + n)*tile_C::ne + l];
                }
            }
        }
    }
}
3262
+
3263
+ // -------------------------------------------------------------------------------------------------------------------------------------
3264
+
3265
// Primary template, declared only: each supported ggml_type provides a partial specialization
// exposing vdr, load_tiles, vec_dot_mma and vec_dot_dp4a.
template <int mmq_x, int mmq_y, bool need_check, ggml_type type> struct mmq_type_traits;
3267
+
3268
// GGML_TYPE_Q1_0: tile loader plus the MMA and dp4a dot-product routines used for this type.
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q1_0> {
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q1_0<mmq_y, need_check>;
    static constexpr int              vdr          = VDR_Q1_0_Q8_1_MMQ;
};
3275
+
3276
// GGML_TYPE_Q4_0: tile loader plus the MMA and dp4a dot-product routines used for this type.
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q4_0> {
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q4_0_q8_1_dp4a<mmq_x, mmq_y>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_DS4>;
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q4_0<mmq_y, need_check>;
    static constexpr int              vdr          = VDR_Q4_0_Q8_1_MMQ;
};
3283
+
3284
// GGML_TYPE_Q4_1: tile loader plus the MMA and dp4a dot-product routines used for this type.
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q4_1> {
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q4_1_q8_1_dp4a<mmq_x, mmq_y>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_1_q8_1_mma<mmq_x, mmq_y>;
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q4_1<mmq_y, need_check>;
    static constexpr int              vdr          = VDR_Q4_1_Q8_1_MMQ;
};
3291
+
3292
// GGML_TYPE_Q5_0: tile loader plus the MMA and dp4a dot-product routines used for this type.
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q5_0> {
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q5_0<mmq_y, need_check>;
    static constexpr int              vdr          = VDR_Q5_0_Q8_1_MMQ;
};
3299
+
3300
// GGML_TYPE_Q5_1: tile loader plus the MMA and dp4a dot-product routines used for this type.
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q5_1> {
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_1_q8_1_dp4a<mmq_x, mmq_y>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_1_q8_1_mma<mmq_x, mmq_y>;
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q5_1<mmq_y, need_check>;
    static constexpr int              vdr          = VDR_Q5_1_Q8_1_MMQ;
};
3307
+
3308
// GGML_TYPE_Q8_0: tile loader plus the MMA and dp4a dot-product routines used for this type.
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q8_0> {
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q8_0<mmq_y, need_check>;
    static constexpr int              vdr          = VDR_Q8_0_Q8_1_MMQ;
};
3315
+
3316
// GGML_TYPE_MXFP4: the loader and MMA routine switch to the FP4 variants when
// BLACKWELL_MMA_AVAILABLE is defined; the dp4a routine is the same either way.
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_MXFP4> {
    static constexpr int              vdr          = VDR_MXFP4_Q8_1_MMQ;
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
#ifdef BLACKWELL_MMA_AVAILABLE
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_fp4_fp4_mma<mmq_x, mmq_y, GGML_TYPE_MXFP4>;
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_mxfp4_fp4<mmq_y, need_check>;
#else
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_mxfp4<mmq_y, need_check>;
#endif // BLACKWELL_MMA_AVAILABLE
};
3328
+
3329
// GGML_TYPE_NVFP4: the loader and MMA routine switch to the FP4 variants when
// BLACKWELL_MMA_AVAILABLE is defined; the dp4a routine is the same either way.
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_NVFP4> {
    static constexpr int              vdr          = VDR_NVFP4_Q8_1_MMQ;
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a<mmq_x, mmq_y>;
#ifdef BLACKWELL_MMA_AVAILABLE
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_fp4_fp4_mma<mmq_x, mmq_y, GGML_TYPE_NVFP4>;
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_nvfp4_nvfp4<mmq_y, need_check>;
#else
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y>;
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_nvfp4<mmq_y, need_check>;
#endif // BLACKWELL_MMA_AVAILABLE
};
3341
+
3342
// GGML_TYPE_Q2_K: tile loader plus the MMA and dp4a dot-product routines used for this type.
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q2_K> {
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q2_K_q8_1_dp4a<mmq_x, mmq_y>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q2_K_q8_1_mma<mmq_x, mmq_y>;
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q2_K<mmq_y, need_check>;
    static constexpr int              vdr          = VDR_Q2_K_Q8_1_MMQ;
};
3349
+
3350
// GGML_TYPE_Q3_K: tile loader plus the MMA and dp4a dot-product routines used for this type.
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q3_K> {
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q3_K_q8_1_dp4a<mmq_x, mmq_y>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y>;
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q3_K<mmq_y, need_check>;
    static constexpr int              vdr          = VDR_Q3_K_Q8_1_MMQ;
};
3357
+
3358
// GGML_TYPE_Q4_K: tile loader plus the MMA and dp4a dot-product routines used for this type.
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q4_K> {
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q4_K_q8_1_dp4a<mmq_x, mmq_y>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_1_q8_1_mma<mmq_x, mmq_y>;
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q4_K<mmq_y, need_check>;
    static constexpr int              vdr          = VDR_Q4_K_Q8_1_MMQ;
};
3365
+
3366
// GGML_TYPE_Q5_K: tile loader plus the MMA and dp4a dot-product routines used for this type.
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q5_K> {
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q5_K_q8_1_dp4a<mmq_x, mmq_y>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_1_q8_1_mma<mmq_x, mmq_y>;
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q5_K<mmq_y, need_check>;
    static constexpr int              vdr          = VDR_Q5_K_Q8_1_MMQ;
};
3373
+
3374
// GGML_TYPE_Q6_K: tile loader plus the MMA and dp4a dot-product routines used for this type.
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q6_K> {
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q6_K_q8_1_dp4a<mmq_x, mmq_y>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q6_K_q8_1_mma<mmq_x, mmq_y>;
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q6_K<mmq_y, need_check>;
    static constexpr int              vdr          = VDR_Q6_K_Q8_1_MMQ;
};
3381
+
3382
// GGML_TYPE_IQ2_XXS: tile loader plus the MMA and dp4a dot-product routines used for this type.
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_IQ2_XXS> {
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq2_xxs<mmq_y, need_check>;
    static constexpr int              vdr          = VDR_IQ2_XXS_Q8_1_MMQ;
};
3389
+
3390
// GGML_TYPE_IQ2_XS: tile loader plus the MMA and dp4a dot-product routines used for this type.
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_IQ2_XS> {
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a<mmq_x, mmq_y>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y>;
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq2_xs<mmq_y, need_check>;
    static constexpr int              vdr          = VDR_IQ2_XS_Q8_1_MMQ;
};
3397
+
3398
// MMQ function table for GGML_TYPE_IQ2_S.
// mul_mat_q_process_tile reads load_tiles and, depending on which MMA macros are defined at compile time,
// either vec_dot_mma or vec_dot_dp4a from this struct.
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_IQ2_S> {
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq2_s<mmq_y, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y>;
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a<mmq_x, mmq_y>;
    static constexpr int              vdr          = VDR_IQ2_S_Q8_1_MMQ;
};
3405
+
3406
// MMQ function table for GGML_TYPE_IQ3_XXS.
// mul_mat_q_process_tile reads load_tiles and, depending on which MMA macros are defined at compile time,
// either vec_dot_mma or vec_dot_dp4a from this struct.
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_IQ3_XXS> {
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq3_xxs<mmq_y, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
    static constexpr int              vdr          = VDR_IQ3_XXS_Q8_1_MMQ;
};
3413
+
3414
// MMQ function table for GGML_TYPE_IQ3_S.
// mul_mat_q_process_tile reads load_tiles and, depending on which MMA macros are defined at compile time,
// either vec_dot_mma or vec_dot_dp4a from this struct.
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_IQ3_S> {
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq3_s<mmq_y, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
    static constexpr int              vdr          = VDR_IQ3_S_Q8_1_MMQ;
};
3421
+
3422
// MMQ function table for GGML_TYPE_IQ1_S.
// mul_mat_q_process_tile reads load_tiles and, depending on which MMA macros are defined at compile time,
// either vec_dot_mma or vec_dot_dp4a from this struct.
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_IQ1_S> {
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq1_s<mmq_y, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_1_q8_1_mma<mmq_x, mmq_y>;
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_1_q8_1_dp4a<mmq_x, mmq_y>;
    static constexpr int              vdr          = VDR_IQ1_S_Q8_1_MMQ;
};
3429
+
3430
// MMQ function table for GGML_TYPE_IQ4_NL.
// mul_mat_q_process_tile reads load_tiles and, depending on which MMA macros are defined at compile time,
// either vec_dot_mma or vec_dot_dp4a from this struct.
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_IQ4_NL> {
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq4_nl<mmq_y, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
    static constexpr int              vdr          = VDR_IQ4_NL_Q8_1_MMQ;
};
3437
+
3438
// MMQ function table for GGML_TYPE_IQ4_XS.
// mul_mat_q_process_tile reads load_tiles and, depending on which MMA macros are defined at compile time,
// either vec_dot_mma or vec_dot_dp4a from this struct.
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_IQ4_XS> {
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq4_xs<mmq_y, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
    static constexpr int              vdr          = VDR_IQ4_XS_Q8_1_MMQ;
};
3445
+
3446
// Accumulates the contribution of the k blocks [kb0_start, kb0_stop) to one output tile and writes the result out.
//
// Shared memory layout used here (dynamic, shared with the calling kernel):
//   [0, mmq_x)            ints reserved at the start (the caller keeps the destination column ids there)
//   tile_y                mmq_x*MMQ_TILE_Y_K ints, padded to a multiple of nwarps*warp_size
//   tile_x                everything after tile_y, filled by load_tiles
//
// Per k iteration the x tile is loaded once and the y tile is loaded and consumed in two halves,
// each bracketed by __syncthreads() so that no thread overwrites tile_y while another still reads it.
//
// fixup == false: the sums are written to dst via write_back using ids_dst and the tile bounds.
// fixup == true:  the sums are written to this CUDA block's mmq_x*mmq_y slot of tmp_fixup instead.
template <ggml_type type, int mmq_x, bool need_check, bool fixup>
static __device__ __forceinline__ void mul_mat_q_process_tile(
        const char * __restrict__ x, const int offset_x, const int * __restrict__ y,
        const int * __restrict__ ids_dst, float * __restrict__ dst, float * __restrict__ tmp_fixup,
        const int stride_row_x, const int ncols_y, const int stride_col_dst,
        const int tile_x_max_i, const int tile_y_max_j, const int kb0_start, const int kb0_stop) {

    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
    constexpr int nwarps    = mmq_get_nwarps_device();
    constexpr int qk        = ggml_cuda_type_traits<type>::qk;
    constexpr int mmq_y     = get_mmq_y_device();

    using traits = mmq_type_traits<mmq_x, mmq_y, need_check, type>;
    constexpr load_tiles_mmq_t load_tiles = traits::load_tiles;

    extern __shared__ int data_mul_mat_q[];
    int * tile_y = data_mul_mat_q + mmq_x;
    int * tile_x = tile_y + GGML_PAD(mmq_x*MMQ_TILE_Y_K, nwarps*warp_size);

#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    constexpr vec_dot_mmq_t    vec_dot    = traits::vec_dot_mma;
    constexpr mmq_write_back_t write_back = mmq_write_back_mma<type, mmq_x, mmq_y, need_check>;
#else
    constexpr vec_dot_mmq_t    vec_dot    = traits::vec_dot_dp4a;
    constexpr mmq_write_back_t write_back = mmq_write_back_dp4a<mmq_x, mmq_y, need_check>;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)

#if defined(BLACKWELL_MMA_AVAILABLE)
    // FP4 tile stores 8 blocks
    constexpr int ne_block = (type == GGML_TYPE_MXFP4 || type == GGML_TYPE_NVFP4) ? QK_K : 4 * QK8_1;
#else
    constexpr int ne_block = 4 * QK8_1;
#endif // defined(BLACKWELL_MMA_AVAILABLE)

    constexpr int ITER_K          = get_iter_k(type);
    constexpr int blocks_per_iter = ITER_K / qk;

    // Number of ints in one block_q8_1_mmq.
    constexpr int sz = sizeof(block_q8_1_mmq) / sizeof(int);

    // Linear index of this thread within the CUDA block.
    const int tid = threadIdx.y*warp_size + threadIdx.x;

    float sum[mmq_x*mmq_y / (nwarps*warp_size)] = {0.0f};

    for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_iter) {
        load_tiles(x, tile_x, offset_x + kb0, tile_x_max_i, stride_row_x);

        // First half of the y data for this k iteration.
        const int * y_first = y + ncols_y * (kb0 * qk / ne_block) * sz;
#pragma unroll
        for (int l0 = 0; l0 < mmq_x * MMQ_TILE_Y_K; l0 += nwarps * warp_size) {
            const int l = l0 + tid;

            tile_y[l] = y_first[l];
        }

        __syncthreads();

        vec_dot(tile_x, tile_y, sum, 0);

        __syncthreads();

        // Second half of the y data; it starts ncols_y*sz ints after the first half.
        const int * y_second = y + ncols_y * ((kb0 * qk / ne_block) * sz + sz);
#pragma unroll
        for (int l0 = 0; l0 < mmq_x * MMQ_TILE_Y_K; l0 += nwarps * warp_size) {
            const int l = l0 + tid;

            tile_y[l] = y_second[l];
        }

        __syncthreads();

        vec_dot(tile_x, tile_y, sum, MMQ_TILE_NE_K);

        __syncthreads();
    }

    if (!fixup) {
        write_back(sum, ids_dst, dst, stride_col_dst, tile_x_max_i, tile_y_max_j);
    } else {
        write_back(sum, ids_dst, tmp_fixup + blockIdx.x*(mmq_x*mmq_y), mmq_y, mmq_y, mmq_x);
    }
}
3526
+
3527
+
3528
// The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598
//
// Launch configuration (see launch_mul_mat_q): blockDim = (warp_size, nwarps, 1) with dynamic shared memory
// whose first mmq_x ints hold the destination column ids of the tile currently being processed.
//
// Two code paths exist:
//   - Conventional tiling (HIP without CDNA, or CUDA below Volta): blockIdx.{x,y,z} select the tile directly,
//     the whole k range [0, blocks_per_ne00.z) is processed and the result is written to dst.
//   - Stream-k (everything else): each CUDA block owns the range [kbc, kbc_stop) of the flattened
//     (tile, k block) space. Tiles that the block runs to their end are written to dst; a trailing tile that
//     is not run to its end is written to tmp_fixup and combined later by mul_mat_q_stream_k_fixup.
//
// If ids_dst is non-null the column range of the tile is taken from expert_bounds[zt], expert_bounds[zt + 1]
// and the destination column ids are loaded from ids_dst.
template <ggml_type type, int mmq_x, bool need_check>
#if defined(GGML_USE_HIP)
#if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
    __launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device(), 2)
#endif // defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
#else
#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
    __launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device(), 1)
#else
    __launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device(), 2)
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#endif // defined(GGML_USE_HIP)
static __global__ void mul_mat_q(
        const char * __restrict__ x, const int * __restrict__ y, const int32_t * __restrict__ ids_dst,
        const int32_t * __restrict__ expert_bounds, float * __restrict__ dst, float * __restrict__ tmp_fixup,
        const uint3 blocks_per_ne00, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst,
        const uint3 channel_ratio, const uint3 nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
        const uint3 sample_ratio, const uint3 nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
        const uint3 ntx) {

    // Skip unused template specializations for faster compilation:
    if (mmq_x > get_mmq_x_max_device() || mmq_x % mmq_get_granularity_device(mmq_x) != 0) {
        NO_DEVICE_CODE;
        return;
    }

    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
    constexpr int nwarps    = mmq_get_nwarps_device();

    constexpr int qk    = ggml_cuda_type_traits<type>::qk;
    constexpr int mmq_y = get_mmq_y_device();

    const uint32_t nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y

    // Linear index of this thread within the CUDA block.
    const int tid = threadIdx.y*warp_size + threadIdx.x;

    // Initialize the ids for writing back data with just the index.
    // For regular matrix multiplications this is never changed.
    // For MoE the correct indices are loaded from ids_dst.
    extern __shared__ int ids_dst_shared[]; // Stored at beginning of shared memory.
#pragma unroll
    for (int j0 = 0; j0 < mmq_x; j0 += nwarps*warp_size) {
        const int j = j0 + tid;

        if (j0 + nwarps*warp_size > mmq_x && j >= mmq_x) {
            break;
        }

        ids_dst_shared[j] = j;
    }
    __syncthreads();

    // On non-CDNA AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
#if (defined(GGML_USE_HIP) && !defined(CDNA)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
    {
        const uint2 sample_channel = fast_div_modulo(blockIdx.z, nchannels_y);
        const int wt = sample_channel.x;
        const int zt = sample_channel.y;
        const int jt = blockIdx.y;
        const int it = blockIdx.x;

        // Defaults for regular matrix multiplication:
        int col_low    = 0;
        int col_diff   = ncols_dst;
        int offset_y   = wt*stride_sample_y   + zt*stride_channel_y;
        int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;

        if (ids_dst) {
            col_low  = expert_bounds[zt + 0];
            col_diff = expert_bounds[zt + 1] - col_low;

            offset_y   = 0;
            offset_dst = 0;

            // This tile lies entirely beyond the columns of this expert, nothing to do.
            if (jt*mmq_x >= col_diff) {
                return;
            }

            // __syncthreads(); // There is no previous tile that could cause a race condition.
#pragma unroll
            for (int j0 = 0; j0 < mmq_x; j0 += nwarps*warp_size) {
                const int j = j0 + tid;

                if (j0 + nwarps*warp_size > mmq_x && j >= mmq_x) {
                    break;
                }

                ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
            }
            __syncthreads();
        }

        offset_y   += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
        offset_dst += it*mmq_y;

        const int tile_x_max_i = nrows_x  - it*mmq_y - 1;
        const int tile_y_max_j = col_diff - jt*mmq_x - 1;

        const int offset_x = fastdiv(wt, sample_ratio)*stride_sample_x + fastdiv(zt, channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;

        constexpr bool fixup = false;
        mul_mat_q_process_tile<type, mmq_x, need_check, fixup>
            (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
             tile_x_max_i, tile_y_max_j, 0, blocks_per_ne00.z);
        return;
    }
#endif // (defined(GGML_USE_HIP) && !defined(CDNA)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA

    constexpr int ITER_K          = get_iter_k(type);
    constexpr int blocks_per_iter = ITER_K / qk;

    // Total number of k blocks over all output tiles, i.e. the size of the continuous ijk space.
    const uint32_t kbc_total = nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z;

    // kbc == k block continuous, current index in continuous ijk space.
    int kbc      = int64_t(blockIdx.x)    *kbc_total / gridDim.x;
    int kbc_stop = int64_t(blockIdx.x + 1)*kbc_total / gridDim.x;

    // Round both range limits down to a multiple of blocks_per_iter within their tile.
    kbc      -= fastmodulo(kbc,      blocks_per_ne00) % blocks_per_iter;
    kbc_stop -= fastmodulo(kbc_stop, blocks_per_ne00) % blocks_per_iter;

    // kb0 == k index when doing the matrix multiplication for an output tile.
    int kb0_start = fastmodulo(kbc, blocks_per_ne00);
    int kb0_stop  = min(blocks_per_ne00.z, uint32_t(kb0_start + kbc_stop - kbc));

    // Process every tile that this CUDA block runs up to the last k block of the tile.
    while (kbc < kbc_stop && kb0_stop == int(blocks_per_ne00.z)) {
        // Decompose the tile index into (it, wt, zt, jt).
        int   rest = fastdiv(kbc, blocks_per_ne00);
        uint2 qr   = fast_div_modulo(rest, ntx);
        const int jt = qr.y;
        rest = qr.x;
        qr   = fast_div_modulo(rest, nchannels_y);
        const int zt = qr.y;
        rest = qr.x;
        qr   = fast_div_modulo(rest, nsamples_y);
        const int wt = qr.y;
        const int it = qr.x;

        // Defaults for regular matrix multiplication:
        int col_low    = 0;
        int col_diff   = ncols_dst;
        int offset_y   = wt*stride_sample_y   + zt*stride_channel_y;
        int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;

        if (ids_dst) {
            col_low  = expert_bounds[zt + 0];
            col_diff = expert_bounds[zt + 1] - col_low;

            offset_y   = 0;
            offset_dst = 0;

            // This tile lies entirely beyond the columns of this expert, move on to the start of the next tile.
            if (jt*mmq_x >= col_diff) {
                kbc += blocks_per_ne00.z;
                kbc -= fastmodulo(kbc, blocks_per_ne00);

                kb0_start = 0;
                kb0_stop  = min(blocks_per_ne00.z, uint32_t(kbc_stop - kbc));

                continue;
            }

            // Barrier before overwriting the ids that the previous tile may still be reading.
            __syncthreads();
#pragma unroll
            for (int j0 = 0; j0 < mmq_x; j0 += nwarps*warp_size) {
                const int j = j0 + tid;

                if (j0 + nwarps*warp_size > mmq_x && j >= mmq_x) {
                    break;
                }

                ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
            }
            __syncthreads();
        }

        offset_y   += (col_low + jt * mmq_x) * (sizeof(block_q8_1_mmq) / sizeof(int));
        offset_dst += it*mmq_y;

        const int tile_x_max_i = nrows_x  - it*mmq_y - 1;
        const int tile_y_max_j = col_diff - jt*mmq_x - 1;

        const int offset_x = fastdiv(wt, sample_ratio)*stride_sample_x + fastdiv(zt, channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;

        constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
        mul_mat_q_process_tile<type, mmq_x, need_check, fixup>
            (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
             tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);

        // Advance to the first k block of the next tile.
        kbc += blocks_per_ne00.z;
        kbc -= fastmodulo(kbc, blocks_per_ne00);

        kb0_start = 0;
        kb0_stop  = min(blocks_per_ne00.z, uint32_t(kbc_stop - kbc));
    }

    // No work left for a trailing tile.
    if (kbc >= kbc_stop) {
        return;
    }

    // Decompose the index of the trailing tile into (it, wt, zt, jt).
    int   rest = fastdiv(kbc, blocks_per_ne00);
    uint2 qr   = fast_div_modulo(rest, ntx);
    const int jt = qr.y;
    rest = qr.x;
    qr   = fast_div_modulo(rest, nchannels_y);
    const int zt = qr.y;
    rest = qr.x;
    qr   = fast_div_modulo(rest, nsamples_y);
    const int wt = qr.y;
    const int it = qr.x;

    // Defaults for regular matrix multiplication:
    int col_low    = 0;
    int col_diff   = ncols_dst;
    int offset_y   = wt*stride_sample_y   + zt*stride_channel_y;
    int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;

    if (ids_dst) {
        col_low  = expert_bounds[zt + 0];
        col_diff = expert_bounds[zt + 1] - col_low;

        offset_y   = 0;
        offset_dst = 0;

        if (jt*mmq_x >= col_diff) {
            return;
        }

        // The memory layout for the fixup buffer is always contiguous, therefore reset ids:
        __syncthreads();
#pragma unroll
        for (int j0 = 0; j0 < mmq_x; j0 += nwarps*warp_size) {
            const int j = j0 + tid;

            if (j0 + nwarps*warp_size > mmq_x && j >= mmq_x) {
                break;
            }

            ids_dst_shared[j] = j;
        }
        __syncthreads();
    }

    offset_y   += (col_low + jt * mmq_x) * (sizeof(block_q8_1_mmq) / sizeof(int));
    offset_dst += it*mmq_y;

    const int tile_x_max_i = nrows_x  - it*mmq_y - 1;
    const int tile_y_max_j = col_diff - jt*mmq_x - 1;

    const int offset_x = fastdiv(wt, sample_ratio)*stride_sample_x + fastdiv(zt, channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;

    constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
    mul_mat_q_process_tile<type, mmq_x, need_check, fixup>
        (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
         tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
}
3784
+
3785
// Second pass of the stream-k scheme: adds the partial tile sums that earlier CUDA blocks of mul_mat_q left in
// tmp_last_tile onto the tile in dst that the CUDA block with the same blockIdx.x ran to its end.
//
// Launch configuration (see launch_mul_mat_q): same gridDim.x as mul_mat_q, gridDim.y = mmq_y/warp_size,
// blockDim = (warp_size, half the nwarps of mul_mat_q, 1). Thread (threadIdx.x, threadIdx.y) handles
// row i = blockIdx.y*warp_size + threadIdx.x and the columns j = threadIdx.y, threadIdx.y + nwarps, ...
//
// The k ranges are recomputed with the same formula that mul_mat_q uses, so both kernels agree on which
// CUDA block wrote which part of which tile.
template <ggml_type type, int mmq_x, bool need_check>
__launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device()/2, 1)
static __global__ void mul_mat_q_stream_k_fixup(
        const int32_t * __restrict__ ids_dst, const int32_t * __restrict__ expert_bounds, float * __restrict__ dst,
        float * __restrict__ tmp_last_tile, const uint3 blocks_per_ne00, const int nrows_x, const int ncols_dst,
        const int stride_col_dst, const uint3 nchannels_y, const int stride_channel_dst, const uint3 nsamples_y,
        const int stride_sample_dst, const uint3 ntx) {
    constexpr int mmq_y           = get_mmq_y_device();
    constexpr int qk              = ggml_cuda_type_traits<type>::qk;
    constexpr int ITER_K          = get_iter_k(type);
    constexpr int blocks_per_iter = ITER_K / qk;

    constexpr int nwarps    = mmq_get_nwarps_device()/2;
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();

    float sum[mmq_x / nwarps] = {0.0f};
    const int i = blockIdx.y*warp_size + threadIdx.x;

    const int nty = (nrows_x + mmq_y - 1) / mmq_y;

    const int bidx0 = blockIdx.x;

    // Total number of k blocks over all output tiles, i.e. the size of the continuous ijk space.
    const uint32_t kbc_total = nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z;

    // kbc == k block continuous, current index in continuous ijk space.
    int kbc0      = int64_t(blockIdx.x)    *kbc_total / gridDim.x;
    int kbc0_stop = int64_t(blockIdx.x + 1)*kbc_total / gridDim.x;

    kbc0      -= fastmodulo(kbc0,      blocks_per_ne00) % blocks_per_iter;
    kbc0_stop -= fastmodulo(kbc0_stop, blocks_per_ne00) % blocks_per_iter;

    // This CUDA block did not have any data:
    if (kbc0 == kbc0_stop) {
        return;
    }
    // This CUDA block wrote the beginning of its first tile, so there is no earlier partial result for it:
    if (fastmodulo(kbc0, blocks_per_ne00) == 0) {
        return;
    }
    // This CUDA block started and stopped inside the same tile without reaching the tile's end:
    if (fastdiv(kbc0, blocks_per_ne00) == fastdiv(kbc0_stop, blocks_per_ne00) && fastmodulo(kbc0_stop, blocks_per_ne00) != 0) {
        return;
    }

    bool any_fixup = false;

    // Iterate over previous blocks and sum up partial sums written to fixup buffer.
    // All CUDA blocks that get here must have a previous block that needs a fixup.
    int bidx     = bidx0 - 1;
    int kbc_stop = kbc0;
    for (;;) {
        int kbc = int64_t(bidx)*kbc_total / gridDim.x;
        kbc -= fastmodulo(kbc, blocks_per_ne00) % blocks_per_iter;

        if (kbc == kbc_stop) { // Did not have any data.
            bidx--;
            kbc_stop = kbc;
            continue;
        }

        any_fixup = true;

#pragma unroll
        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
            const int j = j0 + threadIdx.y;

            sum[j0/nwarps] += tmp_last_tile[bidx*(mmq_x*mmq_y) + j*mmq_y + i];
        }

        // If this block started in a previous tile we are done and don't need to combine additional partial results.
        if (fastmodulo(kbc, blocks_per_ne00) == 0 || fastdiv(kbc, blocks_per_ne00) < fastdiv(kbc0, blocks_per_ne00)) {
            break;
        }
        bidx--;
        kbc_stop = kbc;
    }

    if (!any_fixup) {
        return;
    }

    // Decompose the index of the tile to fix up into (it, wt, zt, jt).
    int   rest = fastdiv(kbc0, blocks_per_ne00);
    uint2 qr   = fast_div_modulo(rest, ntx);
    const int jt = qr.y;
    rest = qr.x;
    qr   = fast_div_modulo(rest, nchannels_y);
    const int zt = qr.y;
    rest = qr.x;
    qr   = fast_div_modulo(rest, nsamples_y);
    const int wt = qr.y;
    const int it = qr.x;

    const int i_max = nrows_x - it*mmq_y - 1;

    if (!ids_dst) {
        // Regular matrix multiplication: the columns of the tile are contiguous in dst.
        float * dst_tile = dst + (wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst + it*mmq_y);

        const int j_max = ncols_dst - jt*mmq_x - 1;
        if (need_check && i > i_max) {
            return;
        }

#pragma unroll
        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
            const int j = j0 + threadIdx.y;

            if (j > j_max) {
                return;
            }

            dst_tile[j*stride_col_dst + i] += sum[j0/nwarps];
        }
        return;
    }

    // ids_dst is set: look up the destination column of every tile column.
    __shared__ int ids_dst_shared[mmq_x];
    const int col_low  = expert_bounds[zt + 0];
    const int col_high = expert_bounds[zt + 1];
    const int col_diff = col_high - col_low;

    for (int j = threadIdx.y*warp_size + threadIdx.x; j < mmq_x; j += nwarps*warp_size) {
        ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
    }
    __syncthreads();

    float * dst_rows = dst + it*mmq_y;

    const int j_max = col_diff - jt*mmq_x - 1;
    if (need_check && i > i_max) {
        return;
    }

#pragma unroll
    for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
        const int j = j0 + threadIdx.y;

        if (j > j_max) {
            return;
        }

        dst_rows[ids_dst_shared[j]*stride_col_dst + i] += sum[j0/nwarps];
    }
}
3923
+
3924
// Host-side argument bundle for mul_mat_q_case / launch_mul_mat_q.
// The member order is part of the interface (aggregate initialization), do not reorder.
struct mmq_args {
    // Pointers forwarded to the kernels:
    const char *    x;
    ggml_type       type_x;
    const int *     y;
    const int32_t * ids_dst;
    const int32_t * expert_bounds;
    float *         dst;

    // Matrix dimensions and strides; nrows_dst is passed to the kernels as stride_col_dst:
    int64_t ncols_x;
    int64_t nrows_x;
    int64_t ncols_dst;
    int64_t stride_row_x;
    int64_t ncols_y;
    int64_t nrows_dst;

    // Channel dimension:
    int64_t nchannels_x;
    int64_t nchannels_y;
    int64_t stride_channel_x;
    int64_t stride_channel_y;
    int64_t stride_channel_dst;

    // Sample dimension:
    int64_t nsamples_x;
    int64_t nsamples_y;
    int64_t stride_sample_x;
    int64_t stride_sample_y;
    int64_t stride_sample_dst;

    // use_stream_k selects the stream-k launch over plain tiling, ncols_max determines the number of tiles in x direction:
    bool    use_stream_k;
    int64_t ncols_max;
};
3931
+
3932
// Returns the number of bytes of dynamic shared memory that mul_mat_q needs for the given tile shape:
// mmq_x ints for the destination ids, the x tile, and the y tile padded to a multiple of nwarps*warp_size ints.
// The x tile size depends on whether cc reports one of the MMA paths as available or the dp4a layout is used.
template<ggml_type type>
static size_t mmq_get_nbytes_shared(const int mmq_x, const int mmq_y, const int cc, const int warp_size, const int nwarps) {
    const tile_x_sizes txs           = mmq_get_dp4a_tile_x_sizes(type, mmq_y);
    const int          mmq_tile_x_k  = mmq_get_mma_tile_x_k(type);
    const bool         mma_available = turing_mma_available(cc) || amd_mfma_available(cc) || amd_wmma_available(cc);

    size_t nbs_x;
    if (mma_available) {
        nbs_x = mmq_y*mmq_tile_x_k*sizeof(int);
    } else {
        nbs_x = txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int);
    }

    const size_t nbs_ids = mmq_x*sizeof(int);
    const size_t nbs_y   = mmq_x * (sizeof(block_q8_1_mmq));

    return nbs_ids + nbs_x + GGML_PAD(nbs_y, nwarps*warp_size*sizeof(int));
}
3941
+
3942
// Launches mul_mat_q<type, mmq_x, need_check> on stream and, if run_fixup is set, checks for a launch error
// and then launches mul_mat_q_stream_k_fixup<type, mmq_x, need_check> behind it on the same stream.
// This is the single place where the kernel argument lists are spelled out.
template <ggml_type type, int mmq_x, bool need_check>
static void launch_mul_mat_q_kernels(
        const mmq_args & args, cudaStream_t stream,
        const dim3 & block_nums, const dim3 & block_dims, const int nbytes_shared,
        float * tmp_fixup, const bool run_fixup, const dim3 & block_nums_fixup, const dim3 & block_dims_fixup,
        const uint3 blocks_per_ne00_fd, const uint3 channel_ratio_fd, const uint3 nchannels_y_fd,
        const uint3 sample_ratio_fd, const uint3 nsamples_y_fd, const uint3 ntx_fd) {
    mul_mat_q<type, mmq_x, need_check><<<block_nums, block_dims, nbytes_shared, stream>>>
        (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup,
         blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
         channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
         sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
         ntx_fd);

    if (!run_fixup) {
        return;
    }

    CUDA_CHECK(cudaGetLastError());
    mul_mat_q_stream_k_fixup<type, mmq_x, need_check><<<block_nums_fixup, block_dims_fixup, 0, stream>>>
        (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup, blocks_per_ne00_fd, args.nrows_x, args.ncols_dst,
         args.nrows_dst, nchannels_y_fd, args.stride_channel_dst, nsamples_y_fd, args.stride_sample_dst,
         ntx_fd);
}

// Configures and launches the MMQ kernels for a fixed tile width mmq_x on the current device.
//
// - need_check is enabled iff args.nrows_x is not a multiple of mmq_y.
// - Without args.use_stream_k one CUDA block per output tile is launched, grid = (nty, ntx, nchannels_y*nsamples_y).
// - With args.use_stream_k a 1D grid is launched: one block per tile if the device is NVIDIA and the tiling
//   efficiency is at least 90%, otherwise one block per SM. The fixup buffer is only allocated and the fixup
//   kernel only launched if the number of tiles is not a multiple of the number of CUDA blocks.
template <ggml_type type, int mmq_x>
static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
    const int id        = ggml_cuda_get_device();
    const int cc        = ggml_cuda_info().devices[id].cc;
    const int nsm       = ggml_cuda_info().devices[id].nsm;
    const int warp_size = ggml_cuda_info().devices[id].warp_size;
    const int nwarps    = mmq_get_nwarps_host(cc, warp_size);
    const int mmq_y     = get_mmq_y_host(cc);

    const dim3 block_dims(warp_size, nwarps, 1);

    const int nbytes_shared = mmq_get_nbytes_shared<type>(mmq_x, mmq_y, cc, warp_size, nwarps);

    CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q<type, mmq_x, false>), nbytes_shared);
    CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q<type, mmq_x,  true>), nbytes_shared);

    const int nty  = (args.nrows_x   + mmq_y - 1) / mmq_y;
    const int ntx  = (args.ncols_max + mmq_x - 1) / mmq_x;
    const int ntzw = args.nchannels_y * args.nsamples_y;

    GGML_ASSERT(args.nchannels_y % args.nchannels_x == 0);
    GGML_ASSERT(args.nsamples_y  % args.nsamples_x  == 0);
    const int channel_ratio = args.nchannels_y / args.nchannels_x;
    const int sample_ratio  = args.nsamples_y  / args.nsamples_x;

    const uint3 blocks_per_ne00_fd = init_fastdiv_values(args.ncols_x / ggml_cuda_type_traits<type>::qk);
    const uint3 ntx_fd             = init_fastdiv_values(ntx);
    const uint3 nchannels_y_fd     = init_fastdiv_values(args.nchannels_y);
    const uint3 nsamples_y_fd      = init_fastdiv_values(args.nsamples_y);
    const uint3 channel_ratio_fd   = init_fastdiv_values(channel_ratio);
    const uint3 sample_ratio_fd    = init_fastdiv_values(sample_ratio);

    // Bounds checks in the kernels are only needed if the last tile in y direction is incomplete.
    const bool need_check = args.nrows_x % mmq_y != 0;

    // Dispatches the runtime need_check flag to the matching template instantiation.
    const auto launch = [&](const dim3 & block_nums, float * tmp_fixup, const bool run_fixup,
                            const dim3 & block_nums_fixup, const dim3 & block_dims_fixup) {
        if (need_check) {
            launch_mul_mat_q_kernels<type, mmq_x, true>(
                args, stream, block_nums, block_dims, nbytes_shared,
                tmp_fixup, run_fixup, block_nums_fixup, block_dims_fixup,
                blocks_per_ne00_fd, channel_ratio_fd, nchannels_y_fd, sample_ratio_fd, nsamples_y_fd, ntx_fd);
        } else {
            launch_mul_mat_q_kernels<type, mmq_x, false>(
                args, stream, block_nums, block_dims, nbytes_shared,
                tmp_fixup, run_fixup, block_nums_fixup, block_dims_fixup,
                blocks_per_ne00_fd, channel_ratio_fd, nchannels_y_fd, sample_ratio_fd, nsamples_y_fd, ntx_fd);
        }
    };

    if (!args.use_stream_k) {
        const dim3 block_nums_xy_tiling(nty, ntx, ntzw);
        // No fixup buffer and no fixup kernel for plain tiling; the fixup dims are unused placeholders.
        launch(block_nums_xy_tiling, nullptr, false, dim3(), dim3());
        return;
    }

    // For the stream-k kernel it is possible to run it with tiling by setting the number of CUDA blocks equal to the number of tiles.
    // This is worthwhile if the efficiency of tiling is high and skipping the fixup kernel is more important.
    const int ntiles_dst               = ntx * nty * ntzw;
    const int tiles_nwaves             = (ntiles_dst + nsm - 1) / nsm;
    const int tiles_efficiency_percent = 100 * ntiles_dst / (nsm*tiles_nwaves);
    const dim3 block_nums_stream_k(GGML_CUDA_CC_IS_NVIDIA(cc) && tiles_efficiency_percent >= 90 ? ntiles_dst : nsm, 1, 1);

    GGML_ASSERT(ntiles_dst * blocks_per_ne00_fd.z < (1 << 30)); // Assert that variable kbc will not overflow.

    const bool fixup_needed = ntiles_dst % block_nums_stream_k.x != 0;

    // The pool allocation stays alive until this function returns, i.e. past both kernel launches.
    ggml_cuda_pool & pool = ctx.pool(id);
    ggml_cuda_pool_alloc<float> tmp_fixup(pool);
    if (fixup_needed) {
        tmp_fixup.alloc(block_nums_stream_k.x * mmq_x*mmq_y);
    }

    const dim3 block_nums_fixup(block_nums_stream_k.x, mmq_y/warp_size, 1);
    const dim3 block_dims_fixup(block_dims.x, block_dims.y/2, block_dims.z);

    launch(block_nums_stream_k, tmp_fixup.ptr, fixup_needed, block_nums_fixup, block_dims_fixup);
}
4054
+
4055
// Picks the tile width mmq_x for the current device and launches the matching launch_mul_mat_q instantiation.
//
// Candidates are the multiples of 8 up to get_mmq_x_max_host(cc). A candidate is skipped if it is not a
// multiple of its granularity or if its shared memory requirement exceeds smpbo. Among the remaining
// candidates the smallest one that yields the fewest tiles in x direction is chosen; the search stops as soon
// as a single tile suffices. If no candidate is usable the function prints mmq_x_best and aborts.
template <ggml_type type>
void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
    const int    id        = ggml_cuda_get_device();
    const int    cc        = ggml_cuda_info().devices[id].cc;
    const size_t smpbo     = ggml_cuda_info().devices[id].smpbo;
    const int    warp_size = ggml_cuda_info().devices[id].warp_size;
    const int    nwarps    = mmq_get_nwarps_host(cc, warp_size);

    const int mmq_x_max = get_mmq_x_max_host(cc);
    const int mmq_y     = get_mmq_y_host(cc);

    int mmq_x_best    = 0;
    int ntiles_x_best = INT_MAX;

    for (int mmq_x = 8; mmq_x <= mmq_x_max && ntiles_x_best > 1; mmq_x += 8) {
        const int granularity = mmq_get_granularity_host(mmq_x, cc);

        if (mmq_x % granularity != 0) {
            continue;
        }
        if (mmq_get_nbytes_shared<type>(mmq_x, mmq_y, cc, warp_size, nwarps) > smpbo) {
            continue;
        }

        const int ntiles_x = (args.ncols_max + mmq_x - 1) / mmq_x;

        // Strict comparison: on ties the smaller, earlier candidate is kept.
        if (ntiles_x < ntiles_x_best) {
            mmq_x_best    = mmq_x;
            ntiles_x_best = ntiles_x;
        }
    }

    // mmq_x is a template parameter, therefore the runtime choice has to be mapped to an instantiation:
    switch (mmq_x_best) {
        case   8: launch_mul_mat_q<type,   8>(ctx, args, stream); break;
        case  16: launch_mul_mat_q<type,  16>(ctx, args, stream); break;
        case  24: launch_mul_mat_q<type,  24>(ctx, args, stream); break;
        case  32: launch_mul_mat_q<type,  32>(ctx, args, stream); break;
        case  40: launch_mul_mat_q<type,  40>(ctx, args, stream); break;
        case  48: launch_mul_mat_q<type,  48>(ctx, args, stream); break;
        case  56: launch_mul_mat_q<type,  56>(ctx, args, stream); break;
        case  64: launch_mul_mat_q<type,  64>(ctx, args, stream); break;
        case  72: launch_mul_mat_q<type,  72>(ctx, args, stream); break;
        case  80: launch_mul_mat_q<type,  80>(ctx, args, stream); break;
        case  88: launch_mul_mat_q<type,  88>(ctx, args, stream); break;
        case  96: launch_mul_mat_q<type,  96>(ctx, args, stream); break;
        case 104: launch_mul_mat_q<type, 104>(ctx, args, stream); break;
        case 112: launch_mul_mat_q<type, 112>(ctx, args, stream); break;
        case 120: launch_mul_mat_q<type, 120>(ctx, args, stream); break;
        case 128: launch_mul_mat_q<type, 128>(ctx, args, stream); break;
        default:
            fprintf(stderr, "mmq_x_best=%d\n", mmq_x_best);
            GGML_ABORT("fatal error");
            break;
    }
}
4139
+
4140
// Expands to an explicit instantiation of mul_mat_q_case for one ggml_type. The expansion carries no
// trailing semicolon; the use site supplies it. Prefixing the expansion with `extern`, as done below,
// makes it an explicit instantiation declaration instead of a definition.
#define DECL_MMQ_CASE(type) \
    template void mul_mat_q_case<type>(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream)

// Explicit instantiation declarations, one per type listed.
// NOTE(review): the matching instantiation definitions are not in this part of the file; presumably
// they live in separate translation units that use DECL_MMQ_CASE without `extern` -- confirm.

// Types named Q<bits>_0 / Q<bits>_1.
extern DECL_MMQ_CASE(GGML_TYPE_Q4_0);
extern DECL_MMQ_CASE(GGML_TYPE_Q4_1);
extern DECL_MMQ_CASE(GGML_TYPE_Q5_0);
extern DECL_MMQ_CASE(GGML_TYPE_Q5_1);
extern DECL_MMQ_CASE(GGML_TYPE_Q8_0);

// Types named *FP4.
extern DECL_MMQ_CASE(GGML_TYPE_MXFP4);
extern DECL_MMQ_CASE(GGML_TYPE_NVFP4);

// Types named Q<bits>_K.
extern DECL_MMQ_CASE(GGML_TYPE_Q2_K);
extern DECL_MMQ_CASE(GGML_TYPE_Q3_K);
extern DECL_MMQ_CASE(GGML_TYPE_Q4_K);
extern DECL_MMQ_CASE(GGML_TYPE_Q5_K);
extern DECL_MMQ_CASE(GGML_TYPE_Q6_K);

// Types named IQ*.
extern DECL_MMQ_CASE(GGML_TYPE_IQ2_XXS);
extern DECL_MMQ_CASE(GGML_TYPE_IQ2_XS);
extern DECL_MMQ_CASE(GGML_TYPE_IQ2_S);
extern DECL_MMQ_CASE(GGML_TYPE_IQ3_XXS);
extern DECL_MMQ_CASE(GGML_TYPE_IQ3_S);
extern DECL_MMQ_CASE(GGML_TYPE_IQ1_S);
extern DECL_MMQ_CASE(GGML_TYPE_IQ4_NL);
extern DECL_MMQ_CASE(GGML_TYPE_IQ4_XS);
4163
+
4164
+ // -------------------------------------------------------------------------------------------------------------------------
4165
+
4166
// Declaration only; the definition is not part of this section of the file.
// `dst` is the only tensor parameter that is not const-qualified.
// NOTE(review): presumably `ids` is optional and selects rows/experts for an indirect matmul --
// confirm against the definition before relying on it.
void ggml_cuda_mul_mat_q(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor *         src0,
    const ggml_tensor *         src1,
    const ggml_tensor *         ids,
    ggml_tensor *               dst);
4168
+
4169
// Declaration only; the definition is not part of this section of the file.
// `dst` and `dst_dd_i` are the only pointer parameters that are not const-qualified.
// NOTE(review): the `_dd_i` / `_ddf_i` / `_ddq_i` suffixes presumably denote per-device data pointers
// (raw, float and quantized views respectively), and [row_low, row_high) presumably bounds the rows
// of src0 handled by this call -- confirm against the definition.
void ggml_cuda_op_mul_mat_q(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor *         src0,
    const ggml_tensor *         src1,
    ggml_tensor *               dst,
    const char *                src0_dd_i,
    const float *               src1_ddf_i,
    const char *                src1_ddq_i,
    float *                     dst_dd_i,
    const int64_t               row_low,
    const int64_t               row_high,
    const int64_t               src1_ncols,
    const int64_t               src1_padded_row_size,
    cudaStream_t                stream);
4174
+
4175
// Declaration only; the definition is not part of this section of the file.
// NOTE(review): going by the name, returns whether the MMQ path should be taken for the given type;
// `cc` is presumably the same per-device value that mul_mat_q_case reads from
// ggml_cuda_info().devices[id].cc, and `ne11` presumably a dimension of src1 -- confirm.
bool ggml_cuda_should_use_mmq(
    enum ggml_type type,
    int            cc,
    int64_t        ne11,
    int64_t        n_experts);
4176
+