llama-cpp-pydist 0.19.0__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242)
  1. llama_cpp/binaries/{llama-b7488-bin-win-cpu-x64.zip → llama-b7631-bin-win-cpu-x64.zip} +0 -0
  2. llama_cpp_pydist-0.21.0.dist-info/METADATA +4684 -0
  3. {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/RECORD +240 -222
  4. {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/WHEEL +1 -1
  5. vendor_llama_cpp_pydist/llama.cpp/.devops/cuda-new.Dockerfile +95 -0
  6. vendor_llama_cpp_pydist/llama.cpp/.gemini/settings.json +1 -0
  7. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +2 -1
  8. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/011-bug-results.yml +13 -2
  9. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/019-bug-misc.yml +13 -2
  10. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/build.yml +18 -6
  11. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/docker.yml +25 -13
  12. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/release.yml +9 -5
  13. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server.yml +18 -0
  14. vendor_llama_cpp_pydist/llama.cpp/AGENTS.md +81 -0
  15. vendor_llama_cpp_pydist/llama.cpp/CLAUDE.md +1 -0
  16. vendor_llama_cpp_pydist/llama.cpp/CONTRIBUTING.md +34 -5
  17. vendor_llama_cpp_pydist/llama.cpp/ci/run.sh +2 -1
  18. vendor_llama_cpp_pydist/llama.cpp/common/CMakeLists.txt +4 -3
  19. vendor_llama_cpp_pydist/llama.cpp/common/arg.cpp +46 -14
  20. vendor_llama_cpp_pydist/llama.cpp/common/arg.h +1 -0
  21. vendor_llama_cpp_pydist/llama.cpp/common/chat-parser.cpp +11 -0
  22. vendor_llama_cpp_pydist/llama.cpp/common/chat.cpp +36 -7
  23. vendor_llama_cpp_pydist/llama.cpp/common/chat.h +1 -0
  24. vendor_llama_cpp_pydist/llama.cpp/common/common.cpp +42 -23
  25. vendor_llama_cpp_pydist/llama.cpp/common/common.h +7 -2
  26. vendor_llama_cpp_pydist/llama.cpp/common/llguidance.cpp +10 -6
  27. vendor_llama_cpp_pydist/llama.cpp/common/regex-partial.cpp +13 -13
  28. vendor_llama_cpp_pydist/llama.cpp/common/sampling.cpp +58 -14
  29. vendor_llama_cpp_pydist/llama.cpp/common/sampling.h +3 -1
  30. vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py +424 -103
  31. vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf_update.py +5 -0
  32. vendor_llama_cpp_pydist/llama.cpp/docs/backend/CANN.md +4 -0
  33. vendor_llama_cpp_pydist/llama.cpp/docs/backend/OPENCL.md +51 -1
  34. vendor_llama_cpp_pydist/llama.cpp/docs/backend/SYCL.md +1 -1
  35. vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/README.md +5 -5
  36. vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/developer.md +1 -1
  37. vendor_llama_cpp_pydist/llama.cpp/docs/build.md +21 -2
  38. vendor_llama_cpp_pydist/llama.cpp/docs/development/parsing.md +2 -2
  39. vendor_llama_cpp_pydist/llama.cpp/docs/ops/Metal.csv +360 -322
  40. vendor_llama_cpp_pydist/llama.cpp/docs/ops.md +1 -1
  41. vendor_llama_cpp_pydist/llama.cpp/ggml/CMakeLists.txt +13 -1
  42. vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml-backend.h +1 -1
  43. vendor_llama_cpp_pydist/llama.cpp/ggml/src/CMakeLists.txt +23 -9
  44. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-backend.cpp +11 -11
  45. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +303 -19
  46. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +17 -0
  47. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/common.h +153 -9
  48. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +51 -158
  49. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  50. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  51. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
  52. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
  53. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
  54. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
  55. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +66 -1
  56. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cu +48 -27
  57. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cuh +16 -0
  58. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/common.cuh +45 -9
  59. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cpy.cu +117 -103
  60. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cumsum.cu +105 -35
  61. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +3 -1
  62. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2 -2
  63. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +83 -33
  64. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mean.cu +3 -0
  65. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mma.cuh +21 -0
  66. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cu +34 -8
  67. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +168 -13
  68. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cu +151 -0
  69. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cuh +14 -0
  70. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/softmax.cu +203 -6
  71. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cu +96 -0
  72. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cuh +3 -0
  73. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +17 -2
  74. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +6 -1
  75. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  76. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
  77. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  78. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +224 -758
  79. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +316 -164
  80. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +5 -11
  81. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +46 -15
  82. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +9 -3
  83. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +2 -1
  84. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +20 -20
  85. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +1 -0
  86. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
  87. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-impl.h +0 -4
  88. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +57 -0
  89. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +2 -0
  90. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +5 -0
  91. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +20 -0
  92. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +71 -2
  93. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +1 -0
  94. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +73 -6
  95. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +134 -13
  96. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +21 -0
  97. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +13 -0
  98. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +14 -7
  99. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +42 -1
  100. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +742 -315
  101. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  102. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +28 -14
  103. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  104. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  105. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +1 -7
  106. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +2 -0
  107. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +17 -4
  108. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +42 -24
  109. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +11 -0
  110. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +115 -0
  111. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +10 -4
  112. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +29 -18
  113. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +19 -16
  114. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
  115. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +10 -4
  116. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +8 -8
  117. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +11 -4
  118. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +4 -1
  119. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +4 -1
  120. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +4 -1
  121. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +1 -0
  122. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +4 -1
  123. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +57 -22
  124. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +312 -6
  125. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +54 -0
  126. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -2
  127. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  128. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/constants.py +99 -0
  129. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/gguf_writer.py +38 -2
  130. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/tensor_mapping.py +26 -0
  131. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/utility.py +0 -8
  132. vendor_llama_cpp_pydist/llama.cpp/grammars/README.md +3 -0
  133. vendor_llama_cpp_pydist/llama.cpp/include/llama.h +99 -12
  134. vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-cli.sh +9 -9
  135. vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-completion.sh +53 -0
  136. vendor_llama_cpp_pydist/llama.cpp/scripts/sync-ggml.last +1 -1
  137. vendor_llama_cpp_pydist/llama.cpp/src/CMakeLists.txt +4 -0
  138. vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.cpp +12 -3
  139. vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.h +7 -1
  140. vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.cpp +76 -0
  141. vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.h +7 -0
  142. vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.cpp +11 -0
  143. vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.h +1 -0
  144. vendor_llama_cpp_pydist/llama.cpp/src/llama-context.cpp +625 -40
  145. vendor_llama_cpp_pydist/llama.cpp/src/llama-context.h +43 -1
  146. vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.cpp +40 -13
  147. vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.h +2 -0
  148. vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.cpp +166 -2
  149. vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.h +71 -6
  150. vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.h +6 -5
  151. vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.h +1 -1
  152. vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.cpp +11 -4
  153. vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.cpp +23 -0
  154. vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.h +2 -0
  155. vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp +329 -26
  156. vendor_llama_cpp_pydist/llama.cpp/src/llama-model.h +13 -2
  157. vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.cpp +1259 -186
  158. vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.h +19 -7
  159. vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.cpp +101 -33
  160. vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.h +2 -0
  161. vendor_llama_cpp_pydist/llama.cpp/src/llama.cpp +53 -38
  162. vendor_llama_cpp_pydist/llama.cpp/src/models/afmoe.cpp +9 -5
  163. vendor_llama_cpp_pydist/llama.cpp/src/models/bert.cpp +4 -2
  164. vendor_llama_cpp_pydist/llama.cpp/src/models/cogvlm.cpp +5 -3
  165. vendor_llama_cpp_pydist/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
  166. vendor_llama_cpp_pydist/llama.cpp/src/models/deepseek2.cpp +1 -1
  167. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma-embedding.cpp +2 -6
  168. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
  169. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3.cpp +3 -4
  170. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
  171. vendor_llama_cpp_pydist/llama.cpp/src/models/llama-iswa.cpp +6 -2
  172. vendor_llama_cpp_pydist/llama.cpp/src/models/llama.cpp +19 -6
  173. vendor_llama_cpp_pydist/llama.cpp/src/models/maincoder.cpp +117 -0
  174. vendor_llama_cpp_pydist/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
  175. vendor_llama_cpp_pydist/llama.cpp/src/models/models.h +18 -0
  176. vendor_llama_cpp_pydist/llama.cpp/src/models/modern-bert.cpp +116 -0
  177. vendor_llama_cpp_pydist/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
  178. vendor_llama_cpp_pydist/llama.cpp/src/models/plamo3.cpp +128 -0
  179. vendor_llama_cpp_pydist/llama.cpp/src/models/smallthinker.cpp +11 -5
  180. vendor_llama_cpp_pydist/llama.cpp/src/unicode.cpp +23 -14
  181. vendor_llama_cpp_pydist/llama.cpp/tests/CMakeLists.txt +12 -2
  182. vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-ops.cpp +286 -65
  183. vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-sampler.cpp +1237 -0
  184. vendor_llama_cpp_pydist/llama.cpp/tests/test-chat.cpp +29 -3
  185. vendor_llama_cpp_pydist/llama.cpp/tests/test-grammar-llguidance.cpp +3 -0
  186. vendor_llama_cpp_pydist/llama.cpp/tests/test-regex-partial.cpp +14 -14
  187. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-0.cpp +1 -1
  188. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-bpe.cpp +1 -1
  189. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-spm.cpp +1 -1
  190. vendor_llama_cpp_pydist/llama.cpp/tools/batched-bench/batched-bench.cpp +11 -0
  191. vendor_llama_cpp_pydist/llama.cpp/tools/cli/README.md +187 -1
  192. vendor_llama_cpp_pydist/llama.cpp/tools/cli/cli.cpp +1 -3
  193. vendor_llama_cpp_pydist/llama.cpp/tools/completion/README.md +179 -7
  194. vendor_llama_cpp_pydist/llama.cpp/tools/completion/completion.cpp +4 -1
  195. vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/fit-params.cpp +3 -3
  196. vendor_llama_cpp_pydist/llama.cpp/tools/llama-bench/llama-bench.cpp +18 -1
  197. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/CMakeLists.txt +1 -0
  198. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-impl.h +12 -7
  199. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-model.h +3 -1
  200. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp +118 -4
  201. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/models.h +10 -0
  202. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/siglip.cpp +9 -4
  203. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/whisper-enc.cpp +9 -0
  204. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/youtuvl.cpp +179 -0
  205. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.cpp +5 -1
  206. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.h +3 -0
  207. vendor_llama_cpp_pydist/llama.cpp/tools/quantize/quantize.cpp +6 -0
  208. vendor_llama_cpp_pydist/llama.cpp/tools/server/CMakeLists.txt +0 -8
  209. vendor_llama_cpp_pydist/llama.cpp/tools/server/README-dev.md +2 -0
  210. vendor_llama_cpp_pydist/llama.cpp/tools/server/README.md +27 -14
  211. vendor_llama_cpp_pydist/llama.cpp/tools/server/public/index.html.gz +0 -0
  212. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.cpp +22 -24
  213. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.h +2 -3
  214. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.cpp +453 -267
  215. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.h +52 -15
  216. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-http.cpp +16 -10
  217. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.cpp +174 -62
  218. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.h +14 -5
  219. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.cpp +78 -21
  220. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.h +48 -10
  221. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.cpp +36 -11
  222. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.h +28 -35
  223. vendor_llama_cpp_pydist/llama.cpp/tools/server/server.cpp +9 -5
  224. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_chat_completion.py +11 -2
  225. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_sleep.py +39 -0
  226. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/utils.py +3 -0
  227. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte +25 -1
  228. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte +66 -13
  229. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte +5 -0
  230. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/constants/settings-config.ts +3 -0
  231. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts +125 -11
  232. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/chat.ts +15 -8
  233. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts +12 -3
  234. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/settings.svelte.ts +4 -5
  235. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/api.d.ts +5 -0
  236. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/settings.d.ts +2 -1
  237. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/clipboard.ts +1 -4
  238. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/routes/+layout.svelte +1 -1
  239. llama_cpp_pydist-0.19.0.dist-info/METADATA +0 -2506
  240. vendor_llama_cpp_pydist/llama.cpp/.github/copilot-instructions.md +0 -262
  241. {llama_cpp_pydist-0.19.0.dist-info/licenses → llama_cpp_pydist-0.21.0.dist-info}/LICENSE +0 -0
  242. {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/top_level.txt +0 -0
vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -531,7 +531,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
     for (int k0 = 0; k0 < nbatch_fa; k0 += np*T_C_KQ::I) {
 #pragma unroll
         for (int l = 0; l < T_C_KQ::ne; ++l) {
-            if (!oob_check || k0 + T_C_KQ::get_i(l) < k_VKQ_sup) {
+            if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::I + T_C_KQ::get_i(l) < k_VKQ_sup) {
                 KQ_max_new[l % 2] = fmaxf(KQ_max_new[l % 2], KQ_C[k0/(np*T_C_KQ::I)].x[l] + FATTN_KQ_MAX_OFFSET);
             }
         }
@@ -583,7 +583,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
     for (int k0 = 0; k0 < nbatch_fa; k0 += np*T_C_KQ::J) {
 #pragma unroll
         for (int l = 0; l < T_C_KQ::ne; ++l) {
-            if (!oob_check || k0 + T_C_KQ::get_j(l) < k_VKQ_sup) {
+            if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::J + T_C_KQ::get_j(l) < k_VKQ_sup) {
                 // Turing + Volta:
                 KQ_max_new[(l/2) % 2] = fmaxf(KQ_max_new[(l/2) % 2], KQ_C[(k0/(np*T_C_KQ::J))].x[l] + FATTN_KQ_MAX_OFFSET);
             }
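The two hunks above widen the out-of-bounds check to include the per-warp offset inside a warp group: the K index an element actually touches is the batch offset k0 plus (threadIdx.y % np)*T_C_KQ::I (or ::J) plus the in-tile offset, so the bound k_VKQ_sup must be compared against the full sum. A minimal, purely illustrative sketch of that index arithmetic (plain C++, with the kernel's template constants passed in as ordinary parameters):

    // index_check.cpp - illustration only; mirrors the bounds check added above.
    // k0:            current batch offset of the loop
    // warp_in_group: threadIdx.y % np in the kernel
    // tile_stride:   T_C_KQ::I (or ::J) in the kernel
    // in_tile:       T_C_KQ::get_i(l) (or get_j(l)) in the kernel
    static bool in_bounds(int k0, int warp_in_group, int tile_stride, int in_tile, int k_sup) {
        const int global_k = k0 + warp_in_group * tile_stride + in_tile;
        return global_k < k_sup;   // the old check omitted the warp_in_group * tile_stride term
    }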
vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -19,6 +19,7 @@
 #include "ggml-cuda/count-equal.cuh"
 #include "ggml-cuda/cpy.cuh"
 #include "ggml-cuda/cross-entropy-loss.cuh"
+#include "ggml-cuda/cumsum.cuh"
 #include "ggml-cuda/diagmask.cuh"
 #include "ggml-cuda/diag.cuh"
 #include "ggml-cuda/fattn.cuh"
@@ -44,6 +45,7 @@
 #include "ggml-cuda/ssm-scan.cuh"
 #include "ggml-cuda/sum.cuh"
 #include "ggml-cuda/sumrows.cuh"
+#include "ggml-cuda/top-k.cuh"
 #include "ggml-cuda/mean.cuh"
 #include "ggml-cuda/tsembd.cuh"
 #include "ggml-cuda/topk-moe.cuh"
@@ -201,16 +203,6 @@ static ggml_cuda_device_info ggml_cuda_init() {
     GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
 
     int64_t total_vram = 0;
-#ifdef GGML_CUDA_FORCE_MMQ
-    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
-#else
-    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
-#endif // GGML_CUDA_FORCE_MMQ
-#ifdef GGML_CUDA_FORCE_CUBLAS
-    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
-#else
-    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
-#endif // GGML_CUDA_FORCE_CUBLAS
     GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
 
     std::vector<std::pair<int, std::string>> turing_devices_without_mma;
@@ -241,6 +233,14 @@ static ggml_cuda_device_info ggml_cuda_init() {
         info.devices[id].nsm = prop.multiProcessorCount;
         info.devices[id].smpb = prop.sharedMemPerBlock;
         info.devices[id].warp_size = prop.warpSize;
+
+#ifndef GGML_USE_MUSA
+        int supports_coop_launch = 0;
+        CUDA_CHECK(cudaDeviceGetAttribute(&supports_coop_launch, cudaDevAttrCooperativeLaunch, id));
+        info.devices[id].supports_cooperative_launch = !!supports_coop_launch;
+#else
+        info.devices[id].supports_cooperative_launch = false;
+#endif // !(GGML_USE_MUSA)
 #if defined(GGML_USE_HIP)
         info.devices[id].smpbo = prop.sharedMemPerBlock;
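The second hunk above records, per device, whether cooperative kernel launches are available (forced off for MUSA builds). As a rough standalone illustration of the same query, independent of ggml (a minimal sketch only; the helper name and the program around it are made up here), the attribute can be read with the stock CUDA runtime API:

    // coop_launch_probe.cu - minimal sketch of the cudaDevAttrCooperativeLaunch query used above
    #include <cstdio>
    #include <cuda_runtime.h>

    // Returns true if the device supports cudaLaunchCooperativeKernel.
    static bool device_supports_coop_launch(int device) {
        int supported = 0;
        if (cudaDeviceGetAttribute(&supported, cudaDevAttrCooperativeLaunch, device) != cudaSuccess) {
            return false;
        }
        return supported != 0;
    }

    int main() {
        int count = 0;
        if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
            return 1;
        }
        for (int id = 0; id < count; ++id) {
            std::printf("device %d: cooperative launch %s\n", id, device_supports_coop_launch(id) ? "yes" : "no");
        }
        return 0;
    }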
@@ -2211,7 +2211,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
 
         const int cc = ggml_cuda_info().devices[id].cc;
         const int warp_size = ggml_cuda_info().devices[id].warp_size;
-        use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+        use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
         use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
         use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
         any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
@@ -2219,7 +2219,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     } else {
         const int cc = ggml_cuda_info().devices[ctx.device].cc;
         const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
-        use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+        use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
         use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
         use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
         any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
@@ -2287,7 +2287,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
         return;
     }
 
-    if (ggml_cuda_should_use_mmq(src0->type, cc, ne12)) {
+    if (ggml_cuda_should_use_mmq(src0->type, cc, ne12, /*n_experts=*/ne02)) {
         ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst);
         return;
     }
@@ -2687,6 +2687,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_SUM:
             ggml_cuda_op_sum(ctx, dst);
             break;
+        case GGML_OP_CUMSUM:
+            ggml_cuda_op_cumsum(ctx, dst);
+            break;
         case GGML_OP_SUM_ROWS:
             ggml_cuda_op_sum_rows(ctx, dst);
             break;
@@ -2699,6 +2702,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_SSM_SCAN:
             ggml_cuda_op_ssm_scan(ctx, dst);
             break;
+        case GGML_OP_TOP_K:
+            ggml_cuda_op_top_k(ctx, dst);
+            break;
         case GGML_OP_ARGSORT:
             ggml_cuda_op_argsort(ctx, dst);
             break;
@@ -2708,9 +2714,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_CROSS_ENTROPY_LOSS:
            ggml_cuda_cross_entropy_loss(ctx, dst);
            break;
-        case GGML_OP_CUMSUM:
-            ggml_cuda_op_cumsum(ctx, dst);
-            break;
         case GGML_OP_TRI:
             ggml_cuda_op_tri(ctx, dst);
             break;
@@ -3076,8 +3079,11 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
         ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 9 })) {
         ggml_tensor * softmax = cgraph->nodes[node_idx];
         ggml_tensor * weights = cgraph->nodes[node_idx + 9];
+        ggml_tensor * get_rows = cgraph->nodes[node_idx + 4];
+        ggml_tensor * argsort = cgraph->nodes[node_idx + 2];
+        int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];
 
-        if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
+        if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
             return true;
         }
     }
@@ -3085,7 +3091,11 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
     if (is_equal(topk_moe_ops, ops) && ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
         ggml_tensor * softmax = cgraph->nodes[node_idx];
         ggml_tensor * weights = cgraph->nodes[node_idx + 4];
-        if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
+        ggml_tensor * get_rows = cgraph->nodes[node_idx + 4];
+        ggml_tensor * argsort = cgraph->nodes[node_idx + 2];
+        int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];
+
+        if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
             return true;
         }
     }
@@ -3094,8 +3104,11 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
         ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 1, node_idx + 5 })) {
         ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
         ggml_tensor * weights = cgraph->nodes[node_idx + 5];
+        ggml_tensor * get_rows = cgraph->nodes[node_idx + 2];
+        ggml_tensor * argsort = cgraph->nodes[node_idx + 0];
+        int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];
 
-        if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
+        if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
             return true;
         }
     }
@@ -3253,6 +3266,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             should_launch_concurrent_events = should_launch_concurrent_events && event.is_valid();
         }
     }
+
     if (should_launch_concurrent_events) {
         // Restore original node order within each concurrent region to enable fusion within streams
 
@@ -3304,6 +3318,8 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                 cgraph->nodes[start_pos + i] = const_cast<ggml_tensor *>(event.original_order[i]);
             }
         }
+    } else {
+        stream_ctx.concurrent_events.clear();
     }
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -3692,10 +3708,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
     }
 }
 
-static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    ggml_cuda_set_device(cuda_ctx->device);
+static bool ggml_cuda_set_cuda_graph_enabled(ggml_backend_cuda_context * cuda_ctx) {
 
 #ifdef USE_CUDA_GRAPH
     static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
@@ -3706,7 +3719,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
     }
 
     bool use_cuda_graph = true;
-    bool cuda_graph_update_required = false;
 
     if (cuda_ctx->cuda_graph->graph == nullptr) {
         if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
@@ -3727,6 +3739,27 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         use_cuda_graph = false;
     }
 
+    cuda_ctx->cuda_graph->cuda_graphs_enabled = use_cuda_graph;
+#else
+    bool use_cuda_graph = false;
+#endif // USE_CUDA_GRAPH
+
+    return use_cuda_graph;
+}
+
+static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
+
+    ggml_cuda_set_device(cuda_ctx->device);
+
+    bool use_cuda_graph = false;
+    bool cuda_graph_update_required = false;
+
+    // graph_optimize calls set_cuda_graph_enabled, in-case it not called (i.e. graph_compute is directly called)
+    // we call it here instead.
+#ifdef USE_CUDA_GRAPH
+    use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx);
+
     if (use_cuda_graph) {
         cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
 
@@ -3741,11 +3774,13 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
 
             if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
                 cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
+                cuda_ctx->cuda_graph->cuda_graphs_enabled = false;
 #ifndef NDEBUG
                 GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
             }
         }
+#endif // USE_CUDA_GRAPH
 
     if (use_cuda_graph && cuda_graph_update_required) {
         // Start CUDA graph capture
@@ -3757,11 +3792,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
     }
 
-#else
-    bool use_cuda_graph = false;
-    bool cuda_graph_update_required = false;
-#endif // USE_CUDA_GRAPH
-
     bool graph_evaluated_or_captured = false;
 
@@ -3797,8 +3827,10 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
 static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
 
+    const bool use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx);
+
     static bool enable_graph_optimization = [] {
-        const char * env = getenv("GGML_CUDA_GRAPH_OPT");
+        const char * env = getenv("GGML_CUDA_GRAPH_OPT");
         return env != nullptr && atoi(env) == 1;
     }();
 
@@ -3806,12 +3838,13 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
         return;
     }
 
-    GGML_ASSERT(ggml_backend_cuda_get_device_count() == 1 && "compute graph optimization is only supported on single GPU in the CUDA backend");
-    GGML_LOG_DEBUG("Optimizing CUDA graph %p with %d nodes\n", cgraph->nodes, cgraph->n_nodes);
-
     ggml_cuda_stream_context & stream_context = cuda_ctx->stream_context();
     stream_context.reset();
 
+    if (!use_cuda_graph || ggml_backend_cuda_get_device_count() != 1) {
+        return;
+    }
+
     // number of out-degrees for a particular node
     std::unordered_map<const ggml_tensor *, int> fan_out;
     // reverse mapping of node to index in the cgraph
@@ -3872,6 +3905,12 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
             if (count >= min_fan_out && count <= max_fan_out) {
                 const int root_node_idx = node_indices[root_node];
 
+                // only optimize for attn_norm
+                // TODO: make this more generic
+                if (!strstr(root_node->name, "attn_norm")) {
+                    continue;
+                }
+
                 bool is_part_of_event = false;
                 for (const auto & [start, end] : concurrent_node_ranges) {
                     if (root_node_idx >= start && root_node_idx <= end) {
@@ -4600,6 +4639,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
             return true;
         case GGML_OP_SUM:
             return ggml_is_contiguous_rows(op->src[0]);
+        case GGML_OP_TOP_K:
         case GGML_OP_ARGSORT:
 #ifndef GGML_CUDA_USE_CUB
             return op->src[0]->ne[0] <= 1024;
@@ -4775,6 +4815,16 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t
     features.push_back({ "FA_ALL_QUANTS", "1" });
 #endif
 
+    {
+        const auto & info = ggml_cuda_info();
+        for (int id = 0; id < info.device_count; ++id) {
+            if (blackwell_mma_available(info.devices[id].cc)) {
+                features.push_back({ "BLACKWELL_NATIVE_FP4", "1"});
+                break;
+            }
+        }
+    }
+
 #undef _STRINGIFY
 #undef STRINGIFY
 
vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mean.cu
@@ -63,6 +63,9 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
     const int id = ggml_cuda_get_device();
     const int nsm = ggml_cuda_info().devices[id].nsm;
+
+    // Heuristic for block size selection to optimize occupancy.
+    // See discussion in: https://github.com/ggml-org/llama.cpp/pull/15132
     if ((nrows / nsm) < 2) {
         const dim3 block_dims(512, 1, 1);
         reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mma.cuh
@@ -900,6 +900,27 @@ namespace ggml_cuda_mma {
 #endif // AMPERE_MMA_AVAILABLE
     }
 
+    static __device__ __forceinline__ void mma_block_scaled(tile<16, 8, float> & D,
+                                                            const tile<16, 8, int> & A,
+                                                            const tile<8, 8, int> & B,
+                                                            uint32_t a_scale,
+                                                            uint32_t b_scale) {
+#ifdef BLACKWELL_MMA_AVAILABLE
+        const int * Axi = (const int *) A.x;
+        const int * Bxi = (const int *) B.x;
+        float * Dxi = (float *) D.x;
+
+        asm volatile(
+            "mma.sync.aligned.kind::mxf4.block_scale.scale_vec::2X.m16n8k64.row.col.f32.e2m1.e2m1.f32.ue8m0 "
+            "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3}, "
+            "%10, {0, 0}, %11, {0, 0};"
+            : "+f"(Dxi[0]), "+f"(Dxi[1]), "+f"(Dxi[2]), "+f"(Dxi[3])
+            : "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]), "r"(a_scale), "r"(b_scale));
+#else
+        GGML_UNUSED_VARS(D, A, B, a_scale, b_scale);
+#endif // BLACKWELL_MMA_AVAILABLE
+    }
+
     static __device__ __forceinline__ void mma(
         tile<16, 8, float> & D, const tile<16, 8, half2> & A, const tile<8, 8, half2> & B) {
 #ifdef TURING_MMA_AVAILABLE
vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cu
@@ -1,3 +1,4 @@
+#include "common.cuh"
 #include "mmq.cuh"
 #include "quantize.cuh"
 #include "mmid.cuh"
@@ -114,6 +115,9 @@ void ggml_cuda_mul_mat_q(
     const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
         || GGML_CUDA_CC_IS_CDNA(cc);
 
+    // TODO: tighter pool buffer size vs q8 path
+    const bool use_native_mxfp4 = blackwell_mma_available(cc) && src0->type == GGML_TYPE_MXFP4;
+
     if (!ids) {
         const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
             get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq);
@@ -123,12 +127,24 @@ void ggml_cuda_mul_mat_q(
         const int64_t s11 = src1->nb[1] / ts_src1;
         const int64_t s12 = src1->nb[2] / ts_src1;
         const int64_t s13 = src1->nb[3] / ts_src1;
-        quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type,
-            ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream);
+        if (use_native_mxfp4) {
+            static_assert(sizeof(block_fp4_mmq) == 4 * sizeof(block_q8_1));
+            quantize_mmq_mxfp4_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded,
+                                    ne11, ne12, ne13, stream);
+
+        } else {
+            quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded,
+                                   ne11, ne12, ne13, stream);
+        }
         CUDA_CHECK(cudaGetLastError());
     }
 
-    const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
+    // Stride depends on quantization format
+    const int64_t s12 = use_native_mxfp4 ?
+        ne11 * ne10_padded * sizeof(block_fp4_mmq) /
+            (8 * QK_MXFP4 * sizeof(int)) // block_fp4_mmq holds 256 values (8 blocks of 32)
+        :
+        ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
     const int64_t s13 = ne12*s12;
 
     const mmq_args args = {
@@ -175,12 +191,19 @@ void ggml_cuda_mul_mat_q(
         const int64_t s11 = src1->nb[1] / ts_src1;
         const int64_t s12 = src1->nb[2] / ts_src1;
         const int64_t s13 = src1->nb[2] / ts_src1;
-        quantize_mmq_q8_1_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type,
-            ne10, s11, s12, s13, ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
+
+        if (use_native_mxfp4) {
+            quantize_mmq_mxfp4_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
+                                    ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
+        } else {
+            quantize_mmq_q8_1_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
+                                   ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
+        }
         CUDA_CHECK(cudaGetLastError());
     }
 
-    const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
+    const int64_t s12 = use_native_mxfp4 ? ne11 * ne10_padded * sizeof(block_fp4_mmq) / (8 * QK_MXFP4 * sizeof(int)) :
+                                           ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
     const int64_t s13 = ne12*s12;
 
     // Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
@@ -236,7 +259,7 @@ void ggml_cuda_op_mul_mat_q(
     GGML_UNUSED_VARS(src1, dst, src1_ddf_i, src1_padded_row_size);
 }
 
-bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
+bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t n_experts) {
 #ifdef GGML_CUDA_FORCE_CUBLAS
     return false;
 #endif // GGML_CUDA_FORCE_CUBLAS
@@ -297,7 +320,10 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
     if (GGML_CUDA_CC_IS_CDNA3(cc)) {
         return true;
     }
-    if (ne11 <= 128 || type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
+    if (n_experts > 64 || ne11 <= 128) {
+        return true;
+    }
+    if (type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
         return true;
     }
     if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {
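The final hunks change the MMQ dispatch heuristic: ggml_cuda_should_use_mmq now also receives an expert count (ne02 for mul_mat_id, 0 for plain mul_mat), and the quantized MMQ path is taken whenever there are more than 64 experts or at most 128 rows. A hedged standalone reduction of just that branch (the 64/128 thresholds are the ones visible above; the rationale in the comment is an assumption, and everything else in the real function is omitted):

    #include <cstdint>

    // Sketch of the new early-out added above. With many experts each expert sees
    // only a small slice of the batch, so the small-batch MMQ kernels are assumed
    // to beat dequantizing the weights and calling cuBLAS.
    static bool mmq_early_out(int64_t ne11 /*rows of src1*/, int64_t n_experts) {
        return n_experts > 64 || ne11 <= 128;
    }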