llama-cpp-pydist 0.19.0__py3-none-any.whl → 0.20.0__py3-none-any.whl

This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published in that registry.
Files changed (210)
  1. llama_cpp/binaries/{llama-b7488-bin-win-cpu-x64.zip → llama-b7621-bin-win-cpu-x64.zip} +0 -0
  2. llama_cpp_pydist-0.20.0.dist-info/METADATA +4539 -0
  3. {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.20.0.dist-info}/RECORD +208 -193
  4. {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.20.0.dist-info}/WHEEL +1 -1
  5. vendor_llama_cpp_pydist/llama.cpp/.devops/cuda-new.Dockerfile +95 -0
  6. vendor_llama_cpp_pydist/llama.cpp/.gemini/settings.json +1 -0
  7. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +2 -1
  8. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/011-bug-results.yml +13 -2
  9. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/019-bug-misc.yml +13 -2
  10. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/docker.yml +25 -13
  11. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/release.yml +6 -4
  12. vendor_llama_cpp_pydist/llama.cpp/AGENTS.md +81 -0
  13. vendor_llama_cpp_pydist/llama.cpp/CLAUDE.md +1 -0
  14. vendor_llama_cpp_pydist/llama.cpp/CONTRIBUTING.md +34 -5
  15. vendor_llama_cpp_pydist/llama.cpp/common/CMakeLists.txt +4 -3
  16. vendor_llama_cpp_pydist/llama.cpp/common/arg.cpp +39 -14
  17. vendor_llama_cpp_pydist/llama.cpp/common/arg.h +1 -0
  18. vendor_llama_cpp_pydist/llama.cpp/common/chat-parser.cpp +11 -0
  19. vendor_llama_cpp_pydist/llama.cpp/common/chat.cpp +32 -3
  20. vendor_llama_cpp_pydist/llama.cpp/common/chat.h +1 -0
  21. vendor_llama_cpp_pydist/llama.cpp/common/common.cpp +23 -23
  22. vendor_llama_cpp_pydist/llama.cpp/common/common.h +3 -2
  23. vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py +417 -102
  24. vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf_update.py +5 -0
  25. vendor_llama_cpp_pydist/llama.cpp/docs/backend/OPENCL.md +1 -1
  26. vendor_llama_cpp_pydist/llama.cpp/docs/backend/SYCL.md +1 -1
  27. vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/README.md +5 -5
  28. vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/developer.md +1 -1
  29. vendor_llama_cpp_pydist/llama.cpp/docs/build.md +21 -2
  30. vendor_llama_cpp_pydist/llama.cpp/docs/development/parsing.md +2 -2
  31. vendor_llama_cpp_pydist/llama.cpp/docs/ops/Metal.csv +360 -322
  32. vendor_llama_cpp_pydist/llama.cpp/docs/ops.md +1 -1
  33. vendor_llama_cpp_pydist/llama.cpp/ggml/CMakeLists.txt +13 -1
  34. vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml-backend.h +1 -1
  35. vendor_llama_cpp_pydist/llama.cpp/ggml/src/CMakeLists.txt +23 -9
  36. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-backend.cpp +11 -11
  37. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +248 -19
  38. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +3 -0
  39. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/common.h +153 -9
  40. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +21 -172
  41. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  42. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  43. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
  44. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
  45. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
  46. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
  47. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +42 -1
  48. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cu +2 -2
  49. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/common.cuh +36 -0
  50. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cpy.cu +117 -103
  51. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cumsum.cu +69 -33
  52. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +3 -1
  53. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2 -2
  54. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +67 -31
  55. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mean.cu +3 -0
  56. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mma.cuh +21 -0
  57. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cu +34 -8
  58. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +168 -13
  59. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cu +151 -0
  60. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cuh +14 -0
  61. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +17 -2
  62. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +6 -1
  63. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  64. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +224 -758
  65. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +316 -164
  66. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +5 -11
  67. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +46 -15
  68. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +9 -3
  69. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +2 -1
  70. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +20 -20
  71. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +1 -0
  72. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
  73. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-impl.h +0 -4
  74. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +57 -0
  75. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +2 -0
  76. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +5 -0
  77. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +20 -0
  78. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +71 -2
  79. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +1 -0
  80. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +73 -6
  81. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +134 -13
  82. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +21 -0
  83. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +13 -0
  84. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +14 -7
  85. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +42 -1
  86. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +710 -290
  87. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  88. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +28 -14
  89. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  90. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  91. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +1 -7
  92. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +2 -0
  93. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +17 -4
  94. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +42 -24
  95. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +11 -0
  96. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +115 -0
  97. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +10 -4
  98. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +29 -18
  99. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +19 -16
  100. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
  101. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +10 -4
  102. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +11 -4
  103. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +4 -1
  104. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +4 -1
  105. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +4 -1
  106. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +1 -0
  107. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +4 -1
  108. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +50 -20
  109. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +312 -6
  110. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +54 -0
  111. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -2
  112. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  113. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/constants.py +99 -0
  114. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/gguf_writer.py +38 -2
  115. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/tensor_mapping.py +26 -0
  116. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/utility.py +0 -8
  117. vendor_llama_cpp_pydist/llama.cpp/grammars/README.md +3 -0
  118. vendor_llama_cpp_pydist/llama.cpp/include/llama.h +13 -4
  119. vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-cli.sh +9 -9
  120. vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-completion.sh +53 -0
  121. vendor_llama_cpp_pydist/llama.cpp/scripts/sync-ggml.last +1 -1
  122. vendor_llama_cpp_pydist/llama.cpp/src/CMakeLists.txt +4 -0
  123. vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.cpp +12 -3
  124. vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.h +7 -1
  125. vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.cpp +76 -0
  126. vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.h +7 -0
  127. vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.cpp +11 -0
  128. vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.h +1 -0
  129. vendor_llama_cpp_pydist/llama.cpp/src/llama-context.cpp +23 -22
  130. vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.h +4 -3
  131. vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.h +1 -1
  132. vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.cpp +11 -4
  133. vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.cpp +23 -0
  134. vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.h +2 -0
  135. vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp +287 -16
  136. vendor_llama_cpp_pydist/llama.cpp/src/llama-model.h +13 -2
  137. vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.cpp +44 -33
  138. vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.h +3 -0
  139. vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.cpp +101 -33
  140. vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.h +2 -0
  141. vendor_llama_cpp_pydist/llama.cpp/src/llama.cpp +52 -37
  142. vendor_llama_cpp_pydist/llama.cpp/src/models/bert.cpp +4 -2
  143. vendor_llama_cpp_pydist/llama.cpp/src/models/cogvlm.cpp +5 -3
  144. vendor_llama_cpp_pydist/llama.cpp/src/models/deepseek2.cpp +1 -1
  145. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma-embedding.cpp +2 -6
  146. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3.cpp +3 -4
  147. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
  148. vendor_llama_cpp_pydist/llama.cpp/src/models/llama.cpp +19 -6
  149. vendor_llama_cpp_pydist/llama.cpp/src/models/maincoder.cpp +117 -0
  150. vendor_llama_cpp_pydist/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
  151. vendor_llama_cpp_pydist/llama.cpp/src/models/models.h +18 -0
  152. vendor_llama_cpp_pydist/llama.cpp/src/models/modern-bert.cpp +115 -0
  153. vendor_llama_cpp_pydist/llama.cpp/src/models/plamo3.cpp +128 -0
  154. vendor_llama_cpp_pydist/llama.cpp/src/unicode.cpp +23 -14
  155. vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-ops.cpp +193 -61
  156. vendor_llama_cpp_pydist/llama.cpp/tests/test-chat.cpp +29 -3
  157. vendor_llama_cpp_pydist/llama.cpp/tests/test-grammar-llguidance.cpp +3 -0
  158. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-0.cpp +1 -1
  159. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-bpe.cpp +1 -1
  160. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-spm.cpp +1 -1
  161. vendor_llama_cpp_pydist/llama.cpp/tools/batched-bench/batched-bench.cpp +11 -0
  162. vendor_llama_cpp_pydist/llama.cpp/tools/cli/README.md +187 -1
  163. vendor_llama_cpp_pydist/llama.cpp/tools/cli/cli.cpp +1 -3
  164. vendor_llama_cpp_pydist/llama.cpp/tools/completion/README.md +179 -7
  165. vendor_llama_cpp_pydist/llama.cpp/tools/completion/completion.cpp +4 -1
  166. vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/fit-params.cpp +3 -3
  167. vendor_llama_cpp_pydist/llama.cpp/tools/llama-bench/llama-bench.cpp +18 -1
  168. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/CMakeLists.txt +1 -0
  169. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-impl.h +12 -7
  170. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-model.h +3 -1
  171. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp +110 -4
  172. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/models.h +10 -0
  173. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/whisper-enc.cpp +9 -0
  174. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/youtuvl.cpp +179 -0
  175. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.cpp +5 -1
  176. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.h +3 -0
  177. vendor_llama_cpp_pydist/llama.cpp/tools/quantize/quantize.cpp +6 -0
  178. vendor_llama_cpp_pydist/llama.cpp/tools/server/CMakeLists.txt +0 -8
  179. vendor_llama_cpp_pydist/llama.cpp/tools/server/README-dev.md +2 -0
  180. vendor_llama_cpp_pydist/llama.cpp/tools/server/README.md +27 -14
  181. vendor_llama_cpp_pydist/llama.cpp/tools/server/public/index.html.gz +0 -0
  182. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.cpp +10 -17
  183. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.h +2 -3
  184. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.cpp +434 -267
  185. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.h +52 -15
  186. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-http.cpp +16 -10
  187. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.cpp +127 -57
  188. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.h +11 -2
  189. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.cpp +78 -21
  190. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.h +48 -10
  191. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.cpp +33 -11
  192. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.h +28 -35
  193. vendor_llama_cpp_pydist/llama.cpp/tools/server/server.cpp +7 -3
  194. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_chat_completion.py +11 -2
  195. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_sleep.py +39 -0
  196. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/utils.py +3 -0
  197. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte +25 -1
  198. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte +66 -13
  199. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts +125 -11
  200. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/chat.ts +12 -8
  201. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts +10 -3
  202. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/settings.svelte.ts +4 -5
  203. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/api.d.ts +2 -0
  204. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/settings.d.ts +1 -1
  205. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/clipboard.ts +1 -4
  206. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/routes/+layout.svelte +1 -1
  207. llama_cpp_pydist-0.19.0.dist-info/METADATA +0 -2506
  208. vendor_llama_cpp_pydist/llama.cpp/.github/copilot-instructions.md +0 -262
  209. {llama_cpp_pydist-0.19.0.dist-info/licenses → llama_cpp_pydist-0.20.0.dist-info}/LICENSE +0 -0
  210. {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.20.0.dist-info}/top_level.txt +0 -0
vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cumsum.cu
@@ -5,7 +5,7 @@
 #include "ggml.h"
 
 #ifdef GGML_CUDA_USE_CUB
-# include <cub/device/device_scan.cuh>
+# include <cub/block/block_scan.cuh>
 #endif // GGML_CUDA_USE_CUB
 
 template<typename T, int BLOCK_SIZE>
@@ -16,12 +16,14 @@ static __global__ void cumsum_cub_kernel(
         const int64_t s01, const int64_t s02, const int64_t s03,
         const int64_t s1, const int64_t s2, const int64_t s3) {
 #ifdef GGML_CUDA_USE_CUB
-    using BlockScan = cub::BlockScan<T, BLOCK_SIZE>;
+    using BlockScanT = cub::BlockScan<T, BLOCK_SIZE>;
 
-    __shared__ typename BlockScan::TempStorage temp_storage;
-    __shared__ T block_carry; // carry from previous tile
+    __shared__ typename BlockScanT::TempStorage temp_storage;
+    __shared__ T block_carry;
 
     const int tid = threadIdx.x;
+    constexpr int UNROLL_FACTOR = 4;
+    constexpr int TILE_SIZE = BLOCK_SIZE * UNROLL_FACTOR;
 
     const int64_t i1 = blockIdx.x;
     const int64_t i2 = blockIdx.y;
@@ -39,37 +41,47 @@ static __global__ void cumsum_cub_kernel(
     }
     __syncthreads();
 
-    for (int64_t start = 0; start < ne00; start += BLOCK_SIZE) {
-        int64_t idx = start + tid;
-        T x = (idx < ne00) ? src_row[idx] : T(0);
+    for (int64_t start = 0; start < ne00; start += TILE_SIZE) {
+        T items[UNROLL_FACTOR];
+        T thread_sum = T(0);
 
-        T inclusive;
-        T block_total;
-        BlockScan(temp_storage).InclusiveSum(x, inclusive, block_total);
+        #pragma unroll
+        for (int i = 0; i < UNROLL_FACTOR; i++) {
+            int64_t idx = start + tid * UNROLL_FACTOR + i;
+            T val = (idx < ne00) ? src_row[idx] : T(0);
+            thread_sum += val;
+            items[i] = thread_sum;
+        }
 
+        // Block-wide scan on thread sums
+        T thread_prefix;
+        T block_total;
+        BlockScanT(temp_storage).InclusiveSum(thread_sum, thread_prefix, block_total);
         __syncthreads();
 
-        T final_val = inclusive + block_carry;
-
-        // store result
-        if (idx < ne00) {
-            dst_row[idx] = final_val;
+        // Add offset to each item and store
+        T thread_offset = thread_prefix - thread_sum + block_carry;
+        #pragma unroll
+        for (int i = 0; i < UNROLL_FACTOR; i++) {
+            int64_t idx = start + tid * UNROLL_FACTOR + i;
+            if (idx < ne00) {
+                dst_row[idx] = items[i] + thread_offset;
+            }
         }
 
         __syncthreads();
 
+        // Update carry for next tile
        if (tid == 0) {
            block_carry += block_total;
        }
-
-        __syncthreads();
    }
#else
    NO_DEVICE_CODE;
#endif // GGML_CUDA_USE_CUB
}
 
-// Fallback kernel implementation (original)
+// Fallback kernel implementation
 template<typename T>
 static __global__ void cumsum_kernel(
         const T * src, T * dst,
@@ -86,10 +98,10 @@ static __global__ void cumsum_kernel(
     const int warps_per_block = blockDim.x / warp_size;
 
     extern __shared__ float smem[];
-    float * s_vals = smem;
-    float * s_warp_sums = smem + blockDim.x;
-    float * s_carry = smem + blockDim.x + warps_per_block;
-    float * s_chunk_total = s_carry + 1;
+    float * s_vals = smem;
+    float * s_warp_sums = smem + blockDim.x;
+    float * s_carry = smem + blockDim.x + warps_per_block;
+    float * s_chunk_total = s_carry + 1;
 
     // Initialize carry
     if (tid == 0) {
@@ -107,21 +119,39 @@ static __global__ void cumsum_kernel(
     const T * src_row = src + i1 * s01 + i2 * s02 + i3 * s03;
     T * dst_row = dst + i1 * s1 + i2 * s2 + i3 * s3;
 
-    for (int64_t start = 0; start < ne00; start += blockDim.x) {
-        int64_t idx = start + tid;
-        float val = (idx < ne00) ? ggml_cuda_cast<float, T>(src_row[idx]) : 0.0f;
+    // register blocking: process 4 elements per thread to hide latency
+    // and reduce synchronization overhead
+    constexpr int num_unroll = 4;
+    T temp[num_unroll];
+
+    for (int64_t i = 0; i < ne00; i += num_unroll * blockDim.x) {
+        int64_t idx = i + tid * num_unroll;
+
+        // thread local sequential scan
+        temp[0] = (idx < ne00 ? src_row[idx] : T(0));
+        #pragma unroll
+        for (int64_t j = 1; j < num_unroll; j++) {
+            temp[j] = temp[j - 1];
+            if (idx + j < ne00) {
+                temp[j] += src_row[idx + j];
+            } else {
+                temp[j] += 0;
+            }
+        }
 
-        // 1. Warp inclusive scan
+        // last emenent is sum of all values assigned to thread
+        float val = (idx < ne00) ? ggml_cuda_cast<float, T>(temp[num_unroll - 1]) : 0.0f;
+
+        // Warp inclusive scan
         val = warp_prefix_inclusive_sum<T, warp_size>(val);
        s_vals[tid] = val;
 
-        // Store warp total
        if (lane == warp_size - 1) {
            s_warp_sums[warp] = val;
        }
        __syncthreads();
 
-        // 2. Exclusive scan of warp sums (warp 0 only)
+        // Exclusive scan of warp sums (warp 0 only)
        if (warp == 0) {
            float w = (tid < warps_per_block) ? s_warp_sums[tid] : 0.0f;
            float inc = warp_prefix_inclusive_sum<T, warp_size>(w);
@@ -134,18 +164,24 @@ static __global__ void cumsum_kernel(
        }
        __syncthreads();
 
+        // write back results
        float carry = *s_carry;
-        float final_val = s_vals[tid] + s_warp_sums[warp] + carry;
-        if (idx < ne00) {
-            dst_row[idx] = ggml_cuda_cast<T, float>(final_val);
+        // calculate sum offset for this thread
+        float final_val_offset = s_vals[tid] + s_warp_sums[warp] + carry - temp[num_unroll - 1];
+
+        #pragma unroll
+        for (int32_t j = 0; j < num_unroll; j++) {
+            if (idx + j < ne00) {
+                dst_row[idx + j] = temp[j] + ggml_cuda_cast<T, float>(final_val_offset);
+            }
        }
+
        __syncthreads();
 
        // Update carry for next chunk
        if (tid == 0) {
            *s_carry += *s_chunk_total;
        }
-        __syncthreads();
    }
}
 
@@ -177,7 +213,7 @@ static void cumsum_cuda(
     const int warps_per_block = block_size / warp_size;
     const size_t shmem_size = (block_size + warps_per_block + 2) * sizeof(float);
 
-    if (use_cub) {
+    if (use_cub && ne00 >= 1024) {
        cumsum_cub_kernel<T, CUDA_CUMSUM_BLOCK_SIZE><<<grid_dims, CUDA_CUMSUM_BLOCK_SIZE, 0, stream>>>(
            src, dst,
            ne00, ne01, ne02, ne03,
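Both rewritten cumsum kernels rely on the same decomposition: each thread first runs a sequential inclusive scan over its own UNROLL_FACTOR (resp. num_unroll) elements, one block-wide scan of the per-thread totals then gives every thread its offset (its inclusive prefix minus its own total, plus the carry from earlier tiles), and the carry is advanced by the tile total before the next iteration. Below is a minimal CPU sketch of that arithmetic in plain C++, with hypothetical names and the "threads" simulated sequentially; it models the decomposition, not the CUDA code itself.

```cpp
#include <cassert>
#include <cstdio>
#include <vector>

// CPU model of the tiled scan used by both kernels above (hypothetical names).
// "Threads" run sequentially here; on the GPU the per-thread-total scan is done
// with cub::BlockScan (CUB path) or warp shuffles (fallback path).
static std::vector<float> tiled_cumsum(const std::vector<float> & src, int block_size, int unroll) {
    const size_t tile_size = size_t(block_size) * unroll;
    std::vector<float> dst(src.size());
    float carry = 0.0f; // block_carry / *s_carry

    for (size_t start = 0; start < src.size(); start += tile_size) {
        std::vector<float> items(tile_size, 0.0f);       // thread-local inclusive scans
        std::vector<float> thread_sum(block_size, 0.0f); // per-thread totals
        // 1. each thread scans its own `unroll` elements sequentially
        for (int tid = 0; tid < block_size; ++tid) {
            for (int i = 0; i < unroll; ++i) {
                const size_t idx = start + size_t(tid) * unroll + i;
                thread_sum[tid] += idx < src.size() ? src[idx] : 0.0f;
                items[size_t(tid) * unroll + i] = thread_sum[tid];
            }
        }
        // 2. inclusive scan of the per-thread totals (BlockScan::InclusiveSum)
        float running = 0.0f;
        for (int tid = 0; tid < block_size; ++tid) {
            running += thread_sum[tid];
            // 3. offset = everything before this thread's first element, plus the carry
            const float thread_offset = running - thread_sum[tid] + carry;
            for (int i = 0; i < unroll; ++i) {
                const size_t idx = start + size_t(tid) * unroll + i;
                if (idx < src.size()) {
                    dst[idx] = items[size_t(tid) * unroll + i] + thread_offset;
                }
            }
        }
        // 4. advance the carry by the tile total for the next iteration
        carry += running;
    }
    return dst;
}

int main() {
    std::vector<float> x(1000, 1.0f);
    const std::vector<float> y = tiled_cumsum(x, /*block_size=*/256, /*unroll=*/4);
    for (size_t i = 0; i < y.size(); ++i) {
        assert(y[i] == float(i + 1)); // inclusive prefix sum of all-ones
    }
    printf("ok\n");
    return 0;
}
```

The launcher change (`use_cub && ne00 >= 1024`) routes short rows to the fallback kernel, presumably because the CUB tile machinery only pays off for longer scans.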
vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh
@@ -918,7 +918,9 @@ void launch_fattn(
        blocks_num.y = 1;
        blocks_num.z = 1;
 
-        dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + DV) * sizeof(float));
+        if (ntiles_total % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
+            dst_tmp_meta.alloc((size_t(blocks_num.x) * ncols * (2 + DV/2)));
+        }
    } else {
        const int ntiles_KQ = (K->ne[1] + nbatch_fa - 1) / nbatch_fa; // Max. number of parallel blocks limited by tensor size.
 
vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -531,7 +531,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
        for (int k0 = 0; k0 < nbatch_fa; k0 += np*T_C_KQ::I) {
            #pragma unroll
            for (int l = 0; l < T_C_KQ::ne; ++l) {
-                if (!oob_check || k0 + T_C_KQ::get_i(l) < k_VKQ_sup) {
+                if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::I + T_C_KQ::get_i(l) < k_VKQ_sup) {
                    KQ_max_new[l % 2] = fmaxf(KQ_max_new[l % 2], KQ_C[k0/(np*T_C_KQ::I)].x[l] + FATTN_KQ_MAX_OFFSET);
                }
            }
@@ -583,7 +583,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
        for (int k0 = 0; k0 < nbatch_fa; k0 += np*T_C_KQ::J) {
            #pragma unroll
            for (int l = 0; l < T_C_KQ::ne; ++l) {
-                if (!oob_check || k0 + T_C_KQ::get_j(l) < k_VKQ_sup) {
+                if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::J + T_C_KQ::get_j(l) < k_VKQ_sup) {
                    // Turing + Volta:
                    KQ_max_new[(l/2) % 2] = fmaxf(KQ_max_new[(l/2) % 2], KQ_C[(k0/(np*T_C_KQ::J))].x[l] + FATTN_KQ_MAX_OFFSET);
                }
vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -201,16 +201,6 @@ static ggml_cuda_device_info ggml_cuda_init() {
     GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
 
     int64_t total_vram = 0;
-#ifdef GGML_CUDA_FORCE_MMQ
-    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
-#else
-    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
-#endif // GGML_CUDA_FORCE_MMQ
-#ifdef GGML_CUDA_FORCE_CUBLAS
-    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
-#else
-    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
-#endif // GGML_CUDA_FORCE_CUBLAS
     GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
 
     std::vector<std::pair<int, std::string>> turing_devices_without_mma;
@@ -2211,7 +2201,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
 
            const int cc = ggml_cuda_info().devices[id].cc;
            const int warp_size = ggml_cuda_info().devices[id].warp_size;
-            use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+            use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
            use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
            use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
            any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
@@ -2219,7 +2209,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
    } else {
        const int cc = ggml_cuda_info().devices[ctx.device].cc;
        const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
-        use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+        use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
        use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
        use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
        any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
@@ -2287,7 +2277,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
        return;
    }
 
-    if (ggml_cuda_should_use_mmq(src0->type, cc, ne12)) {
+    if (ggml_cuda_should_use_mmq(src0->type, cc, ne12, /*n_experts=*/ne02)) {
        ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst);
        return;
    }
@@ -3076,8 +3066,11 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 9 })) {
        ggml_tensor * softmax = cgraph->nodes[node_idx];
        ggml_tensor * weights = cgraph->nodes[node_idx + 9];
+        ggml_tensor * get_rows = cgraph->nodes[node_idx + 4];
+        ggml_tensor * argsort = cgraph->nodes[node_idx + 2];
+        int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];
 
-        if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
+        if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
            return true;
        }
    }
@@ -3085,7 +3078,11 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
    if (is_equal(topk_moe_ops, ops) && ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
        ggml_tensor * softmax = cgraph->nodes[node_idx];
        ggml_tensor * weights = cgraph->nodes[node_idx + 4];
-        if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
+        ggml_tensor * get_rows = cgraph->nodes[node_idx + 4];
+        ggml_tensor * argsort = cgraph->nodes[node_idx + 2];
+        int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];
+
+        if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
            return true;
        }
    }
@@ -3094,8 +3091,11 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 1, node_idx + 5 })) {
        ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
        ggml_tensor * weights = cgraph->nodes[node_idx + 5];
+        ggml_tensor * get_rows = cgraph->nodes[node_idx + 2];
+        ggml_tensor * argsort = cgraph->nodes[node_idx + 0];
+        int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];
 
-        if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
+        if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
            return true;
        }
    }
@@ -3253,6 +3253,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
            should_launch_concurrent_events = should_launch_concurrent_events && event.is_valid();
        }
    }
+
    if (should_launch_concurrent_events) {
        // Restore original node order within each concurrent region to enable fusion within streams
 
@@ -3304,6 +3305,8 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                cgraph->nodes[start_pos + i] = const_cast<ggml_tensor *>(event.original_order[i]);
            }
        }
+    } else {
+        stream_ctx.concurrent_events.clear();
    }
 
    for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -3692,11 +3695,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
    }
}
 
-static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    ggml_cuda_set_device(cuda_ctx->device);
-
+static bool ggml_cuda_set_cuda_graph_enabled(ggml_backend_cuda_context * cuda_ctx) {
#ifdef USE_CUDA_GRAPH
    static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
 
@@ -3706,7 +3705,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
    }
 
    bool use_cuda_graph = true;
-    bool cuda_graph_update_required = false;
 
    if (cuda_ctx->cuda_graph->graph == nullptr) {
        if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
@@ -3727,6 +3725,29 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
        use_cuda_graph = false;
    }
 
+    cuda_ctx->cuda_graph->cuda_graphs_enabled = use_cuda_graph;
+#else
+    bool use_cuda_graph = false;
+#endif // USE_CUDA_GRAPH
+
+    return use_cuda_graph;
+}
+
+static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
+
+    bool use_cuda_graph = false;
+    bool cuda_graph_update_required = false;
+
+    // graph_optimize calls set_cuda_graph_enabled, in-case it not called (i.e. graph_compute is directly called)
+    // we call it here instead.
+#ifdef USE_CUDA_GRAPH
+    if (!cuda_ctx->cuda_graph) {
+        use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx);
+    } else {
+        use_cuda_graph = cuda_ctx->cuda_graph && cuda_ctx->cuda_graph->cuda_graphs_enabled;
+    }
+
    if (use_cuda_graph) {
        cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
 
@@ -3746,6 +3767,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
#endif
        }
    }
+#endif // USE_CUDA_GRAPH
 
    if (use_cuda_graph && cuda_graph_update_required) {
        // Start CUDA graph capture
@@ -3757,11 +3779,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
        CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
    }
 
-#else
-    bool use_cuda_graph = false;
-    bool cuda_graph_update_required = false;
-#endif // USE_CUDA_GRAPH
-
    bool graph_evaluated_or_captured = false;
 
    evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
@@ -3797,8 +3814,10 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
 
+    const bool use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx);
+
    static bool enable_graph_optimization = [] {
-        const char * env = getenv("GGML_CUDA_GRAPH_OPT");
+        const char * env = getenv("GGML_CUDA_GRAPH_OPT");
        return env != nullptr && atoi(env) == 1;
    }();
 
@@ -3806,12 +3825,13 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
        return;
    }
 
-    GGML_ASSERT(ggml_backend_cuda_get_device_count() == 1 && "compute graph optimization is only supported on single GPU in the CUDA backend");
-    GGML_LOG_DEBUG("Optimizing CUDA graph %p with %d nodes\n", cgraph->nodes, cgraph->n_nodes);
-
    ggml_cuda_stream_context & stream_context = cuda_ctx->stream_context();
    stream_context.reset();
 
+    if (!use_cuda_graph || ggml_backend_cuda_get_device_count() != 1) {
+        return;
+    }
+
    // number of out-degrees for a particular node
    std::unordered_map<const ggml_tensor *, int> fan_out;
    // reverse mapping of node to index in the cgraph
@@ -3872,6 +3892,12 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
        if (count >= min_fan_out && count <= max_fan_out) {
            const int root_node_idx = node_indices[root_node];
 
+            // only optimize for attn_norm
+            // TODO: make this more generic
+            if (!strstr(root_node->name, "attn_norm")) {
+                continue;
+            }
+
            bool is_part_of_event = false;
            for (const auto & [start, end] : concurrent_node_ranges) {
                if (root_node_idx >= start && root_node_idx <= end) {
@@ -4775,6 +4801,16 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t
    features.push_back({ "FA_ALL_QUANTS", "1" });
#endif
 
+    {
+        const auto & info = ggml_cuda_info();
+        for (int id = 0; id < info.device_count; ++id) {
+            if (blackwell_mma_available(info.devices[id].cc)) {
+                features.push_back({ "BLACKWELL_NATIVE_FP4", "1"});
+                break;
+            }
+        }
+    }
+
#undef _STRINGIFY
#undef STRINGIFY
 
vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mean.cu
@@ -63,6 +63,9 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
    const int id = ggml_cuda_get_device();
    const int nsm = ggml_cuda_info().devices[id].nsm;
+
+    // Heuristic for block size selection to optimize occupancy.
+    // See discussion in: https://github.com/ggml-org/llama.cpp/pull/15132
    if ((nrows / nsm) < 2) {
        const dim3 block_dims(512, 1, 1);
        reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mma.cuh
@@ -900,6 +900,27 @@ namespace ggml_cuda_mma {
#endif // AMPERE_MMA_AVAILABLE
    }
 
+    static __device__ __forceinline__ void mma_block_scaled(tile<16, 8, float> & D,
+                                                            const tile<16, 8, int> & A,
+                                                            const tile<8, 8, int> & B,
+                                                            uint32_t a_scale,
+                                                            uint32_t b_scale) {
+#ifdef BLACKWELL_MMA_AVAILABLE
+        const int * Axi = (const int *) A.x;
+        const int * Bxi = (const int *) B.x;
+        float * Dxi = (float *) D.x;
+
+        asm volatile(
+            "mma.sync.aligned.kind::mxf4.block_scale.scale_vec::2X.m16n8k64.row.col.f32.e2m1.e2m1.f32.ue8m0 "
+            "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3}, "
+            "%10, {0, 0}, %11, {0, 0};"
+            : "+f"(Dxi[0]), "+f"(Dxi[1]), "+f"(Dxi[2]), "+f"(Dxi[3])
+            : "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]), "r"(a_scale), "r"(b_scale));
+#else
+        GGML_UNUSED_VARS(D, A, B, a_scale, b_scale);
+#endif // BLACKWELL_MMA_AVAILABLE
+    }
+
    static __device__ __forceinline__ void mma(
            tile<16, 8, float> & D, const tile<16, 8, half2> & A, const tile<8, 8, half2> & B) {
#ifdef TURING_MMA_AVAILABLE
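mma_block_scaled is only compiled when BLACKWELL_MMA_AVAILABLE is defined, and the ggml-cuda.cu hunk earlier advertises that capability as the new BLACKWELL_NATIVE_FP4 backend feature next to flags like FA_ALL_QUANTS. Below is a hedged sketch of how an application could list those flags through the registry API declared in ggml-backend.h; the registry name "CUDA" is an assumption for a standard CUDA build, and dynamically loaded backends may need ggml_backend_load_all() first.

```cpp
#include <cstdio>

#include "ggml-backend.h"

int main() {
    // Look up the CUDA backend registration by name (assumption: the name is "CUDA").
    ggml_backend_reg_t reg = ggml_backend_reg_by_name("CUDA");
    if (reg == nullptr) {
        printf("CUDA backend not available in this build\n");
        return 0;
    }

    // The feature list is exposed as an optional proc address rather than a
    // dedicated API entry point.
    auto get_features = (ggml_backend_get_features_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
    if (get_features == nullptr) {
        printf("backend does not report feature flags\n");
        return 0;
    }

    // The returned array is terminated by an entry with a null name;
    // BLACKWELL_NATIVE_FP4 appears here when a capable device is present.
    for (ggml_backend_feature * f = get_features(reg); f->name != nullptr; ++f) {
        printf("%s = %s\n", f->name, f->value);
    }
    return 0;
}
```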
vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cu
@@ -1,3 +1,4 @@
+#include "common.cuh"
 #include "mmq.cuh"
 #include "quantize.cuh"
 #include "mmid.cuh"
@@ -114,6 +115,9 @@ void ggml_cuda_mul_mat_q(
    const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
        || GGML_CUDA_CC_IS_CDNA(cc);
 
+    // TODO: tighter pool buffer size vs q8 path
+    const bool use_native_mxfp4 = blackwell_mma_available(cc) && src0->type == GGML_TYPE_MXFP4;
+
    if (!ids) {
        const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
            get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq);
@@ -123,12 +127,24 @@
            const int64_t s11 = src1->nb[1] / ts_src1;
            const int64_t s12 = src1->nb[2] / ts_src1;
            const int64_t s13 = src1->nb[3] / ts_src1;
-            quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type,
-                ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream);
+            if (use_native_mxfp4) {
+                static_assert(sizeof(block_fp4_mmq) == 4 * sizeof(block_q8_1));
+                quantize_mmq_mxfp4_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded,
+                                        ne11, ne12, ne13, stream);
+
+            } else {
+                quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded,
+                                       ne11, ne12, ne13, stream);
+            }
            CUDA_CHECK(cudaGetLastError());
        }
 
-        const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
+        // Stride depends on quantization format
+        const int64_t s12 = use_native_mxfp4 ?
+            ne11 * ne10_padded * sizeof(block_fp4_mmq) /
+                (8 * QK_MXFP4 * sizeof(int)) // block_fp4_mmq holds 256 values (8 blocks of 32)
+            :
+            ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
        const int64_t s13 = ne12*s12;
 
        const mmq_args args = {
@@ -175,12 +191,19 @@
            const int64_t s11 = src1->nb[1] / ts_src1;
            const int64_t s12 = src1->nb[2] / ts_src1;
            const int64_t s13 = src1->nb[2] / ts_src1;
-            quantize_mmq_q8_1_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type,
-                ne10, s11, s12, s13, ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
+
+            if (use_native_mxfp4) {
+                quantize_mmq_mxfp4_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
+                                        ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
+            } else {
+                quantize_mmq_q8_1_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
+                                       ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
+            }
            CUDA_CHECK(cudaGetLastError());
        }
 
-        const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
+        const int64_t s12 = use_native_mxfp4 ? ne11 * ne10_padded * sizeof(block_fp4_mmq) / (8 * QK_MXFP4 * sizeof(int)) :
+                                               ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
        const int64_t s13 = ne12*s12;
 
        // Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
@@ -236,7 +259,7 @@
    GGML_UNUSED_VARS(src1, dst, src1_ddf_i, src1_padded_row_size);
}
 
-bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
+bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t n_experts) {
#ifdef GGML_CUDA_FORCE_CUBLAS
    return false;
#endif // GGML_CUDA_FORCE_CUBLAS
@@ -297,7 +320,10 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
    if (GGML_CUDA_CC_IS_CDNA3(cc)) {
        return true;
    }
-    if (ne11 <= 128 || type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
+    if (n_experts > 64 || ne11 <= 128) {
+        return true;
+    }
+    if (type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
        return true;
    }
    if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {
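The two s12 expressions above differ only in block geometry: a block_q8_1 covers 32 quantized values, while a block_fp4_mmq covers 256 (8 MXFP4 blocks of 32) and, per the static_assert above, occupies 4 * sizeof(block_q8_1) bytes. A small standalone sketch of that stride arithmetic follows; the byte sizes are written out as assumptions for illustration rather than taken from the CUDA headers.

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
    // Assumed layout constants (for illustration only):
    // block_q8_1   : 32 int8 values plus a half2 scale -> 36 bytes per 32 values
    // block_fp4_mmq: pinned by the static_assert to 4 * sizeof(block_q8_1)
    //                and holding 256 values (8 MXFP4 blocks of 32)
    const int64_t QK8_1              = 32;
    const int64_t QK_MXFP4           = 32;
    const int64_t size_block_q8_1    = 36;
    const int64_t size_block_fp4_mmq = 4 * size_block_q8_1; // 144
    const int64_t int_size           = sizeof(int);

    // Example shapes for the quantized src1 buffer.
    const int64_t ne10_padded = 4096; // padded row length
    const int64_t ne11        = 8;    // rows per channel

    // Strides in units of int, mirroring the two branches of the diff.
    const int64_t s12_q8  = ne11 * ne10_padded * size_block_q8_1    / (QK8_1 * int_size);
    const int64_t s12_fp4 = ne11 * ne10_padded * size_block_fp4_mmq / (8 * QK_MXFP4 * int_size);

    printf("s12 (q8_1 path)  = %lld ints\n", (long long) s12_q8);
    printf("s12 (mxfp4 path) = %lld ints\n", (long long) s12_fp4);

    // The natively packed MXFP4 slice is half the size of the q8_1-packed slice.
    assert(2 * s12_fp4 == s12_q8);
    return 0;
}
```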