llama-cpp-pydist 0.19.0__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llama_cpp/binaries/{llama-b7488-bin-win-cpu-x64.zip → llama-b7631-bin-win-cpu-x64.zip} +0 -0
- llama_cpp_pydist-0.21.0.dist-info/METADATA +4684 -0
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/RECORD +240 -222
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/WHEEL +1 -1
- vendor_llama_cpp_pydist/llama.cpp/.devops/cuda-new.Dockerfile +95 -0
- vendor_llama_cpp_pydist/llama.cpp/.gemini/settings.json +1 -0
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +2 -1
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/011-bug-results.yml +13 -2
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/019-bug-misc.yml +13 -2
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/build.yml +18 -6
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/docker.yml +25 -13
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/release.yml +9 -5
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server.yml +18 -0
- vendor_llama_cpp_pydist/llama.cpp/AGENTS.md +81 -0
- vendor_llama_cpp_pydist/llama.cpp/CLAUDE.md +1 -0
- vendor_llama_cpp_pydist/llama.cpp/CONTRIBUTING.md +34 -5
- vendor_llama_cpp_pydist/llama.cpp/ci/run.sh +2 -1
- vendor_llama_cpp_pydist/llama.cpp/common/CMakeLists.txt +4 -3
- vendor_llama_cpp_pydist/llama.cpp/common/arg.cpp +46 -14
- vendor_llama_cpp_pydist/llama.cpp/common/arg.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/common/chat-parser.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/common/chat.cpp +36 -7
- vendor_llama_cpp_pydist/llama.cpp/common/chat.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/common/common.cpp +42 -23
- vendor_llama_cpp_pydist/llama.cpp/common/common.h +7 -2
- vendor_llama_cpp_pydist/llama.cpp/common/llguidance.cpp +10 -6
- vendor_llama_cpp_pydist/llama.cpp/common/regex-partial.cpp +13 -13
- vendor_llama_cpp_pydist/llama.cpp/common/sampling.cpp +58 -14
- vendor_llama_cpp_pydist/llama.cpp/common/sampling.h +3 -1
- vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py +424 -103
- vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf_update.py +5 -0
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/CANN.md +4 -0
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/OPENCL.md +51 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/SYCL.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/README.md +5 -5
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/developer.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/build.md +21 -2
- vendor_llama_cpp_pydist/llama.cpp/docs/development/parsing.md +2 -2
- vendor_llama_cpp_pydist/llama.cpp/docs/ops/Metal.csv +360 -322
- vendor_llama_cpp_pydist/llama.cpp/docs/ops.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/CMakeLists.txt +13 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml-backend.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/CMakeLists.txt +23 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-backend.cpp +11 -11
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +303 -19
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +17 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/common.h +153 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +51 -158
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +66 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cu +48 -27
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cuh +16 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/common.cuh +45 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cpy.cu +117 -103
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cumsum.cu +105 -35
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +3 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +83 -33
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mean.cu +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mma.cuh +21 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cu +34 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +168 -13
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cu +151 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cuh +14 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/softmax.cu +203 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cu +96 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cuh +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +17 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +6 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +224 -758
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +316 -164
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +5 -11
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +46 -15
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +9 -3
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +2 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +20 -20
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-impl.h +0 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +57 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +5 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +20 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +71 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +73 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +134 -13
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +21 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +13 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +14 -7
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +42 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +742 -315
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +28 -14
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +1 -7
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +2 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +17 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +42 -24
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +115 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +10 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +29 -18
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +19 -16
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +10 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +8 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +11 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +57 -22
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +312 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +54 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/constants.py +99 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/gguf_writer.py +38 -2
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/tensor_mapping.py +26 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/utility.py +0 -8
- vendor_llama_cpp_pydist/llama.cpp/grammars/README.md +3 -0
- vendor_llama_cpp_pydist/llama.cpp/include/llama.h +99 -12
- vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-cli.sh +9 -9
- vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-completion.sh +53 -0
- vendor_llama_cpp_pydist/llama.cpp/scripts/sync-ggml.last +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/CMakeLists.txt +4 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.cpp +12 -3
- vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.h +7 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.cpp +76 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.h +7 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-context.cpp +625 -40
- vendor_llama_cpp_pydist/llama.cpp/src/llama-context.h +43 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.cpp +40 -13
- vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.cpp +166 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.h +71 -6
- vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.h +6 -5
- vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.cpp +11 -4
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.cpp +23 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp +329 -26
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model.h +13 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.cpp +1259 -186
- vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.h +19 -7
- vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.cpp +101 -33
- vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama.cpp +53 -38
- vendor_llama_cpp_pydist/llama.cpp/src/models/afmoe.cpp +9 -5
- vendor_llama_cpp_pydist/llama.cpp/src/models/bert.cpp +4 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/cogvlm.cpp +5 -3
- vendor_llama_cpp_pydist/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/deepseek2.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma-embedding.cpp +2 -6
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3.cpp +3 -4
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
- vendor_llama_cpp_pydist/llama.cpp/src/models/llama-iswa.cpp +6 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/llama.cpp +19 -6
- vendor_llama_cpp_pydist/llama.cpp/src/models/maincoder.cpp +117 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/models.h +18 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/modern-bert.cpp +116 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/plamo3.cpp +128 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/smallthinker.cpp +11 -5
- vendor_llama_cpp_pydist/llama.cpp/src/unicode.cpp +23 -14
- vendor_llama_cpp_pydist/llama.cpp/tests/CMakeLists.txt +12 -2
- vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-ops.cpp +286 -65
- vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-sampler.cpp +1237 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-chat.cpp +29 -3
- vendor_llama_cpp_pydist/llama.cpp/tests/test-grammar-llguidance.cpp +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-regex-partial.cpp +14 -14
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-0.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-bpe.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-spm.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/batched-bench/batched-bench.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/cli/README.md +187 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/cli/cli.cpp +1 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/completion/README.md +179 -7
- vendor_llama_cpp_pydist/llama.cpp/tools/completion/completion.cpp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/fit-params.cpp +3 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/llama-bench/llama-bench.cpp +18 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/CMakeLists.txt +1 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-impl.h +12 -7
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-model.h +3 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp +118 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/models.h +10 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/siglip.cpp +9 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/whisper-enc.cpp +9 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/youtuvl.cpp +179 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.cpp +5 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.h +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/quantize/quantize.cpp +6 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/CMakeLists.txt +0 -8
- vendor_llama_cpp_pydist/llama.cpp/tools/server/README-dev.md +2 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/README.md +27 -14
- vendor_llama_cpp_pydist/llama.cpp/tools/server/public/index.html.gz +0 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.cpp +22 -24
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.h +2 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.cpp +453 -267
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.h +52 -15
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-http.cpp +16 -10
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.cpp +174 -62
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.h +14 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.cpp +78 -21
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.h +48 -10
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.cpp +36 -11
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.h +28 -35
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server.cpp +9 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_chat_completion.py +11 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_sleep.py +39 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/utils.py +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte +25 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte +66 -13
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte +5 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/constants/settings-config.ts +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts +125 -11
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/chat.ts +15 -8
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts +12 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/settings.svelte.ts +4 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/api.d.ts +5 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/settings.d.ts +2 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/clipboard.ts +1 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/routes/+layout.svelte +1 -1
- llama_cpp_pydist-0.19.0.dist-info/METADATA +0 -2506
- vendor_llama_cpp_pydist/llama.cpp/.github/copilot-instructions.md +0 -262
- {llama_cpp_pydist-0.19.0.dist-info/licenses → llama_cpp_pydist-0.21.0.dist-info}/LICENSE +0 -0
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/top_level.txt +0 -0
@@ -531,7 +531,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
     for (int k0 = 0; k0 < nbatch_fa; k0 += np*T_C_KQ::I) {
 #pragma unroll
         for (int l = 0; l < T_C_KQ::ne; ++l) {
-            if (!oob_check || k0 + T_C_KQ::get_i(l) < k_VKQ_sup) {
+            if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::I + T_C_KQ::get_i(l) < k_VKQ_sup) {
                 KQ_max_new[l % 2] = fmaxf(KQ_max_new[l % 2], KQ_C[k0/(np*T_C_KQ::I)].x[l] + FATTN_KQ_MAX_OFFSET);
             }
         }
@@ -583,7 +583,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
     for (int k0 = 0; k0 < nbatch_fa; k0 += np*T_C_KQ::J) {
 #pragma unroll
         for (int l = 0; l < T_C_KQ::ne; ++l) {
-            if (!oob_check || k0 + T_C_KQ::get_j(l) < k_VKQ_sup) {
+            if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::J + T_C_KQ::get_j(l) < k_VKQ_sup) {
                 // Turing + Volta:
                 KQ_max_new[(l/2) % 2] = fmaxf(KQ_max_new[(l/2) % 2], KQ_C[(k0/(np*T_C_KQ::J))].x[l] + FATTN_KQ_MAX_OFFSET);
             }
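A worked numeric illustration of why the extra `(threadIdx.y % np)*T_C_KQ::I` term matters in the bound check above; all values are invented for the example, not taken from the kernel.

```cpp
// Illustrative check only: np, I, k_VKQ_sup, k0 and get_i are made-up values.
#include <cstdio>

int main() {
    const int np = 4, I = 16;        // warps sharing a tile, rows covered per warp step
    const int k_VKQ_sup = 40;        // upper bound on valid K/V positions
    const int k0 = 0, get_i = 8;     // loop offset and intra-tile row of element l
    const int warp = 3;              // threadIdx.y % np for the last warp in the group

    const bool without_offset = (k0 + get_i) < k_VKQ_sup;              // passes the check
    const bool with_offset    = (k0 + warp * I + get_i) < k_VKQ_sup;   // fails: 56 >= 40
    printf("without offset: %d, with offset: %d\n", without_offset, with_offset);
    return 0;
}
```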
@@ -19,6 +19,7 @@
 #include "ggml-cuda/count-equal.cuh"
 #include "ggml-cuda/cpy.cuh"
 #include "ggml-cuda/cross-entropy-loss.cuh"
+#include "ggml-cuda/cumsum.cuh"
 #include "ggml-cuda/diagmask.cuh"
 #include "ggml-cuda/diag.cuh"
 #include "ggml-cuda/fattn.cuh"
@@ -44,6 +45,7 @@
 #include "ggml-cuda/ssm-scan.cuh"
 #include "ggml-cuda/sum.cuh"
 #include "ggml-cuda/sumrows.cuh"
+#include "ggml-cuda/top-k.cuh"
 #include "ggml-cuda/mean.cuh"
 #include "ggml-cuda/tsembd.cuh"
 #include "ggml-cuda/topk-moe.cuh"
@@ -201,16 +203,6 @@ static ggml_cuda_device_info ggml_cuda_init() {
     GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
 
     int64_t total_vram = 0;
-#ifdef GGML_CUDA_FORCE_MMQ
-    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
-#else
-    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
-#endif // GGML_CUDA_FORCE_MMQ
-#ifdef GGML_CUDA_FORCE_CUBLAS
-    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
-#else
-    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
-#endif // GGML_CUDA_FORCE_CUBLAS
     GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
 
     std::vector<std::pair<int, std::string>> turing_devices_without_mma;
@@ -241,6 +233,14 @@ static ggml_cuda_device_info ggml_cuda_init() {
         info.devices[id].nsm = prop.multiProcessorCount;
         info.devices[id].smpb = prop.sharedMemPerBlock;
         info.devices[id].warp_size = prop.warpSize;
+
+#ifndef GGML_USE_MUSA
+        int supports_coop_launch = 0;
+        CUDA_CHECK(cudaDeviceGetAttribute(&supports_coop_launch, cudaDevAttrCooperativeLaunch, id));
+        info.devices[id].supports_cooperative_launch = !!supports_coop_launch;
+#else
+        info.devices[id].supports_cooperative_launch = false;
+#endif // !(GGML_USE_MUSA)
 #if defined(GGML_USE_HIP)
         info.devices[id].smpbo = prop.sharedMemPerBlock;
 
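For context, the cooperative-launch capability recorded above comes from a standard CUDA runtime attribute query. A minimal standalone sketch, not part of the diff:

```cpp
// Minimal sketch of the device-attribute query used in the hunk above.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int device_count = 0;
    if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
        return 1;
    }
    for (int id = 0; id < device_count; ++id) {
        int supports_coop_launch = 0;
        // Reports whether cudaLaunchCooperativeKernel can be used on this device.
        cudaDeviceGetAttribute(&supports_coop_launch, cudaDevAttrCooperativeLaunch, id);
        printf("device %d: cooperative launch %s\n", id, supports_coop_launch ? "yes" : "no");
    }
    return 0;
}
```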
@@ -2211,7 +2211,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
 
         const int cc = ggml_cuda_info().devices[id].cc;
         const int warp_size = ggml_cuda_info().devices[id].warp_size;
-        use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+        use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
         use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
         use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
         any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
@@ -2219,7 +2219,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     } else {
         const int cc = ggml_cuda_info().devices[ctx.device].cc;
         const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
-        use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+        use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
         use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
         use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
         any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
@@ -2287,7 +2287,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
         return;
     }
 
-    if (ggml_cuda_should_use_mmq(src0->type, cc, ne12)) {
+    if (ggml_cuda_should_use_mmq(src0->type, cc, ne12, /*n_experts=*/ne02)) {
         ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst);
         return;
     }
@@ -2687,6 +2687,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_SUM:
             ggml_cuda_op_sum(ctx, dst);
             break;
+        case GGML_OP_CUMSUM:
+            ggml_cuda_op_cumsum(ctx, dst);
+            break;
         case GGML_OP_SUM_ROWS:
             ggml_cuda_op_sum_rows(ctx, dst);
             break;
@@ -2699,6 +2702,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_SSM_SCAN:
             ggml_cuda_op_ssm_scan(ctx, dst);
             break;
+        case GGML_OP_TOP_K:
+            ggml_cuda_op_top_k(ctx, dst);
+            break;
         case GGML_OP_ARGSORT:
             ggml_cuda_op_argsort(ctx, dst);
             break;
@@ -2708,9 +2714,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_CROSS_ENTROPY_LOSS:
             ggml_cuda_cross_entropy_loss(ctx, dst);
             break;
-        case GGML_OP_CUMSUM:
-            ggml_cuda_op_cumsum(ctx, dst);
-            break;
         case GGML_OP_TRI:
             ggml_cuda_op_tri(ctx, dst);
             break;
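The call-site changes above thread an expert count into the MMQ decision: the dense ggml_cuda_mul_mat paths pass /*n_experts=*/0, while ggml_cuda_mul_mat_id passes ne02. A simplified, hedged sketch of the shape of that heuristic; the real ggml_cuda_should_use_mmq (see the mmq.cu hunk near the end of this diff) also checks compute capability and quantization type:

```cpp
// Simplified sketch only; the thresholds are copied from the mmq.cu hunk further down,
// everything else about the real decision function is omitted.
#include <cstdint>

static bool prefer_mmq_sketch(int64_t ne11 /* batch columns of src1 */, int64_t n_experts) {
    // With many experts each expert sees only a small slice of the batch,
    // and small batches favor the quantized MMQ kernels over cuBLAS.
    return n_experts > 64 || ne11 <= 128;
}
```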
@@ -3076,8 +3079,11 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
         ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 9 })) {
         ggml_tensor * softmax = cgraph->nodes[node_idx];
         ggml_tensor * weights = cgraph->nodes[node_idx + 9];
+        ggml_tensor * get_rows = cgraph->nodes[node_idx + 4];
+        ggml_tensor * argsort = cgraph->nodes[node_idx + 2];
+        int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];
 
-        if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
+        if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
             return true;
         }
     }
@@ -3085,7 +3091,11 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
     if (is_equal(topk_moe_ops, ops) && ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
         ggml_tensor * softmax = cgraph->nodes[node_idx];
         ggml_tensor * weights = cgraph->nodes[node_idx + 4];
-
+        ggml_tensor * get_rows = cgraph->nodes[node_idx + 4];
+        ggml_tensor * argsort = cgraph->nodes[node_idx + 2];
+        int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];
+
+        if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
             return true;
         }
     }
@@ -3094,8 +3104,11 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
         ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 1, node_idx + 5 })) {
         ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
         ggml_tensor * weights = cgraph->nodes[node_idx + 5];
+        ggml_tensor * get_rows = cgraph->nodes[node_idx + 2];
+        ggml_tensor * argsort = cgraph->nodes[node_idx + 0];
+        int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];
 
-        if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
+        if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
             return true;
         }
     }
@@ -3253,6 +3266,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             should_launch_concurrent_events = should_launch_concurrent_events && event.is_valid();
         }
     }
+
     if (should_launch_concurrent_events) {
         // Restore original node order within each concurrent region to enable fusion within streams
 
@@ -3304,6 +3318,8 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                 cgraph->nodes[start_pos + i] = const_cast<ggml_tensor *>(event.original_order[i]);
             }
         }
+    } else {
+        stream_ctx.concurrent_events.clear();
     }
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -3692,10 +3708,7 @@
     }
 }
 
-static
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    ggml_cuda_set_device(cuda_ctx->device);
+static bool ggml_cuda_set_cuda_graph_enabled(ggml_backend_cuda_context * cuda_ctx) {
 
 #ifdef USE_CUDA_GRAPH
     static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
@@ -3706,7 +3719,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
     }
 
     bool use_cuda_graph = true;
-    bool cuda_graph_update_required = false;
 
     if (cuda_ctx->cuda_graph->graph == nullptr) {
         if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
@@ -3727,6 +3739,27 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         use_cuda_graph = false;
     }
 
+    cuda_ctx->cuda_graph->cuda_graphs_enabled = use_cuda_graph;
+#else
+    bool use_cuda_graph = false;
+#endif // USE_CUDA_GRAPH
+
+    return use_cuda_graph;
+}
+
+static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
+
+    ggml_cuda_set_device(cuda_ctx->device);
+
+    bool use_cuda_graph = false;
+    bool cuda_graph_update_required = false;
+
+    // graph_optimize calls set_cuda_graph_enabled, in-case it not called (i.e. graph_compute is directly called)
+    // we call it here instead.
+#ifdef USE_CUDA_GRAPH
+    use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx);
+
     if (use_cuda_graph) {
         cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
 
@@ -3741,11 +3774,13 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
 
         if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
             cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
+            cuda_ctx->cuda_graph->cuda_graphs_enabled = false;
 #ifndef NDEBUG
             GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
         }
     }
+#endif // USE_CUDA_GRAPH
 
     if (use_cuda_graph && cuda_graph_update_required) {
         // Start CUDA graph capture
@@ -3757,11 +3792,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
     }
 
-#else
-    bool use_cuda_graph = false;
-    bool cuda_graph_update_required = false;
-#endif // USE_CUDA_GRAPH
-
     bool graph_evaluated_or_captured = false;
 
     evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
@@ -3797,8 +3827,10 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
 static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
 
+    const bool use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx);
+
     static bool enable_graph_optimization = [] {
-        const char * env
+        const char * env = getenv("GGML_CUDA_GRAPH_OPT");
         return env != nullptr && atoi(env) == 1;
     }();
 
@@ -3806,12 +3838,13 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
         return;
     }
 
-    GGML_ASSERT(ggml_backend_cuda_get_device_count() == 1 && "compute graph optimization is only supported on single GPU in the CUDA backend");
-    GGML_LOG_DEBUG("Optimizing CUDA graph %p with %d nodes\n", cgraph->nodes, cgraph->n_nodes);
-
     ggml_cuda_stream_context & stream_context = cuda_ctx->stream_context();
     stream_context.reset();
 
+    if (!use_cuda_graph || ggml_backend_cuda_get_device_count() != 1) {
+        return;
+    }
+
     // number of out-degrees for a particular node
     std::unordered_map<const ggml_tensor *, int> fan_out;
     // reverse mapping of node to index in the cgraph
@@ -3872,6 +3905,12 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
         if (count >= min_fan_out && count <= max_fan_out) {
             const int root_node_idx = node_indices[root_node];
 
+            // only optimize for attn_norm
+            // TODO: make this more generic
+            if (!strstr(root_node->name, "attn_norm")) {
+                continue;
+            }
+
             bool is_part_of_event = false;
             for (const auto & [start, end] : concurrent_node_ranges) {
                 if (root_node_idx >= start && root_node_idx <= end) {
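The GGML_CUDA_GRAPH_OPT gate above uses a function-local static initialized by a lambda, so the environment variable is read only once per process. A standalone illustration of that idiom, not part of the diff:

```cpp
// Standalone illustration of the one-time env-var gate used in the hunk above.
#include <cstdlib>

static bool graph_opt_enabled() {
    static const bool enabled = [] {
        const char * env = getenv("GGML_CUDA_GRAPH_OPT");
        return env != nullptr && atoi(env) == 1;
    }();
    return enabled;  // getenv/atoi run only on the first call
}
```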
@@ -4600,6 +4639,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
             return true;
         case GGML_OP_SUM:
             return ggml_is_contiguous_rows(op->src[0]);
+        case GGML_OP_TOP_K:
         case GGML_OP_ARGSORT:
 #ifndef GGML_CUDA_USE_CUB
             return op->src[0]->ne[0] <= 1024;
@@ -4775,6 +4815,16 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t
     features.push_back({ "FA_ALL_QUANTS", "1" });
 #endif
 
+    {
+        const auto & info = ggml_cuda_info();
+        for (int id = 0; id < info.device_count; ++id) {
+            if (blackwell_mma_available(info.devices[id].cc)) {
+                features.push_back({ "BLACKWELL_NATIVE_FP4", "1"});
+                break;
+            }
+        }
+    }
+
 #undef _STRINGIFY
 #undef STRINGIFY
 
@@ -63,6 +63,9 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
     const int id = ggml_cuda_get_device();
     const int nsm = ggml_cuda_info().devices[id].nsm;
+
+    // Heuristic for block size selection to optimize occupancy.
+    // See discussion in: https://github.com/ggml-org/llama.cpp/pull/15132
     if ((nrows / nsm) < 2) {
         const dim3 block_dims(512, 1, 1);
         reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
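A worked example of the rows-per-SM condition above, with device numbers invented for the example:

```cpp
// Values are illustrative, not from the diff: 128 rows on a GPU with 80 SMs gives
// 128 / 80 == 1 in integer division, which is < 2, so the wide 512-thread block
// configuration is chosen.
#include <cstdio>

int main() {
    const int nrows = 128;  // illustrative number of rows to reduce
    const int nsm   = 80;   // illustrative number of streaming multiprocessors
    printf("use 512-thread blocks: %s\n", (nrows / nsm) < 2 ? "yes" : "no");
    return 0;
}
```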
@@ -900,6 +900,27 @@ namespace ggml_cuda_mma {
 #endif // AMPERE_MMA_AVAILABLE
     }
 
+    static __device__ __forceinline__ void mma_block_scaled(tile<16, 8, float> & D,
+                                                            const tile<16, 8, int> & A,
+                                                            const tile<8, 8, int> & B,
+                                                            uint32_t a_scale,
+                                                            uint32_t b_scale) {
+#ifdef BLACKWELL_MMA_AVAILABLE
+        const int * Axi = (const int *) A.x;
+        const int * Bxi = (const int *) B.x;
+        float * Dxi = (float *) D.x;
+
+        asm volatile(
+            "mma.sync.aligned.kind::mxf4.block_scale.scale_vec::2X.m16n8k64.row.col.f32.e2m1.e2m1.f32.ue8m0 "
+            "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3}, "
+            "%10, {0, 0}, %11, {0, 0};"
+            : "+f"(Dxi[0]), "+f"(Dxi[1]), "+f"(Dxi[2]), "+f"(Dxi[3])
+            : "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]), "r"(a_scale), "r"(b_scale));
+#else
+        GGML_UNUSED_VARS(D, A, B, a_scale, b_scale);
+#endif // BLACKWELL_MMA_AVAILABLE
+    }
+
     static __device__ __forceinline__ void mma(
         tile<16, 8, float> & D, const tile<16, 8, half2> & A, const tile<8, 8, half2> & B) {
 #ifdef TURING_MMA_AVAILABLE
@@ -1,3 +1,4 @@
+#include "common.cuh"
 #include "mmq.cuh"
 #include "quantize.cuh"
 #include "mmid.cuh"
@@ -114,6 +115,9 @@ void ggml_cuda_mul_mat_q(
     const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
         || GGML_CUDA_CC_IS_CDNA(cc);
 
+    // TODO: tighter pool buffer size vs q8 path
+    const bool use_native_mxfp4 = blackwell_mma_available(cc) && src0->type == GGML_TYPE_MXFP4;
+
     if (!ids) {
         const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
             get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq);
@@ -123,12 +127,24 @@
         const int64_t s11 = src1->nb[1] / ts_src1;
         const int64_t s12 = src1->nb[2] / ts_src1;
         const int64_t s13 = src1->nb[3] / ts_src1;
-
-
+        if (use_native_mxfp4) {
+            static_assert(sizeof(block_fp4_mmq) == 4 * sizeof(block_q8_1));
+            quantize_mmq_mxfp4_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded,
+                                    ne11, ne12, ne13, stream);
+
+        } else {
+            quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded,
+                                   ne11, ne12, ne13, stream);
+        }
         CUDA_CHECK(cudaGetLastError());
     }
 
-
+    // Stride depends on quantization format
+    const int64_t s12 = use_native_mxfp4 ?
+        ne11 * ne10_padded * sizeof(block_fp4_mmq) /
+            (8 * QK_MXFP4 * sizeof(int)) // block_fp4_mmq holds 256 values (8 blocks of 32)
+        :
+        ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
     const int64_t s13 = ne12*s12;
 
     const mmq_args args = {
@@ -175,12 +191,19 @@ void ggml_cuda_mul_mat_q(
         const int64_t s11 = src1->nb[1] / ts_src1;
         const int64_t s12 = src1->nb[2] / ts_src1;
         const int64_t s13 = src1->nb[2] / ts_src1;
-
-
+
+        if (use_native_mxfp4) {
+            quantize_mmq_mxfp4_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
+                                    ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
+        } else {
+            quantize_mmq_q8_1_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
+                                   ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
+        }
         CUDA_CHECK(cudaGetLastError());
     }
 
-    const int64_t s12 = ne11*ne10_padded * sizeof(
+    const int64_t s12 = use_native_mxfp4 ? ne11 * ne10_padded * sizeof(block_fp4_mmq) / (8 * QK_MXFP4 * sizeof(int)) :
+                                           ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
     const int64_t s13 = ne12*s12;
 
     // Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
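The two stride formulas in the hunk above can be sanity-checked against the relation asserted there, sizeof(block_fp4_mmq) == 4 * sizeof(block_q8_1). Assuming the usual ggml block sizes QK8_1 == 32 and QK_MXFP4 == 32 (an assumption, not stated in the diff), the MXFP4 stride works out to exactly half of the q8_1 stride, consistent with 4-bit versus 8-bit storage:

```cpp
// Hedged arithmetic check only; the block and byte sizes below are assumptions made
// for the example, the real constants live in ggml's headers.
#include <cassert>
#include <cstdint>

int main() {
    const int64_t QK8_1 = 32, QK_MXFP4 = 32;     // assumed values per quantization block
    const int64_t size_q8_1   = 36;              // illustrative; only the 1:4 ratio matters
    const int64_t size_fp4mmq = 4 * size_q8_1;   // relation from the static_assert above

    const int64_t ne11 = 64, ne10_padded = 4096; // illustrative tensor shape
    const int64_t s12_q8  = ne11 * ne10_padded * size_q8_1   / (QK8_1 * (int64_t) sizeof(int));
    const int64_t s12_fp4 = ne11 * ne10_padded * size_fp4mmq / (8 * QK_MXFP4 * (int64_t) sizeof(int));

    assert(2 * s12_fp4 == s12_q8);  // MXFP4 packs twice as many values per int of storage
    return 0;
}
```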
@@ -236,7 +259,7 @@ void ggml_cuda_op_mul_mat_q(
     GGML_UNUSED_VARS(src1, dst, src1_ddf_i, src1_padded_row_size);
 }
 
-bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
+bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t n_experts) {
 #ifdef GGML_CUDA_FORCE_CUBLAS
     return false;
 #endif // GGML_CUDA_FORCE_CUBLAS
@@ -297,7 +320,10 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
     if (GGML_CUDA_CC_IS_CDNA3(cc)) {
         return true;
     }
-    if (
+    if (n_experts > 64 || ne11 <= 128) {
+        return true;
+    }
+    if (type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
         return true;
     }
     if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {