llama-cpp-pydist 0.19.0__py3-none-any.whl → 0.20.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llama_cpp/binaries/{llama-b7488-bin-win-cpu-x64.zip → llama-b7621-bin-win-cpu-x64.zip} +0 -0
- llama_cpp_pydist-0.20.0.dist-info/METADATA +4539 -0
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.20.0.dist-info}/RECORD +208 -193
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.20.0.dist-info}/WHEEL +1 -1
- vendor_llama_cpp_pydist/llama.cpp/.devops/cuda-new.Dockerfile +95 -0
- vendor_llama_cpp_pydist/llama.cpp/.gemini/settings.json +1 -0
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +2 -1
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/011-bug-results.yml +13 -2
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/019-bug-misc.yml +13 -2
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/docker.yml +25 -13
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/release.yml +6 -4
- vendor_llama_cpp_pydist/llama.cpp/AGENTS.md +81 -0
- vendor_llama_cpp_pydist/llama.cpp/CLAUDE.md +1 -0
- vendor_llama_cpp_pydist/llama.cpp/CONTRIBUTING.md +34 -5
- vendor_llama_cpp_pydist/llama.cpp/common/CMakeLists.txt +4 -3
- vendor_llama_cpp_pydist/llama.cpp/common/arg.cpp +39 -14
- vendor_llama_cpp_pydist/llama.cpp/common/arg.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/common/chat-parser.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/common/chat.cpp +32 -3
- vendor_llama_cpp_pydist/llama.cpp/common/chat.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/common/common.cpp +23 -23
- vendor_llama_cpp_pydist/llama.cpp/common/common.h +3 -2
- vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py +417 -102
- vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf_update.py +5 -0
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/OPENCL.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/SYCL.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/README.md +5 -5
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/developer.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/build.md +21 -2
- vendor_llama_cpp_pydist/llama.cpp/docs/development/parsing.md +2 -2
- vendor_llama_cpp_pydist/llama.cpp/docs/ops/Metal.csv +360 -322
- vendor_llama_cpp_pydist/llama.cpp/docs/ops.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/CMakeLists.txt +13 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml-backend.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/CMakeLists.txt +23 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-backend.cpp +11 -11
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +248 -19
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/common.h +153 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +21 -172
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +42 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cu +2 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/common.cuh +36 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cpy.cu +117 -103
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cumsum.cu +69 -33
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +3 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +67 -31
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mean.cu +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mma.cuh +21 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cu +34 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +168 -13
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cu +151 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cuh +14 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +17 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +6 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +224 -758
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +316 -164
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +5 -11
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +46 -15
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +9 -3
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +2 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +20 -20
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-impl.h +0 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +57 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +5 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +20 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +71 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +73 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +134 -13
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +21 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +13 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +14 -7
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +42 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +710 -290
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +28 -14
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +1 -7
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +2 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +17 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +42 -24
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +115 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +10 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +29 -18
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +19 -16
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +10 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +11 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +50 -20
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +312 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +54 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/constants.py +99 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/gguf_writer.py +38 -2
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/tensor_mapping.py +26 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/utility.py +0 -8
- vendor_llama_cpp_pydist/llama.cpp/grammars/README.md +3 -0
- vendor_llama_cpp_pydist/llama.cpp/include/llama.h +13 -4
- vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-cli.sh +9 -9
- vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-completion.sh +53 -0
- vendor_llama_cpp_pydist/llama.cpp/scripts/sync-ggml.last +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/CMakeLists.txt +4 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.cpp +12 -3
- vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.h +7 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.cpp +76 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.h +7 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-context.cpp +23 -22
- vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.h +4 -3
- vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.cpp +11 -4
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.cpp +23 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp +287 -16
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model.h +13 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.cpp +44 -33
- vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.h +3 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.cpp +101 -33
- vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama.cpp +52 -37
- vendor_llama_cpp_pydist/llama.cpp/src/models/bert.cpp +4 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/cogvlm.cpp +5 -3
- vendor_llama_cpp_pydist/llama.cpp/src/models/deepseek2.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma-embedding.cpp +2 -6
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3.cpp +3 -4
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
- vendor_llama_cpp_pydist/llama.cpp/src/models/llama.cpp +19 -6
- vendor_llama_cpp_pydist/llama.cpp/src/models/maincoder.cpp +117 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/models.h +18 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/modern-bert.cpp +115 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/plamo3.cpp +128 -0
- vendor_llama_cpp_pydist/llama.cpp/src/unicode.cpp +23 -14
- vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-ops.cpp +193 -61
- vendor_llama_cpp_pydist/llama.cpp/tests/test-chat.cpp +29 -3
- vendor_llama_cpp_pydist/llama.cpp/tests/test-grammar-llguidance.cpp +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-0.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-bpe.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-spm.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/batched-bench/batched-bench.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/cli/README.md +187 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/cli/cli.cpp +1 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/completion/README.md +179 -7
- vendor_llama_cpp_pydist/llama.cpp/tools/completion/completion.cpp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/fit-params.cpp +3 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/llama-bench/llama-bench.cpp +18 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/CMakeLists.txt +1 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-impl.h +12 -7
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-model.h +3 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp +110 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/models.h +10 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/whisper-enc.cpp +9 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/youtuvl.cpp +179 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.cpp +5 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.h +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/quantize/quantize.cpp +6 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/CMakeLists.txt +0 -8
- vendor_llama_cpp_pydist/llama.cpp/tools/server/README-dev.md +2 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/README.md +27 -14
- vendor_llama_cpp_pydist/llama.cpp/tools/server/public/index.html.gz +0 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.cpp +10 -17
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.h +2 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.cpp +434 -267
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.h +52 -15
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-http.cpp +16 -10
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.cpp +127 -57
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.h +11 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.cpp +78 -21
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.h +48 -10
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.cpp +33 -11
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.h +28 -35
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server.cpp +7 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_chat_completion.py +11 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_sleep.py +39 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/utils.py +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte +25 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte +66 -13
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts +125 -11
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/chat.ts +12 -8
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts +10 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/settings.svelte.ts +4 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/api.d.ts +2 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/settings.d.ts +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/clipboard.ts +1 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/routes/+layout.svelte +1 -1
- llama_cpp_pydist-0.19.0.dist-info/METADATA +0 -2506
- vendor_llama_cpp_pydist/llama.cpp/.github/copilot-instructions.md +0 -262
- {llama_cpp_pydist-0.19.0.dist-info/licenses → llama_cpp_pydist-0.20.0.dist-info}/LICENSE +0 -0
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.20.0.dist-info}/top_level.txt +0 -0
vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cumsum.cu

@@ -5,7 +5,7 @@
 #include "ggml.h"

 #ifdef GGML_CUDA_USE_CUB
-# include <cub/
+# include <cub/block/block_scan.cuh>
 #endif // GGML_CUDA_USE_CUB

 template<typename T, int BLOCK_SIZE>

@@ -16,12 +16,14 @@ static __global__ void cumsum_cub_kernel(
     const int64_t s01, const int64_t s02, const int64_t s03,
     const int64_t s1, const int64_t s2, const int64_t s3) {
 #ifdef GGML_CUDA_USE_CUB
-    using
+    using BlockScanT = cub::BlockScan<T, BLOCK_SIZE>;

-    __shared__ typename
-    __shared__ T block_carry;
+    __shared__ typename BlockScanT::TempStorage temp_storage;
+    __shared__ T block_carry;

     const int tid = threadIdx.x;
+    constexpr int UNROLL_FACTOR = 4;
+    constexpr int TILE_SIZE = BLOCK_SIZE * UNROLL_FACTOR;

     const int64_t i1 = blockIdx.x;
     const int64_t i2 = blockIdx.y;

@@ -39,37 +41,47 @@ static __global__ void cumsum_cub_kernel(
     }
     __syncthreads();

-    for (int64_t start = 0; start < ne00; start +=
-
-    T
+    for (int64_t start = 0; start < ne00; start += TILE_SIZE) {
+        T items[UNROLL_FACTOR];
+        T thread_sum = T(0);

-
-
-
+#pragma unroll
+        for (int i = 0; i < UNROLL_FACTOR; i++) {
+            int64_t idx = start + tid * UNROLL_FACTOR + i;
+            T val = (idx < ne00) ? src_row[idx] : T(0);
+            thread_sum += val;
+            items[i] = thread_sum;
+        }

+        // Block-wide scan on thread sums
+        T thread_prefix;
+        T block_total;
+        BlockScanT(temp_storage).InclusiveSum(thread_sum, thread_prefix, block_total);
         __syncthreads();

-
-
-
-
-
+        // Add offset to each item and store
+        T thread_offset = thread_prefix - thread_sum + block_carry;
+#pragma unroll
+        for (int i = 0; i < UNROLL_FACTOR; i++) {
+            int64_t idx = start + tid * UNROLL_FACTOR + i;
+            if (idx < ne00) {
+                dst_row[idx] = items[i] + thread_offset;
+            }
         }

         __syncthreads();

+        // Update carry for next tile
         if (tid == 0) {
             block_carry += block_total;
         }
-
-        __syncthreads();
     }
 #else
     NO_DEVICE_CODE;
 #endif // GGML_CUDA_USE_CUB
 }

-// Fallback kernel implementation
+// Fallback kernel implementation
 template<typename T>
 static __global__ void cumsum_kernel(
     const T * src, T * dst,

@@ -86,10 +98,10 @@ static __global__ void cumsum_kernel(
     const int warps_per_block = blockDim.x / warp_size;

     extern __shared__ float smem[];
-    float *
-    float *
-    float *
-    float *
+    float * s_vals = smem;
+    float * s_warp_sums = smem + blockDim.x;
+    float * s_carry = smem + blockDim.x + warps_per_block;
+    float * s_chunk_total = s_carry + 1;

     // Initialize carry
     if (tid == 0) {

@@ -107,21 +119,39 @@ static __global__ void cumsum_kernel(
     const T * src_row = src + i1 * s01 + i2 * s02 + i3 * s03;
     T * dst_row = dst + i1 * s1 + i2 * s2 + i3 * s3;

-
-
-
+    // register blocking: process 4 elements per thread to hide latency
+    // and reduce synchronization overhead
+    constexpr int num_unroll = 4;
+    T temp[num_unroll];
+
+    for (int64_t i = 0; i < ne00; i += num_unroll * blockDim.x) {
+        int64_t idx = i + tid * num_unroll;
+
+        // thread local sequential scan
+        temp[0] = (idx < ne00 ? src_row[idx] : T(0));
+#pragma unroll
+        for (int64_t j = 1; j < num_unroll; j++) {
+            temp[j] = temp[j - 1];
+            if (idx + j < ne00) {
+                temp[j] += src_row[idx + j];
+            } else {
+                temp[j] += 0;
+            }
+        }

-    //
+        // last emenent is sum of all values assigned to thread
+        float val = (idx < ne00) ? ggml_cuda_cast<float, T>(temp[num_unroll - 1]) : 0.0f;
+
+        // Warp inclusive scan
         val = warp_prefix_inclusive_sum<T, warp_size>(val);
         s_vals[tid] = val;

-        // Store warp total
         if (lane == warp_size - 1) {
             s_warp_sums[warp] = val;
         }
         __syncthreads();

-        //
+        // Exclusive scan of warp sums (warp 0 only)
         if (warp == 0) {
             float w = (tid < warps_per_block) ? s_warp_sums[tid] : 0.0f;
             float inc = warp_prefix_inclusive_sum<T, warp_size>(w);

@@ -134,18 +164,24 @@ static __global__ void cumsum_kernel(
         }
         __syncthreads();

+        // write back results
         float carry = *s_carry;
-
-
-
+        // calculate sum offset for this thread
+        float final_val_offset = s_vals[tid] + s_warp_sums[warp] + carry - temp[num_unroll - 1];
+
+#pragma unroll
+        for (int32_t j = 0; j < num_unroll; j++) {
+            if (idx + j < ne00) {
+                dst_row[idx + j] = temp[j] + ggml_cuda_cast<T, float>(final_val_offset);
+            }
         }
+
         __syncthreads();

         // Update carry for next chunk
         if (tid == 0) {
             *s_carry += *s_chunk_total;
         }
-        __syncthreads();
     }
 }

@@ -177,7 +213,7 @@ static void cumsum_cuda(
     const int warps_per_block = block_size / warp_size;
     const size_t shmem_size = (block_size + warps_per_block + 2) * sizeof(float);

-    if (use_cub) {
+    if (use_cub && ne00 >= 1024) {
         cumsum_cub_kernel<T, CUDA_CUMSUM_BLOCK_SIZE><<<grid_dims, CUDA_CUMSUM_BLOCK_SIZE, 0, stream>>>(
             src, dst,
             ne00, ne01, ne02, ne03,
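The CUB path above scans each row in tiles of BLOCK_SIZE * UNROLL_FACTOR elements and chains the tiles together through a shared running carry. A minimal standalone sketch of that carry pattern around cub::BlockScan follows; it is not the vendored kernel: the float element type, one block per row, and the absence of per-thread unrolling are simplifying assumptions.

```cuda
#include <cub/block/block_scan.cuh>

// One block scans one row of n floats; each BLOCK_SIZE-wide tile is scanned with
// cub::BlockScan and shifted by the running total ("carry") of all earlier tiles.
template <int BLOCK_SIZE>
__global__ void inclusive_scan_row(const float * src, float * dst, int n) {
    using BlockScanT = cub::BlockScan<float, BLOCK_SIZE>;
    __shared__ typename BlockScanT::TempStorage temp_storage;
    __shared__ float carry;  // sum of every tile processed so far

    if (threadIdx.x == 0) {
        carry = 0.0f;
    }
    __syncthreads();

    for (int start = 0; start < n; start += BLOCK_SIZE) {
        const int   idx = start + threadIdx.x;
        const float v   = idx < n ? src[idx] : 0.0f;

        float prefix;      // inclusive prefix of v within this tile
        float tile_total;  // total of the whole tile, identical on every thread
        BlockScanT(temp_storage).InclusiveSum(v, prefix, tile_total);
        __syncthreads();

        if (idx < n) {
            dst[idx] = prefix + carry;  // shift by the carry of earlier tiles
        }
        __syncthreads();                // all reads of carry finish before it changes

        if (threadIdx.x == 0) {
            carry += tile_total;        // feed the next tile
        }
        __syncthreads();
    }
}
```

Launched as, say, inclusive_scan_row<256><<<1, 256>>>(src, dst, n) for a single row. The vendored kernel additionally lets each thread accumulate UNROLL_FACTOR consecutive elements before the block scan, which is what the items[] and thread_offset bookkeeping in the diff implements.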
vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh

@@ -918,7 +918,9 @@ void launch_fattn(
         blocks_num.y = 1;
         blocks_num.z = 1;

-
+        if (ntiles_total % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
+            dst_tmp_meta.alloc((size_t(blocks_num.x) * ncols * (2 + DV/2)));
+        }
     } else {
         const int ntiles_KQ = (K->ne[1] + nbatch_fa - 1) / nbatch_fa; // Max. number of parallel blocks limited by tensor size.

vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh

@@ -531,7 +531,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
     for (int k0 = 0; k0 < nbatch_fa; k0 += np*T_C_KQ::I) {
 #pragma unroll
         for (int l = 0; l < T_C_KQ::ne; ++l) {
-            if (!oob_check || k0 + T_C_KQ::get_i(l) < k_VKQ_sup) {
+            if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::I + T_C_KQ::get_i(l) < k_VKQ_sup) {
                 KQ_max_new[l % 2] = fmaxf(KQ_max_new[l % 2], KQ_C[k0/(np*T_C_KQ::I)].x[l] + FATTN_KQ_MAX_OFFSET);
             }
         }

@@ -583,7 +583,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
     for (int k0 = 0; k0 < nbatch_fa; k0 += np*T_C_KQ::J) {
 #pragma unroll
         for (int l = 0; l < T_C_KQ::ne; ++l) {
-            if (!oob_check || k0 + T_C_KQ::get_j(l) < k_VKQ_sup) {
+            if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::J + T_C_KQ::get_j(l) < k_VKQ_sup) {
                 // Turing + Volta:
                 KQ_max_new[(l/2) % 2] = fmaxf(KQ_max_new[(l/2) % 2], KQ_C[(k0/(np*T_C_KQ::J))].x[l] + FATTN_KQ_MAX_OFFSET);
             }
vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu

@@ -201,16 +201,6 @@ static ggml_cuda_device_info ggml_cuda_init() {
     GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);

     int64_t total_vram = 0;
-#ifdef GGML_CUDA_FORCE_MMQ
-    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
-#else
-    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
-#endif // GGML_CUDA_FORCE_MMQ
-#ifdef GGML_CUDA_FORCE_CUBLAS
-    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
-#else
-    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
-#endif // GGML_CUDA_FORCE_CUBLAS
     GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);

     std::vector<std::pair<int, std::string>> turing_devices_without_mma;

@@ -2211,7 +2201,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor

         const int cc = ggml_cuda_info().devices[id].cc;
         const int warp_size = ggml_cuda_info().devices[id].warp_size;
-        use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+        use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
         use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
         use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
         any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);

@@ -2219,7 +2209,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     } else {
         const int cc = ggml_cuda_info().devices[ctx.device].cc;
         const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
-        use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+        use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
         use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
         use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
         any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);

@@ -2287,7 +2277,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
         return;
     }

-    if (ggml_cuda_should_use_mmq(src0->type, cc, ne12)) {
+    if (ggml_cuda_should_use_mmq(src0->type, cc, ne12, /*n_experts=*/ne02)) {
         ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst);
         return;
     }

@@ -3076,8 +3066,11 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
         ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 9 })) {
         ggml_tensor * softmax = cgraph->nodes[node_idx];
         ggml_tensor * weights = cgraph->nodes[node_idx + 9];
+        ggml_tensor * get_rows = cgraph->nodes[node_idx + 4];
+        ggml_tensor * argsort = cgraph->nodes[node_idx + 2];
+        int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];

-        if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
+        if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
             return true;
         }
     }

@@ -3085,7 +3078,11 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
     if (is_equal(topk_moe_ops, ops) && ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
         ggml_tensor * softmax = cgraph->nodes[node_idx];
         ggml_tensor * weights = cgraph->nodes[node_idx + 4];
-
+        ggml_tensor * get_rows = cgraph->nodes[node_idx + 4];
+        ggml_tensor * argsort = cgraph->nodes[node_idx + 2];
+        int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];
+
+        if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
             return true;
         }
     }

@@ -3094,8 +3091,11 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
         ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 1, node_idx + 5 })) {
         ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
         ggml_tensor * weights = cgraph->nodes[node_idx + 5];
+        ggml_tensor * get_rows = cgraph->nodes[node_idx + 2];
+        ggml_tensor * argsort = cgraph->nodes[node_idx + 0];
+        int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];

-        if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
+        if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
             return true;
         }
     }

@@ -3253,6 +3253,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             should_launch_concurrent_events = should_launch_concurrent_events && event.is_valid();
         }
     }
+
     if (should_launch_concurrent_events) {
         // Restore original node order within each concurrent region to enable fusion within streams

@@ -3304,6 +3305,8 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                 cgraph->nodes[start_pos + i] = const_cast<ggml_tensor *>(event.original_order[i]);
             }
         }
+    } else {
+        stream_ctx.concurrent_events.clear();
     }

     for (int i = 0; i < cgraph->n_nodes; i++) {

@@ -3692,11 +3695,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
     }
 }

-static
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    ggml_cuda_set_device(cuda_ctx->device);
-
+static bool ggml_cuda_set_cuda_graph_enabled(ggml_backend_cuda_context * cuda_ctx) {
 #ifdef USE_CUDA_GRAPH
     static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);

@@ -3706,7 +3705,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
     }

     bool use_cuda_graph = true;
-    bool cuda_graph_update_required = false;

     if (cuda_ctx->cuda_graph->graph == nullptr) {
         if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {

@@ -3727,6 +3725,29 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         use_cuda_graph = false;
     }

+    cuda_ctx->cuda_graph->cuda_graphs_enabled = use_cuda_graph;
+#else
+    bool use_cuda_graph = false;
+#endif // USE_CUDA_GRAPH
+
+    return use_cuda_graph;
+}
+
+static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
+
+    bool use_cuda_graph = false;
+    bool cuda_graph_update_required = false;
+
+    // graph_optimize calls set_cuda_graph_enabled, in-case it not called (i.e. graph_compute is directly called)
+    // we call it here instead.
+#ifdef USE_CUDA_GRAPH
+    if (!cuda_ctx->cuda_graph) {
+        use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx);
+    } else {
+        use_cuda_graph = cuda_ctx->cuda_graph && cuda_ctx->cuda_graph->cuda_graphs_enabled;
+    }
+
     if (use_cuda_graph) {
         cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);

@@ -3746,6 +3767,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
 #endif
         }
     }
+#endif // USE_CUDA_GRAPH

     if (use_cuda_graph && cuda_graph_update_required) {
         // Start CUDA graph capture

@@ -3757,11 +3779,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
     }

-#else
-    bool use_cuda_graph = false;
-    bool cuda_graph_update_required = false;
-#endif // USE_CUDA_GRAPH
-
     bool graph_evaluated_or_captured = false;

     evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);

@@ -3797,8 +3814,10 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
 static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;

+    const bool use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx);
+
     static bool enable_graph_optimization = [] {
-        const char * env
+        const char * env = getenv("GGML_CUDA_GRAPH_OPT");
         return env != nullptr && atoi(env) == 1;
     }();

@@ -3806,12 +3825,13 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
         return;
     }

-    GGML_ASSERT(ggml_backend_cuda_get_device_count() == 1 && "compute graph optimization is only supported on single GPU in the CUDA backend");
-    GGML_LOG_DEBUG("Optimizing CUDA graph %p with %d nodes\n", cgraph->nodes, cgraph->n_nodes);
-
     ggml_cuda_stream_context & stream_context = cuda_ctx->stream_context();
     stream_context.reset();

+    if (!use_cuda_graph || ggml_backend_cuda_get_device_count() != 1) {
+        return;
+    }
+
     // number of out-degrees for a particular node
     std::unordered_map<const ggml_tensor *, int> fan_out;
     // reverse mapping of node to index in the cgraph

@@ -3872,6 +3892,12 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
         if (count >= min_fan_out && count <= max_fan_out) {
             const int root_node_idx = node_indices[root_node];

+            // only optimize for attn_norm
+            // TODO: make this more generic
+            if (!strstr(root_node->name, "attn_norm")) {
+                continue;
+            }
+
             bool is_part_of_event = false;
             for (const auto & [start, end] : concurrent_node_ranges) {
                 if (root_node_idx >= start && root_node_idx <= end) {

@@ -4775,6 +4801,16 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t
     features.push_back({ "FA_ALL_QUANTS", "1" });
 #endif

+    {
+        const auto & info = ggml_cuda_info();
+        for (int id = 0; id < info.device_count; ++id) {
+            if (blackwell_mma_available(info.devices[id].cc)) {
+                features.push_back({ "BLACKWELL_NATIVE_FP4", "1"});
+                break;
+            }
+        }
+    }
+
 #undef _STRINGIFY
 #undef STRINGIFY

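The CUDA-graph hunks above split the enable/disable decision out of graph_compute: ggml_cuda_set_cuda_graph_enabled computes it once (normally from graph_optimize), caches it on the context, and graph_compute only recomputes it when graph_optimize never ran. A minimal sketch of that decide-once pattern, using assumed names rather than the vendored types and keeping only the environment-variable opt-out from the diff:

```cuda
#include <cstdlib>

struct cuda_graph_state {
    bool decided = false;  // has the decision been made yet?
    bool enabled = false;  // cached answer
};

// Decide once whether CUDA graphs may be used; only the GGML_CUDA_DISABLE_GRAPHS
// opt-out from the diff is modelled here, the other conditions are omitted.
static bool set_cuda_graph_enabled(cuda_graph_state & st) {
    const bool use_graphs = std::getenv("GGML_CUDA_DISABLE_GRAPHS") == nullptr;
    st.enabled = use_graphs;
    st.decided = true;
    return use_graphs;
}

// graph_compute path: reuse the cached decision, or make it now if graph_optimize
// was never called (e.g. graph_compute invoked directly).
static bool graph_compute_uses_graph(cuda_graph_state & st) {
    return st.decided ? st.enabled : set_cuda_graph_enabled(st);
}
```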
vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mean.cu

@@ -63,6 +63,9 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

     const int id = ggml_cuda_get_device();
     const int nsm = ggml_cuda_info().devices[id].nsm;
+
+    // Heuristic for block size selection to optimize occupancy.
+    // See discussion in: https://github.com/ggml-org/llama.cpp/pull/15132
     if ((nrows / nsm) < 2) {
         const dim3 block_dims(512, 1, 1);
         reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mma.cuh

@@ -900,6 +900,27 @@ namespace ggml_cuda_mma {
 #endif // AMPERE_MMA_AVAILABLE
     }

+    static __device__ __forceinline__ void mma_block_scaled(tile<16, 8, float> & D,
+            const tile<16, 8, int> & A,
+            const tile<8, 8, int> & B,
+            uint32_t a_scale,
+            uint32_t b_scale) {
+#ifdef BLACKWELL_MMA_AVAILABLE
+        const int * Axi = (const int *) A.x;
+        const int * Bxi = (const int *) B.x;
+        float * Dxi = (float *) D.x;
+
+        asm volatile(
+            "mma.sync.aligned.kind::mxf4.block_scale.scale_vec::2X.m16n8k64.row.col.f32.e2m1.e2m1.f32.ue8m0 "
+            "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3}, "
+            "%10, {0, 0}, %11, {0, 0};"
+            : "+f"(Dxi[0]), "+f"(Dxi[1]), "+f"(Dxi[2]), "+f"(Dxi[3])
+            : "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]), "r"(a_scale), "r"(b_scale));
+#else
+        GGML_UNUSED_VARS(D, A, B, a_scale, b_scale);
+#endif // BLACKWELL_MMA_AVAILABLE
+    }
+
     static __device__ __forceinline__ void mma(
         tile<16, 8, float> & D, const tile<16, 8, half2> & A, const tile<8, 8, half2> & B) {
 #ifdef TURING_MMA_AVAILABLE
vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cu

@@ -1,3 +1,4 @@
+#include "common.cuh"
 #include "mmq.cuh"
 #include "quantize.cuh"
 #include "mmid.cuh"

@@ -114,6 +115,9 @@ void ggml_cuda_mul_mat_q(
     const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
         || GGML_CUDA_CC_IS_CDNA(cc);

+    // TODO: tighter pool buffer size vs q8 path
+    const bool use_native_mxfp4 = blackwell_mma_available(cc) && src0->type == GGML_TYPE_MXFP4;
+
     if (!ids) {
         const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
             get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq);

@@ -123,12 +127,24 @@ void ggml_cuda_mul_mat_q(
         const int64_t s11 = src1->nb[1] / ts_src1;
         const int64_t s12 = src1->nb[2] / ts_src1;
         const int64_t s13 = src1->nb[3] / ts_src1;
-
-
+        if (use_native_mxfp4) {
+            static_assert(sizeof(block_fp4_mmq) == 4 * sizeof(block_q8_1));
+            quantize_mmq_mxfp4_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded,
+                ne11, ne12, ne13, stream);
+
+        } else {
+            quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded,
+                ne11, ne12, ne13, stream);
+        }
         CUDA_CHECK(cudaGetLastError());
     }

-
+    // Stride depends on quantization format
+    const int64_t s12 = use_native_mxfp4 ?
+        ne11 * ne10_padded * sizeof(block_fp4_mmq) /
+            (8 * QK_MXFP4 * sizeof(int)) // block_fp4_mmq holds 256 values (8 blocks of 32)
+        :
+        ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
     const int64_t s13 = ne12*s12;

     const mmq_args args = {

@@ -175,12 +191,19 @@ void ggml_cuda_mul_mat_q(
         const int64_t s11 = src1->nb[1] / ts_src1;
         const int64_t s12 = src1->nb[2] / ts_src1;
         const int64_t s13 = src1->nb[2] / ts_src1;
-
-
+
+        if (use_native_mxfp4) {
+            quantize_mmq_mxfp4_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
+                ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
+        } else {
+            quantize_mmq_q8_1_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
+                ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
+        }
         CUDA_CHECK(cudaGetLastError());
     }

-    const int64_t s12 = ne11*ne10_padded * sizeof(
+    const int64_t s12 = use_native_mxfp4 ? ne11 * ne10_padded * sizeof(block_fp4_mmq) / (8 * QK_MXFP4 * sizeof(int)) :
+        ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
     const int64_t s13 = ne12*s12;

     // Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
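The s12 stride above is the layout of the quantized src1 staging buffer in units of int, so it changes with the packing format. A quick standalone check of that arithmetic with made-up shapes; the 36-byte size of block_q8_1 (two half values plus 32 int8 quants) is an assumption, while the 4x size relation and the 256-values-per-block packing come from the static_assert and comment in the hunk above:

```cuda
#include <cstdio>

int main() {
    const long long sizeof_block_q8_1   = 36;                     // assumed: 2 halves + 32 quants
    const long long sizeof_block_fp4mmq = 4 * sizeof_block_q8_1;  // per the static_assert above
    const long long QK8_1 = 32, QK_MXFP4 = 32, sizeof_int = 4;

    const long long ne11 = 8, ne10_padded = 1024;                 // made-up shapes
    const long long s12_q8  = ne11 * ne10_padded * sizeof_block_q8_1   / (QK8_1 * sizeof_int);
    const long long s12_fp4 = ne11 * ne10_padded * sizeof_block_fp4mmq / (8 * QK_MXFP4 * sizeof_int);

    // block_fp4_mmq packs 256 values into 4x the bytes of a 32-value block_q8_1,
    // so the per-value footprint, and hence s12, is exactly halved.
    std::printf("s12 q8_1 = %lld ints, s12 mxfp4 = %lld ints\n", s12_q8, s12_fp4);  // 2304 vs 1152
}
```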
@@ -236,7 +259,7 @@ void ggml_cuda_op_mul_mat_q(
     GGML_UNUSED_VARS(src1, dst, src1_ddf_i, src1_padded_row_size);
 }

-bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
+bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t n_experts) {
 #ifdef GGML_CUDA_FORCE_CUBLAS
     return false;
 #endif // GGML_CUDA_FORCE_CUBLAS

@@ -297,7 +320,10 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
     if (GGML_CUDA_CC_IS_CDNA3(cc)) {
         return true;
     }
-    if (
+    if (n_experts > 64 || ne11 <= 128) {
+        return true;
+    }
+    if (type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
         return true;
     }
     if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {