llama-cpp-pydist 0.19.0__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- llama_cpp/binaries/{llama-b7488-bin-win-cpu-x64.zip → llama-b7631-bin-win-cpu-x64.zip} +0 -0
- llama_cpp_pydist-0.21.0.dist-info/METADATA +4684 -0
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/RECORD +240 -222
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/WHEEL +1 -1
- vendor_llama_cpp_pydist/llama.cpp/.devops/cuda-new.Dockerfile +95 -0
- vendor_llama_cpp_pydist/llama.cpp/.gemini/settings.json +1 -0
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +2 -1
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/011-bug-results.yml +13 -2
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/019-bug-misc.yml +13 -2
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/build.yml +18 -6
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/docker.yml +25 -13
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/release.yml +9 -5
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server.yml +18 -0
- vendor_llama_cpp_pydist/llama.cpp/AGENTS.md +81 -0
- vendor_llama_cpp_pydist/llama.cpp/CLAUDE.md +1 -0
- vendor_llama_cpp_pydist/llama.cpp/CONTRIBUTING.md +34 -5
- vendor_llama_cpp_pydist/llama.cpp/ci/run.sh +2 -1
- vendor_llama_cpp_pydist/llama.cpp/common/CMakeLists.txt +4 -3
- vendor_llama_cpp_pydist/llama.cpp/common/arg.cpp +46 -14
- vendor_llama_cpp_pydist/llama.cpp/common/arg.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/common/chat-parser.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/common/chat.cpp +36 -7
- vendor_llama_cpp_pydist/llama.cpp/common/chat.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/common/common.cpp +42 -23
- vendor_llama_cpp_pydist/llama.cpp/common/common.h +7 -2
- vendor_llama_cpp_pydist/llama.cpp/common/llguidance.cpp +10 -6
- vendor_llama_cpp_pydist/llama.cpp/common/regex-partial.cpp +13 -13
- vendor_llama_cpp_pydist/llama.cpp/common/sampling.cpp +58 -14
- vendor_llama_cpp_pydist/llama.cpp/common/sampling.h +3 -1
- vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py +424 -103
- vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf_update.py +5 -0
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/CANN.md +4 -0
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/OPENCL.md +51 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/SYCL.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/README.md +5 -5
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/developer.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/build.md +21 -2
- vendor_llama_cpp_pydist/llama.cpp/docs/development/parsing.md +2 -2
- vendor_llama_cpp_pydist/llama.cpp/docs/ops/Metal.csv +360 -322
- vendor_llama_cpp_pydist/llama.cpp/docs/ops.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/CMakeLists.txt +13 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml-backend.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/CMakeLists.txt +23 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-backend.cpp +11 -11
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +303 -19
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +17 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/common.h +153 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +51 -158
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +66 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cu +48 -27
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cuh +16 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/common.cuh +45 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cpy.cu +117 -103
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cumsum.cu +105 -35
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +3 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +83 -33
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mean.cu +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mma.cuh +21 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cu +34 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +168 -13
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cu +151 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cuh +14 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/softmax.cu +203 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cu +96 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cuh +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +17 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +6 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +224 -758
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +316 -164
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +5 -11
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +46 -15
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +9 -3
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +2 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +20 -20
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-impl.h +0 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +57 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +5 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +20 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +71 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +73 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +134 -13
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +21 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +13 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +14 -7
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +42 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +742 -315
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +28 -14
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +1 -7
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +2 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +17 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +42 -24
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +115 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +10 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +29 -18
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +19 -16
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +10 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +8 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +11 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +57 -22
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +312 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +54 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/constants.py +99 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/gguf_writer.py +38 -2
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/tensor_mapping.py +26 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/utility.py +0 -8
- vendor_llama_cpp_pydist/llama.cpp/grammars/README.md +3 -0
- vendor_llama_cpp_pydist/llama.cpp/include/llama.h +99 -12
- vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-cli.sh +9 -9
- vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-completion.sh +53 -0
- vendor_llama_cpp_pydist/llama.cpp/scripts/sync-ggml.last +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/CMakeLists.txt +4 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.cpp +12 -3
- vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.h +7 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.cpp +76 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.h +7 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-context.cpp +625 -40
- vendor_llama_cpp_pydist/llama.cpp/src/llama-context.h +43 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.cpp +40 -13
- vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.cpp +166 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.h +71 -6
- vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.h +6 -5
- vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.cpp +11 -4
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.cpp +23 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp +329 -26
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model.h +13 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.cpp +1259 -186
- vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.h +19 -7
- vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.cpp +101 -33
- vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama.cpp +53 -38
- vendor_llama_cpp_pydist/llama.cpp/src/models/afmoe.cpp +9 -5
- vendor_llama_cpp_pydist/llama.cpp/src/models/bert.cpp +4 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/cogvlm.cpp +5 -3
- vendor_llama_cpp_pydist/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/deepseek2.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma-embedding.cpp +2 -6
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3.cpp +3 -4
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
- vendor_llama_cpp_pydist/llama.cpp/src/models/llama-iswa.cpp +6 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/llama.cpp +19 -6
- vendor_llama_cpp_pydist/llama.cpp/src/models/maincoder.cpp +117 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/models.h +18 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/modern-bert.cpp +116 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/plamo3.cpp +128 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/smallthinker.cpp +11 -5
- vendor_llama_cpp_pydist/llama.cpp/src/unicode.cpp +23 -14
- vendor_llama_cpp_pydist/llama.cpp/tests/CMakeLists.txt +12 -2
- vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-ops.cpp +286 -65
- vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-sampler.cpp +1237 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-chat.cpp +29 -3
- vendor_llama_cpp_pydist/llama.cpp/tests/test-grammar-llguidance.cpp +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-regex-partial.cpp +14 -14
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-0.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-bpe.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-spm.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/batched-bench/batched-bench.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/cli/README.md +187 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/cli/cli.cpp +1 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/completion/README.md +179 -7
- vendor_llama_cpp_pydist/llama.cpp/tools/completion/completion.cpp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/fit-params.cpp +3 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/llama-bench/llama-bench.cpp +18 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/CMakeLists.txt +1 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-impl.h +12 -7
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-model.h +3 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp +118 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/models.h +10 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/siglip.cpp +9 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/whisper-enc.cpp +9 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/youtuvl.cpp +179 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.cpp +5 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.h +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/quantize/quantize.cpp +6 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/CMakeLists.txt +0 -8
- vendor_llama_cpp_pydist/llama.cpp/tools/server/README-dev.md +2 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/README.md +27 -14
- vendor_llama_cpp_pydist/llama.cpp/tools/server/public/index.html.gz +0 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.cpp +22 -24
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.h +2 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.cpp +453 -267
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.h +52 -15
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-http.cpp +16 -10
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.cpp +174 -62
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.h +14 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.cpp +78 -21
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.h +48 -10
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.cpp +36 -11
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.h +28 -35
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server.cpp +9 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_chat_completion.py +11 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_sleep.py +39 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/utils.py +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte +25 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte +66 -13
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte +5 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/constants/settings-config.ts +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts +125 -11
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/chat.ts +15 -8
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts +12 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/settings.svelte.ts +4 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/api.d.ts +5 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/settings.d.ts +2 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/clipboard.ts +1 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/routes/+layout.svelte +1 -1
- llama_cpp_pydist-0.19.0.dist-info/METADATA +0 -2506
- vendor_llama_cpp_pydist/llama.cpp/.github/copilot-instructions.md +0 -262
- {llama_cpp_pydist-0.19.0.dist-info/licenses → llama_cpp_pydist-0.21.0.dist-info}/LICENSE +0 -0
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/top_level.txt +0 -0
Expanded diff for vendor_llama_cpp_pydist/llama.cpp/src/llama-context.cpp (removed lines whose content the diff viewer elided are shown as bare "-" markers):

@@ -60,6 +60,25 @@ llama_context::llama_context(
     cparams.cb_eval = params.cb_eval;
     cparams.cb_eval_user_data = params.cb_eval_user_data;
 
+    // Initialize backend samplers here so they are part of the sampling graph
+    // before the reserve passes run later in this function. This avoids a later
+    // re-reserve when graph nodes change.
+    if (params.samplers != nullptr && params.n_samplers > 0) {
+        for (size_t i = 0; i < params.n_samplers; ++i) {
+            const auto & config = params.samplers[i];
+
+            if (llama_sampler_chain_get(config.sampler, -1) == nullptr) {
+                throw std::runtime_error("the backend samplers must be of type llama_sampler_chain");
+            }
+
+            if (set_sampler(config.seq_id, config.sampler)) {
+                const int n_samplers = llama_sampler_chain_n(config.sampler);
+
+                LLAMA_LOG_INFO("%s: setting backend sampler for seq_id %d (n = %d)\n", __func__, config.seq_id, n_samplers);
+            }
+        }
+    }
+
     auto rope_scaling_type = params.rope_scaling_type;
     if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
         rope_scaling_type = hparams.rope_scaling_type_train;
@@ -231,7 +250,10 @@ llama_context::llama_context(
     // graph outputs buffer
     {
         // resized during inference when a batch uses more outputs
-
+        // Create a dummy batch for initialization.
+        llama_batch dummy_batch = {};
+        dummy_batch.n_tokens = 0;
+        if (output_reserve(params.n_seq_max, dummy_batch) < params.n_seq_max) {
             throw std::runtime_error("failed to reserve initial output buffer");
         }
 
@@ -294,8 +316,8 @@ llama_context::llama_context(
     // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
     bool pipeline_parallel =
         model.n_devices() > 1 &&
-        model.
-        model.
+        model.n_gpu_layers() > model.hparams.n_layer &&
+        model.split_mode() == LLAMA_SPLIT_MODE_LAYER &&
         cparams.offload_kqv &&
         !model.has_tensor_overrides();
 
@@ -456,26 +478,35 @@ llama_context::llama_context(
             LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg);
         }
     }
+
+    // Initialize the full vocabulary token ids for backend samplers.
+    {
+        const int n_vocab = model.vocab.n_tokens();
+
+        sampling.token_ids_full_vocab.resize(n_vocab);
+        for (int i = 0; i < n_vocab; ++i) {
+            sampling.token_ids_full_vocab[i] = i;
+        }
+    }
 }
 
 llama_context::~llama_context() {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    // }
+    if (!model.hparams.no_alloc) {
+        for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+            ggml_backend_t backend = backend_ptrs[i];
+            ggml_backend_buffer_type_t buft = backend_buft[i];
+
+            const size_t size_exp = backend_buf_exp_size[i];
+            const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+            if (size_exp == size_act) {
+                LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
+                        __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+            } else {
+                LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
+                        __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+            }
+        }
+    }
     ggml_opt_free(opt_ctx);
 }
 
@@ -617,6 +648,35 @@ float * llama_context::get_logits() {
     return logits;
 }
 
+int64_t llama_context::output_resolve_row(int32_t i) const {
+    int64_t j = -1;
+
+    // support negative indices (last output row)
+    if (i < 0) {
+        j = n_outputs + i;
+        if (j < 0) {
+            throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs));
+        }
+    } else if ((size_t) i >= output_ids.size()) {
+        throw std::runtime_error(format("out of range [0, %zu)", output_ids.size()));
+    } else {
+        // use output_ids to translate the batch token index into a row number
+        // that holds this token's data.
+        j = output_ids[i];
+    }
+
+    if (j < 0) {
+        // the batch token was not configured to output anything
+        throw std::runtime_error(format("batch.logits[%d] != true", i));
+    }
+
+    if (j >= n_outputs) {
+        throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
+    }
+
+    return j;
+}
+
 float * llama_context::get_logits_ith(int32_t i) {
     int64_t j = -1;
 
@@ -627,6 +687,7 @@ float * llama_context::get_logits_ith(int32_t i) {
         throw std::runtime_error("no logits");
     }
 
+    // TODO: use output_resolve_row()
     if (i < 0) {
         j = n_outputs + i;
         if (j < 0) {
@@ -663,6 +724,10 @@ float * llama_context::get_embeddings() {
     return embd;
 }
 
+llama_token * llama_context::get_sampled_tokens() const {
+    return sampling.sampled;
+}
+
 float * llama_context::get_embeddings_ith(int32_t i) {
     int64_t j = -1;
 
@@ -673,6 +738,7 @@ float * llama_context::get_embeddings_ith(int32_t i) {
         throw std::runtime_error("no embeddings");
     }
 
+    // TODO: use output_resolve_row()
     if (i < 0) {
         j = n_outputs + i;
         if (j < 0) {
@@ -712,6 +778,136 @@ float * llama_context::get_embeddings_seq(llama_seq_id seq_id) {
     return it->second.data();
 }
 
+llama_token llama_context::get_sampled_token_ith(int32_t idx) {
+    output_reorder();
+
+    if (sampling.sampled == nullptr) {
+        return LLAMA_TOKEN_NULL;
+    }
+
+    try {
+        const int64_t row = output_resolve_row(idx);
+        GGML_ASSERT(row < (int64_t) sampling.sampled_size);
+        return sampling.sampled[row];
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: invalid backend sampled token id %d, reason: %s\n", __func__, idx, err.what());
+        return LLAMA_TOKEN_NULL;
+    }
+}
+
+float * llama_context::get_sampled_probs_ith(int32_t idx) {
+    output_reorder();
+
+    if (sampling.probs == nullptr) {
+        return nullptr;
+    }
+
+    try {
+        const int64_t row = output_resolve_row(idx);
+        if ((size_t) row >= sampling.probs_count.size() || sampling.probs_count[row] == 0) {
+            return nullptr;
+        }
+        return sampling.probs + row*model.vocab.n_tokens();
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: invalid backend sampled probs id %d, reason: %s\n", __func__, idx, err.what());
+        return nullptr;
+    }
+}
+
+float * llama_context::get_sampled_logits_ith(int32_t idx) {
+    output_reorder();
+
+    if (sampling.logits == nullptr) {
+        return nullptr;
+    }
+
+    try {
+        const int64_t row = output_resolve_row(idx);
+        if ((size_t) row >= sampling.logits_count.size() || sampling.logits_count[row] == 0) {
+            return nullptr;
+        }
+        return sampling.logits + row*model.vocab.n_tokens();
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: invalid backend sampled logits id %d, reason: %s\n", __func__, idx, err.what());
+        return nullptr;
+    }
+}
+
+const llama_token * llama_context::get_sampled_candidates_ith(int32_t idx) {
+    output_reorder();
+
+    try {
+        const int64_t row = output_resolve_row(idx);
+        if (sampling.candidates != nullptr &&
+            (size_t) row < sampling.candidates_count.size() &&
+            sampling.candidates_count[row] > 0) {
+            return sampling.candidates + row*model.vocab.n_tokens();
+        }
+    } catch (const std::exception & err) {
+        // fallback to full vocab list
+    }
+
+    return sampling.token_ids_full_vocab.data();
+}
+
+size_t llama_context::get_sampled_candidates_count(int32_t idx) {
+    output_reorder();
+
+    if (sampling.candidates == nullptr) {
+        return 0;
+    }
+
+    try {
+        const int64_t row = output_resolve_row(idx);
+        if ((size_t) row >= sampling.candidates_count.size()) {
+            return 0;
+        }
+        return sampling.candidates_count[row];
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: invalid backend sampled candidates count id %d, reason: %s\n", __func__, idx, err.what());
+        return 0;
+    }
+}
+
+size_t llama_context::get_sampled_logits_count(int32_t idx) {
+    output_reorder();
+
+    if (sampling.logits == nullptr) {
+        return model.vocab.n_tokens();
+    }
+
+    try {
+        const int64_t row = output_resolve_row(idx);
+        if ((size_t) row >= sampling.logits_count.size()) {
+            return 0;
+        }
+        return sampling.logits_count[row];
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: invalid backend sampled logits count id %d, reason: %s\n", __func__, idx, err.what());
+        return 0;
+    }
+}
+
+size_t llama_context::get_sampled_probs_count(int32_t idx) {
+    output_reorder();
+
+    if (sampling.probs == nullptr) {
+        return 0;
+    }
+
+    try {
+        const int64_t row = output_resolve_row(idx);
+        if ((size_t) row >= sampling.probs_count.size()) {
+            return 0;
+        }
+        return sampling.probs_count[row];
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: invalid backend sampled probs count id %d, reason: %s\n", __func__, idx, err.what());
+        return 0;
+    }
+}
+
+
 void llama_context::attach_threadpool(
             ggml_threadpool_t threadpool,
             ggml_threadpool_t threadpool_batch) {
@@ -768,6 +964,42 @@ void llama_context::set_warmup(bool value) {
     cparams.warmup = value;
 }
 
+bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
+    LLAMA_LOG_DEBUG("%s: seq_id = %d, sampler = %p\n", __func__, (int) seq_id, (void *) sampler);
+
+    const bool can_offload =
+        sampler &&
+        sampler->iface->backend_init &&
+        sampler->iface->backend_apply &&
+        llama_sampler_chain_n(sampler) > 0;
+
+    if (sampler && can_offload) {
+        ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(model.dev_output());
+        auto * host_buft = ggml_backend_dev_host_buffer_type(model.dev_output());
+        if (host_buft) {
+            buft = host_buft;
+        }
+
+        sampler->iface->backend_init(sampler, buft);
+
+        sampling.samplers[seq_id] = sampler;
+
+        return true;
+    }
+
+    if (sampler && !can_offload) {
+        LLAMA_LOG_WARN("%s: sampler '%s' for seq_id = %d, cannot be offloaded to the backend\n", __func__, llama_sampler_name(sampler), seq_id);
+
+        sampling.samplers.erase(seq_id);
+
+        return false;
+    }
+
+    sampling.samplers.erase(seq_id);
+
+    return true;
+}
+
 void llama_context::set_adapter_lora(
             llama_adapter_lora * adapter,
             float scale) {
@@ -908,7 +1140,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
     n_queued_tokens += n_tokens;
 
     // reserve output buffer
-    if (output_reserve(n_tokens) < n_tokens) {
+    if (output_reserve(n_tokens, batch_inp) < n_tokens) {
         LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens);
         return -2;
     };
@@ -1032,6 +1264,112 @@ int llama_context::encode(const llama_batch & batch_inp) {
     return 0;
 }
 
+static std::map<llama_seq_id, uint32_t> build_seq_to_output_row(const llama_ubatch & ubatch, uint32_t row_offset) {
+    std::map<llama_seq_id, uint32_t> seq_to_row;
+    // how many output tokens we have seen so far for this ubatch.
+    uint32_t local = 0;
+    for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
+        // skip tokens that are not output.
+        if (!ubatch.output[i]) {
+            continue;
+        }
+
+        const llama_seq_id seq_id = ubatch.seq_id[i][0];
+        // row_offset is the number of output tokens before this ubatch.
+        seq_to_row[seq_id] = row_offset + local;
+        ++local;
+    }
+    return seq_to_row;
+}
+
+static void copy_tensor_async_ints(
+        const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
+        llama_token * sampled,
+        size_t sampled_size,
+        const std::map<llama_seq_id, uint32_t> & seq_to_row,
+        ggml_backend_sched_t sched) {
+    if (sampled == nullptr) {
+        return;
+    }
+
+    for (const auto & [seq_id, tensor] : tensor_map) {
+        auto it = seq_to_row.find(seq_id);
+        if (it == seq_to_row.end()) {
+            continue;
+        }
+
+        const uint32_t row = it->second;
+        GGML_ASSERT(row < sampled_size);
+
+        GGML_ASSERT(ggml_is_contiguous(tensor) && "sampled tokens tensor must be contiguous for async copy");
+
+        ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
+        ggml_backend_tensor_get_async(backend, tensor, sampled + row, 0, sizeof(sampled[row]));
+    }
+}
+
+static void copy_tensor_async_floats(
+        const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
+        float * dst,
+        size_t stride,
+        std::vector<uint32_t> & counts,
+        const std::map<llama_seq_id, uint32_t> & seq_to_row,
+        ggml_backend_sched_t sched) {
+    if (dst == nullptr) {
+        return;
+    }
+
+    for (const auto & [seq_id, tensor] : tensor_map) {
+        auto it = seq_to_row.find(seq_id);
+        if (it == seq_to_row.end()) {
+            continue;
+        }
+
+        const uint32_t row = it->second;
+        GGML_ASSERT(row < counts.size());
+
+        GGML_ASSERT(ggml_is_contiguous(tensor) && "logits/probs tensor must be contiguous for async copy");
+
+        ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
+        float * row_ptr = dst + (size_t) row * stride;
+        ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));
+
+        // Update the actual number of logits/probabilities that were written for this row.
+        counts[row] = ggml_nelements(tensor);
+    }
+}
+
+static void copy_tensor_async_candidates(
+        const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
+        llama_token * dst,
+        size_t stride,
+        std::vector<uint32_t> & counts,
+        const std::map<llama_seq_id, uint32_t> & seq_to_row,
+        ggml_backend_sched_t sched) {
+    if (dst == nullptr) {
+        return;
+    }
+
+    for (const auto & [seq_id, tensor] : tensor_map) {
+        auto it = seq_to_row.find(seq_id);
+        if (it == seq_to_row.end()) {
+            continue;
+        }
+
+        const uint32_t row = it->second;
+        GGML_ASSERT(row < counts.size());
+
+        GGML_ASSERT(ggml_is_contiguous(tensor) && "candidates tensor must be contiguous for async copy");
+
+        ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
+        llama_token * row_ptr = dst + (size_t) row * stride;
+        ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));
+
+        // Update the actual number of candidates that were written.
+        counts[row] = ggml_nelements(tensor);
+    }
+}
+
 int llama_context::decode(const llama_batch & batch_inp) {
     GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT
 
@@ -1052,9 +1390,36 @@ int llama_context::decode(const llama_batch & batch_inp) {
     const int64_t n_embd = hparams.n_embd_inp();
 
     // when computing embeddings, all tokens are output
-    const bool output_all
+    const bool output_all = cparams.embeddings;
+    const bool has_samplers = !sampling.samplers.empty();
+
+    const uint32_t n_seq_max = cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max;
+
+    // TODO: avoid this workaround in the future
+    if (has_samplers && batch_inp.logits) {
+        std::vector<int32_t> seq_output_count(n_seq_max, 0);
+
+        for (int32_t i = 0; i < batch_inp.n_tokens; ++i) {
+            if (batch_inp.logits[i] == 0) {
+                continue;
+            }
+
+            const int ns = batch_inp.n_seq_id ? batch_inp.n_seq_id[i] : 1;
+
+            for (int32_t s = 0; s < ns; ++s) {
+                const llama_seq_id seq_id = batch_inp.seq_id ? batch_inp.seq_id[i][s] : 0;
 
-
+                seq_output_count[seq_id]++;
+                if (seq_output_count[seq_id] > 1) {
+                    LLAMA_LOG_ERROR("%s: backend sampling requires at most one output token per sequence (seq_id %d had %d)\n",
+                            __func__, seq_id, seq_output_count[seq_id]);
+                    return -1;
+                }
+            }
+        }
+    }
+
+    if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, n_seq_max, output_all)) {
         LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
         return -1;
     }
@@ -1135,7 +1500,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
     }
 
     // reserve output buffer
-    if (output_reserve(n_outputs_all) < n_outputs_all) {
+    if (output_reserve(n_outputs_all, balloc->get_batch()) < n_outputs_all) {
         LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
         return -2;
     };
@@ -1208,7 +1573,10 @@ int llama_context::decode(const llama_batch & batch_inp) {
         }
 
         // extract logits
-
+        // For multi-sequence batches that mix backend samplers and CPU sampler
+        // this is currently inefficient as we copy all logits even for the
+        // backend sampled tokens.
+        if (logits && t_logits && n_outputs > 0) {
            ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
            GGML_ASSERT(backend_res != nullptr);
            GGML_ASSERT(logits != nullptr);
@@ -1223,7 +1591,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
        }
 
        // extract embeddings
-       if (t_embd && n_outputs > 0) {
+       if (embd && t_embd && n_outputs > 0) {
            ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
            GGML_ASSERT(backend_embd != nullptr);
 
@@ -1277,6 +1645,22 @@ int llama_context::decode(const llama_batch & batch_inp) {
            }
        }
 
+        // This flag indicates whether a backend sampler has actually sampled a specific
+        // token, or if it has produced probabilites. If true, we can skip the normal copying of logits and embeddings.
+        const bool has_sampled = !res->t_sampled.empty() || !res->t_sampled_probs.empty() || !res->t_sampled_logits.empty();
+
+        if (has_samplers && has_sampled) {
+            const auto seq_to_output_row = build_seq_to_output_row(ubatch, n_outputs_prev);
+            const auto stride = n_vocab;
+
+            // async copy the sampling data from the backend to the host
+            copy_tensor_async_ints(res->t_sampled, sampling.sampled, sampling.sampled_size, seq_to_output_row, sched.get());
+
+            copy_tensor_async_floats    (res->t_sampled_logits, sampling.logits, stride, sampling.logits_count, seq_to_output_row, sched.get());
+            copy_tensor_async_floats    (res->t_sampled_probs,  sampling.probs,  stride, sampling.probs_count,  seq_to_output_row, sched.get());
+            copy_tensor_async_candidates(res->t_candidates, sampling.candidates, stride, sampling.candidates_count, seq_to_output_row, sched.get());
+        }
+
        n_outputs_prev += n_outputs;
    } while (mctx->next());
 
@@ -1340,7 +1724,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
 // output
 //
 
-uint32_t llama_context::output_reserve(int32_t n_outputs) {
+uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & batch) {
     const auto & hparams = model.hparams;
     const auto & vocab = model.vocab;
 
@@ -1359,8 +1743,53 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
        has_embd = true;
    }
 
-
-
+    // Check which sampling modes are needed for the current batch.
+    // TODO: avoid this branching by working with the worst-case
+    bool has_sampling = false;
+    bool cpu_logits = false;
+
+    if (batch.logits) {
+        for (int32_t i = 0; i < batch.n_tokens; i++) {
+            if (!batch.logits[i]) {
+                continue;
+            }
+            for (int32_t j = 0; j < batch.n_seq_id[i]; j++) {
+                llama_seq_id seq_id = batch.seq_id[i][j];
+                if (sampling.samplers.find(seq_id) != sampling.samplers.end()) {
+                    has_sampling = true;
+                } else {
+                    cpu_logits = true;
+                }
+            }
+        }
+    } else {
+        // When batch.logits is nullptr (when loading state with a dummy batch),
+        // allocate CPU logits.
+        cpu_logits = true;
+    }
+
+    size_t backend_float_count = 0;
+    size_t backend_token_count = 0;
+
+    // Allocate CPU logits buffer only if needed by sequences in this batch
+    logits_size = (has_logits && cpu_logits) ? n_vocab*n_outputs_max : 0;
+    embd_size = has_embd ? n_embd*n_outputs_max : 0;
+
+    // TODO: avoid this branching by working with the worst-case
+    if (!has_sampling) {
+        sampling.logits_size = 0;
+        sampling.probs_size = 0;
+        sampling.sampled_size = 0;
+        sampling.candidates_size = 0;
+    } else {
+        sampling.logits_size = n_vocab*n_outputs_max;
+        sampling.probs_size = n_vocab*n_outputs_max;
+        sampling.sampled_size = n_outputs_max;
+        sampling.candidates_size = n_vocab*n_outputs_max;
+
+        backend_float_count = sampling.logits_size + sampling.probs_size;
+        backend_token_count = sampling.sampled_size + sampling.candidates_size;
+    }
 
     if (output_ids.empty()) {
         // init, never resized afterwards
@@ -1368,7 +1797,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
     }
 
     const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0;
-    const size_t new_size =
+    const size_t new_size =
+        (logits_size + embd_size + backend_float_count) * sizeof(float) +
+        (                          backend_token_count) * sizeof(llama_token);
 
     // alloc only when more than the current capacity is required
     // TODO: also consider shrinking the buffer
@@ -1376,9 +1807,11 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
     if (buf_output) {
 #ifndef NDEBUG
         // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
-
+        LLAMA_LOG_DEBUG("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
         synchronize();
+
+        // TODO: not needed?
         buf_output = nullptr;
         logits = nullptr;
         embd = nullptr;
@@ -1400,8 +1833,49 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 
     float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get());
 
-    logits =
-    embd =
+    logits = nullptr;
+    embd = nullptr;
+
+    size_t offset = 0;
+    uint8_t * base = (uint8_t *) output_base;
+
+    logits = (has_logits && cpu_logits) ? output_base : nullptr;
+    offset += logits_size * sizeof(float);
+
+    embd = has_embd ? (float *) (base + offset) : nullptr;
+    offset += embd_size * sizeof(float);
+
+    sampling.logits = nullptr;
+    sampling.probs = nullptr;
+    sampling.sampled = nullptr;
+    sampling.candidates = nullptr;
+
+    if (has_sampling) {
+        sampling.logits = (float *) (base + offset);
+        offset += sampling.logits_size * sizeof(float);
+
+        sampling.probs = (float *) (base + offset);
+        offset += sampling.probs_size * sizeof(float);
+
+        sampling.sampled = (llama_token *) (base + offset);
+        offset += sampling.sampled_size * sizeof(llama_token);
+
+        sampling.candidates = (llama_token *) (base + offset);
+        offset += sampling.candidates_size * sizeof(llama_token);
+
+        // The count vectors keep track of the actual number of logits/probs/candidates
+        // copied from the backend for each output row.
+
+        sampling.logits_count.resize(n_outputs_max);
+        sampling.probs_count.resize(n_outputs_max);
+        sampling.candidates_count.resize(n_outputs_max);
+
+        std::fill(sampling.logits_count.begin(), sampling.logits_count.end(), 0);
+        std::fill(sampling.probs_count.begin(), sampling.probs_count.end(), 0);
+        std::fill(sampling.candidates_count.begin(), sampling.candidates_count.end(), 0);
+
+        std::fill_n(sampling.sampled, sampling.sampled_size, LLAMA_TOKEN_NULL);
+    }
 
     // set all ids as invalid (negative)
     std::fill(output_ids.begin(), output_ids.end(), -1);
@@ -1430,6 +1904,40 @@ void llama_context::output_reorder() {
                 std::swap(embd[i0*n_embd + k], embd[i1*n_embd + k]);
             }
         }
+
+        if (sampling.logits && sampling.logits_size > 0) {
+            for (uint64_t k = 0; k < n_vocab; ++k) {
+                std::swap(sampling.logits[i0*n_vocab + k], sampling.logits[i1*n_vocab + k]);
+            }
+        }
+
+        if (sampling.probs && sampling.probs_size > 0) {
+            for (uint64_t k = 0; k < n_vocab; ++k) {
+                std::swap(sampling.probs[i0*n_vocab + k], sampling.probs[i1*n_vocab + k]);
+            }
+        }
+
+        if (sampling.candidates && sampling.candidates_size > 0) {
+            for (uint64_t k = 0; k < n_vocab; ++k) {
+                std::swap(sampling.candidates[i0*n_vocab + k], sampling.candidates[i1*n_vocab + k]);
+            }
+        }
+
+        if (sampling.sampled && sampling.sampled_size > 0) {
+            std::swap(sampling.sampled[i0], sampling.sampled[i1]);
+        }
+
+        if (!sampling.logits_count.empty()) {
+            std::swap(sampling.logits_count[i0], sampling.logits_count[i1]);
+        }
+
+        if (!sampling.probs_count.empty()) {
+            std::swap(sampling.probs_count[i0], sampling.probs_count[i1]);
+        }
+
+        if (!sampling.candidates_count.empty()) {
+            std::swap(sampling.candidates_count[i0], sampling.candidates_count[i1]);
+        }
     }
 
     output_swaps.clear();
@@ -1443,7 +1951,9 @@ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
     if (model.arch == LLM_ARCH_QWEN3NEXT) {
         return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
     }
-
+    uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
+    res += model.n_lora_nodes;
+    return res;
 }
 
 llm_graph_result * llama_context::get_gf_res_reserve() const {
@@ -1457,7 +1967,7 @@ ggml_cgraph * llama_context::graph_reserve(
 
     if (n_tokens % n_seqs != 0) {
         n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
-        n_outputs = std::
+        n_outputs = std::max(n_outputs, n_tokens);
 
         LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
     }
@@ -1476,6 +1986,15 @@ ggml_cgraph * llama_context::graph_reserve(
     llama_batch_allocr balloc(model.hparams.n_pos_per_embd());
     llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs);
 
+    // set one output token per sequence in order to activate all backend samplers
+    std::vector<llama_seq_id> seq_ids(n_seqs);
+    for (uint32_t i = 0; i < n_seqs; ++i) {
+        seq_ids[i] = i;
+        ubatch.n_seq_id[i] = 1;
+        ubatch.seq_id[i] = &seq_ids[i];
+        ubatch.output[i] = true;
+    }
+
     auto * res = gf_res_reserve.get();
 
     const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT);
@@ -1506,7 +2025,7 @@ llm_graph_params llama_context::graph_params(
             llm_graph_result * res,
       const llama_ubatch & ubatch,
 const llama_memory_context_i * mctx,
-
+              llm_graph_type gtype) const {
    return {
        /*.arch =*/ model.arch,
        /*.hparams =*/ model.hparams,
@@ -1519,6 +2038,7 @@ llm_graph_params llama_context::graph_params(
        /*.loras =*/ &loras,
        /*.mctx =*/ mctx,
        /*.cross =*/ &cross,
+        /*.samplers =*/ sampling.samplers,
        /*.n_outputs =*/ n_outputs,
        /*.cb =*/ graph_get_cb(),
        /*.res =*/ res,
@@ -1571,7 +2091,7 @@ llm_graph_cb llama_context::graph_get_cb() const {
 
        // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
        // FIXME: fix in ggml_backend_sched
-        const bool full_offload = model.
+        const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer;
        if (ubatch.n_tokens < 32 || full_offload) {
            if (il != -1 && strcmp(name, "norm") == 0) {
                const auto & dev_layer = model.dev_layer(il);
@@ -1974,6 +2494,9 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
        }
    }
 
+    // TODO: handle sampling buffers and samplers state ?
+    // https://github.com/ggml-org/llama.cpp/pull/17004
+
    if (memory != nullptr) {
        LLAMA_LOG_DEBUG("%s: - writing memory module\n", __func__);
        memory->state_write(io);
@@ -2006,7 +2529,10 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
    auto n_outputs = this->n_outputs;
    io.read_to(&n_outputs, sizeof(n_outputs));
 
-
+    // Create a dummy batch for state loading.
+    llama_batch dummy_batch = {};
+    dummy_batch.n_tokens = 0;
+    if (n_outputs > output_reserve(n_outputs, dummy_batch)) {
        throw std::runtime_error("could not reserve outputs");
    }
 
@@ -2060,6 +2586,9 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
        }
    }
 
+    // TODO: handle sampling buffers and samplers state ?
+    // https://github.com/ggml-org/llama.cpp/pull/17004
+
    if (memory) {
        LLAMA_LOG_DEBUG("%s: - reading memory module\n", __func__);
 
@@ -2248,7 +2777,7 @@ void llama_context::opt_epoch_iter(
        }
 
        // reserve output buffer
-        if (output_reserve(n_outputs_all) < n_outputs_all) {
+        if (output_reserve(n_outputs_all, balloc->get_batch()) < n_outputs_all) {
            LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
            GGML_ABORT("TODO: handle this error");
        };
@@ -2393,6 +2922,8 @@ llama_context_params llama_context_default_params() {
        /*.op_offload =*/ true,
        /*.swa_full =*/ true,
        /*.kv_unified =*/ false,
+        /*.sampler =*/ nullptr,
+        /*.n_sampler =*/ 0,
    };
 
    return result;
@@ -2552,7 +3083,15 @@ float * llama_get_logits(llama_context * ctx) {
 float * llama_get_logits_ith(llama_context * ctx, int32_t i) {
    ctx->synchronize();
 
-
+    float * res = nullptr;
+
+    res = ctx->get_sampled_logits_ith(i);
+
+    if (!res) {
+        res = ctx->get_logits_ith(i);
+    }
+
+    return res;
 }
 
 float * llama_get_embeddings(llama_context * ctx) {
@@ -2573,6 +3112,52 @@ float * llama_get_embeddings_seq(llama_context * ctx, llama_seq_id seq_id) {
    return ctx->get_embeddings_seq(seq_id);
 }
 
+bool llama_set_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) {
+    return ctx->set_sampler(seq_id, smpl);
+}
+
+llama_token llama_get_sampled_token_ith(llama_context * ctx, int32_t i) {
+    ctx->synchronize();
+
+    return ctx->get_sampled_token_ith(i);
+}
+
+float * llama_get_sampled_probs_ith(llama_context * ctx, int32_t i) {
+    ctx->synchronize();
+
+    return ctx->get_sampled_probs_ith(i);
+}
+
+float * llama_get_sampled_logits_ith(llama_context * ctx, int32_t i) {
+    ctx->synchronize();
+
+    return ctx->get_sampled_logits_ith(i);
+}
+
+llama_token * llama_get_sampled_candidates_ith(llama_context * ctx, int32_t i) {
+    ctx->synchronize();
+
+    return const_cast<llama_token *>(ctx->get_sampled_candidates_ith(i));
+}
+
+uint32_t llama_get_sampled_candidates_count_ith(llama_context * ctx, int32_t i) {
+    ctx->synchronize();
+
+    return static_cast<uint32_t>(ctx->get_sampled_candidates_count(i));
+}
+
+uint32_t llama_get_sampled_logits_count_ith(llama_context * ctx, int32_t i) {
+    ctx->synchronize();
+
+    return static_cast<uint32_t>(ctx->get_sampled_logits_count(i));
+}
+
+uint32_t llama_get_sampled_probs_count_ith(llama_context * ctx, int32_t i) {
+    ctx->synchronize();
+
+    return static_cast<uint32_t>(ctx->get_sampled_probs_count(i));
+}
+
 // llama adapter API
 
 int32_t llama_set_adapter_lora(