llama-cpp-pydist 0.19.0__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llama_cpp/binaries/{llama-b7488-bin-win-cpu-x64.zip → llama-b7631-bin-win-cpu-x64.zip} +0 -0
- llama_cpp_pydist-0.21.0.dist-info/METADATA +4684 -0
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/RECORD +240 -222
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/WHEEL +1 -1
- vendor_llama_cpp_pydist/llama.cpp/.devops/cuda-new.Dockerfile +95 -0
- vendor_llama_cpp_pydist/llama.cpp/.gemini/settings.json +1 -0
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +2 -1
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/011-bug-results.yml +13 -2
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/019-bug-misc.yml +13 -2
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/build.yml +18 -6
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/docker.yml +25 -13
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/release.yml +9 -5
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server.yml +18 -0
- vendor_llama_cpp_pydist/llama.cpp/AGENTS.md +81 -0
- vendor_llama_cpp_pydist/llama.cpp/CLAUDE.md +1 -0
- vendor_llama_cpp_pydist/llama.cpp/CONTRIBUTING.md +34 -5
- vendor_llama_cpp_pydist/llama.cpp/ci/run.sh +2 -1
- vendor_llama_cpp_pydist/llama.cpp/common/CMakeLists.txt +4 -3
- vendor_llama_cpp_pydist/llama.cpp/common/arg.cpp +46 -14
- vendor_llama_cpp_pydist/llama.cpp/common/arg.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/common/chat-parser.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/common/chat.cpp +36 -7
- vendor_llama_cpp_pydist/llama.cpp/common/chat.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/common/common.cpp +42 -23
- vendor_llama_cpp_pydist/llama.cpp/common/common.h +7 -2
- vendor_llama_cpp_pydist/llama.cpp/common/llguidance.cpp +10 -6
- vendor_llama_cpp_pydist/llama.cpp/common/regex-partial.cpp +13 -13
- vendor_llama_cpp_pydist/llama.cpp/common/sampling.cpp +58 -14
- vendor_llama_cpp_pydist/llama.cpp/common/sampling.h +3 -1
- vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py +424 -103
- vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf_update.py +5 -0
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/CANN.md +4 -0
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/OPENCL.md +51 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/SYCL.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/README.md +5 -5
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/developer.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/build.md +21 -2
- vendor_llama_cpp_pydist/llama.cpp/docs/development/parsing.md +2 -2
- vendor_llama_cpp_pydist/llama.cpp/docs/ops/Metal.csv +360 -322
- vendor_llama_cpp_pydist/llama.cpp/docs/ops.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/CMakeLists.txt +13 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml-backend.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/CMakeLists.txt +23 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-backend.cpp +11 -11
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +303 -19
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +17 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/common.h +153 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +51 -158
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +66 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cu +48 -27
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cuh +16 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/common.cuh +45 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cpy.cu +117 -103
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cumsum.cu +105 -35
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +3 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +83 -33
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mean.cu +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mma.cuh +21 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cu +34 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +168 -13
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cu +151 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cuh +14 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/softmax.cu +203 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cu +96 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cuh +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +17 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +6 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +224 -758
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +316 -164
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +5 -11
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +46 -15
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +9 -3
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +2 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +20 -20
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-impl.h +0 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +57 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +5 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +20 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +71 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +73 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +134 -13
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +21 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +13 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +14 -7
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +42 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +742 -315
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +28 -14
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +1 -7
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +2 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +17 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +42 -24
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +115 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +10 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +29 -18
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +19 -16
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +10 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +8 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +11 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +57 -22
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +312 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +54 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/constants.py +99 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/gguf_writer.py +38 -2
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/tensor_mapping.py +26 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/utility.py +0 -8
- vendor_llama_cpp_pydist/llama.cpp/grammars/README.md +3 -0
- vendor_llama_cpp_pydist/llama.cpp/include/llama.h +99 -12
- vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-cli.sh +9 -9
- vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-completion.sh +53 -0
- vendor_llama_cpp_pydist/llama.cpp/scripts/sync-ggml.last +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/CMakeLists.txt +4 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.cpp +12 -3
- vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.h +7 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.cpp +76 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.h +7 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-context.cpp +625 -40
- vendor_llama_cpp_pydist/llama.cpp/src/llama-context.h +43 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.cpp +40 -13
- vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.cpp +166 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.h +71 -6
- vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.h +6 -5
- vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.cpp +11 -4
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.cpp +23 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp +329 -26
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model.h +13 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.cpp +1259 -186
- vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.h +19 -7
- vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.cpp +101 -33
- vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama.cpp +53 -38
- vendor_llama_cpp_pydist/llama.cpp/src/models/afmoe.cpp +9 -5
- vendor_llama_cpp_pydist/llama.cpp/src/models/bert.cpp +4 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/cogvlm.cpp +5 -3
- vendor_llama_cpp_pydist/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/deepseek2.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma-embedding.cpp +2 -6
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3.cpp +3 -4
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
- vendor_llama_cpp_pydist/llama.cpp/src/models/llama-iswa.cpp +6 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/llama.cpp +19 -6
- vendor_llama_cpp_pydist/llama.cpp/src/models/maincoder.cpp +117 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/models.h +18 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/modern-bert.cpp +116 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/plamo3.cpp +128 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/smallthinker.cpp +11 -5
- vendor_llama_cpp_pydist/llama.cpp/src/unicode.cpp +23 -14
- vendor_llama_cpp_pydist/llama.cpp/tests/CMakeLists.txt +12 -2
- vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-ops.cpp +286 -65
- vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-sampler.cpp +1237 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-chat.cpp +29 -3
- vendor_llama_cpp_pydist/llama.cpp/tests/test-grammar-llguidance.cpp +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-regex-partial.cpp +14 -14
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-0.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-bpe.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-spm.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/batched-bench/batched-bench.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/cli/README.md +187 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/cli/cli.cpp +1 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/completion/README.md +179 -7
- vendor_llama_cpp_pydist/llama.cpp/tools/completion/completion.cpp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/fit-params.cpp +3 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/llama-bench/llama-bench.cpp +18 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/CMakeLists.txt +1 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-impl.h +12 -7
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-model.h +3 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp +118 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/models.h +10 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/siglip.cpp +9 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/whisper-enc.cpp +9 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/youtuvl.cpp +179 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.cpp +5 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.h +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/quantize/quantize.cpp +6 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/CMakeLists.txt +0 -8
- vendor_llama_cpp_pydist/llama.cpp/tools/server/README-dev.md +2 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/README.md +27 -14
- vendor_llama_cpp_pydist/llama.cpp/tools/server/public/index.html.gz +0 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.cpp +22 -24
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.h +2 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.cpp +453 -267
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.h +52 -15
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-http.cpp +16 -10
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.cpp +174 -62
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.h +14 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.cpp +78 -21
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.h +48 -10
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.cpp +36 -11
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.h +28 -35
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server.cpp +9 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_chat_completion.py +11 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_sleep.py +39 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/utils.py +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte +25 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte +66 -13
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte +5 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/constants/settings-config.ts +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts +125 -11
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/chat.ts +15 -8
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts +12 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/settings.svelte.ts +4 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/api.d.ts +5 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/settings.d.ts +2 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/clipboard.ts +1 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/routes/+layout.svelte +1 -1
- llama_cpp_pydist-0.19.0.dist-info/METADATA +0 -2506
- vendor_llama_cpp_pydist/llama.cpp/.github/copilot-instructions.md +0 -262
- {llama_cpp_pydist-0.19.0.dist-info/licenses → llama_cpp_pydist-0.21.0.dist-info}/LICENSE +0 -0
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/top_level.txt +0 -0
--- tools/server/server-queue.cpp
+++ tools/server/server-queue.cpp

@@ -33,6 +33,7 @@ int server_queue::post(server_task && task, bool front) {
     } else {
         queue_tasks.push_back(std::move(task));
     }
+    time_last_task = ggml_time_ms();
     condition_tasks.notify_one();
     return task_id;
 }

@@ -54,6 +55,7 @@ int server_queue::post(std::vector<server_task> && tasks, bool front) {
             queue_tasks.push_back(std::move(task));
         }
     }
+    time_last_task = ggml_time_ms();
     condition_tasks.notify_one();
     return 0;
 }

@@ -62,6 +64,7 @@ void server_queue::defer(server_task && task) {
     std::unique_lock<std::mutex> lock(mutex_tasks);
     QUE_DBG("defer task, id = %d\n", task.id);
     queue_tasks_deferred.push_back(std::move(task));
+    time_last_task = ggml_time_ms();
     condition_tasks.notify_one();
 }

@@ -71,31 +74,52 @@ int server_queue::get_new_id() {
     return new_id;
 }
 
-void server_queue::on_new_task(std::function<void(server_task &&)> callback) {
-    callback_new_task = std::move(callback);
-}
-
-void server_queue::on_update_slots(std::function<void(void)> callback) {
-    callback_update_slots = std::move(callback);
-}
-
 void server_queue::pop_deferred_task() {
     std::unique_lock<std::mutex> lock(mutex_tasks);
     if (!queue_tasks_deferred.empty()) {
         queue_tasks.emplace_front(std::move(queue_tasks_deferred.front()));
         queue_tasks_deferred.pop_front();
     }
+    time_last_task = ggml_time_ms();
     condition_tasks.notify_one();
 }
 
+void server_queue::wait_until_no_sleep() {
+    std::unique_lock<std::mutex> lock(mutex_tasks);
+    if (!sleeping) {
+        return;
+    } else {
+        if (!req_stop_sleeping) {
+            QUE_DBG("%s", "requesting to stop sleeping\n");
+            req_stop_sleeping = true;
+            condition_tasks.notify_one(); // only main thread is waiting on this
+        }
+        QUE_DBG("%s", "waiting until no sleep\n");
+        condition_tasks.wait(lock, [&]{
+            return !sleeping;
+        });
+    }
+}
+
 void server_queue::terminate() {
     std::unique_lock<std::mutex> lock(mutex_tasks);
     running = false;
     condition_tasks.notify_all();
 }
 
-void server_queue::start_loop() {
+void server_queue::start_loop(int64_t idle_sleep_ms) {
     running = true;
+    time_last_task = ggml_time_ms();
+
+    constexpr auto max_wait_time = std::chrono::seconds(1);
+    auto should_sleep = [&]() -> bool {
+        // caller must hold mutex_tasks
+        if (idle_sleep_ms < 0) {
+            return false;
+        }
+        int64_t now = ggml_time_ms();
+        return (now - time_last_task) >= idle_sleep_ms;
+    };
 
     while (true) {
         QUE_DBG("%s", "processing new tasks\n");

@@ -117,23 +141,53 @@ void server_queue::start_loop() {
             QUE_DBG("processing task, id = %d\n", task.id);
             callback_new_task(std::move(task));
         }
-
         // all tasks in the current loop is processed, slots data is now ready
         QUE_DBG("%s", "update slots\n");
 
+        // this will run the main inference process for all slots
         callback_update_slots();
+        {
+            // update_slots() may take a while to finish, we need to make sure it's not counted as idle
+            std::unique_lock<std::mutex> lock(mutex_tasks);
+            time_last_task = ggml_time_ms();
+        }
 
         QUE_DBG("%s", "waiting for new tasks\n");
-        {
+        while (true) {
             std::unique_lock<std::mutex> lock(mutex_tasks);
-            if (!running) {
-
-                return;
+            if (!running || !queue_tasks.empty()) {
+                break; // go back to process new tasks or terminate
             }
-
+
+            // no tasks, check for sleeping state
+            if (should_sleep()) {
+                QUE_INF("%s", "entering sleeping state\n");
+                sleeping = true;
+                callback_sleeping_state(true);
+                req_stop_sleeping = false;
+                // wait until we are requested to exit sleeping state
                 condition_tasks.wait(lock, [&]{
+                    return (!running || req_stop_sleeping);
+                });
+                if (!running) { // may changed during sleep
+                    break; // terminate
+                }
+                QUE_INF("%s", "exiting sleeping state\n");
+                req_stop_sleeping = false;
+                callback_sleeping_state(false);
+                sleeping = false;
+                time_last_task = ggml_time_ms();
+                condition_tasks.notify_all(); // notify wait_until_no_sleep()
+                break; // process new tasks
+            } else {
+                // wait for new tasks or timeout for checking sleeping condition
+                bool res = condition_tasks.wait_for(lock, max_wait_time, [&]{
                     return (!queue_tasks.empty() || !running);
                 });
+                if (res) {
+                    break; // new task arrived or terminate
+                }
+                // otherwise, loop again to check sleeping condition
             }
         }
     }
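
The start_loop() rewrite above is built around a bounded condition-variable wait: instead of blocking indefinitely, the loop wakes at least once per max_wait_time, checks whether idle_sleep_ms has elapsed since time_last_task, and enters a sleeping state if so. A minimal, self-contained sketch of the same pattern follows (hypothetical names, not the server's actual types):

    #include <chrono>
    #include <condition_variable>
    #include <cstdio>
    #include <deque>
    #include <mutex>
    #include <thread>

    // Sketch of the idle-sleep loop introduced above: the worker waits with a
    // bounded timeout so it can periodically re-check how long it has been idle,
    // and flips into a "sleeping" state once the idle threshold has elapsed.
    struct idle_queue {
        std::mutex              mtx;
        std::condition_variable cv;
        std::deque<int>         tasks;
        bool                    running  = true;
        bool                    sleeping = false;
        std::chrono::steady_clock::time_point last_task = std::chrono::steady_clock::now();

        void loop(std::chrono::milliseconds idle_sleep) {
            while (true) {
                std::unique_lock<std::mutex> lock(mtx);
                while (running && tasks.empty()) {
                    const bool idle = std::chrono::steady_clock::now() - last_task >= idle_sleep;
                    if (idle && !sleeping) {
                        sleeping = true;               // e.g. release heavy resources here
                        std::puts("entering sleep");
                    }
                    // bounded wait: wake at least once per second to re-check idleness
                    cv.wait_for(lock, std::chrono::seconds(1));
                }
                if (!running) {
                    return;
                }
                if (sleeping) {
                    sleeping = false;                  // e.g. re-acquire resources here
                    std::puts("exiting sleep");
                }
                tasks.pop_front();                     // "process" one task
                last_task = std::chrono::steady_clock::now();
            }
        }

        void post(int t) {
            {
                std::lock_guard<std::mutex> lock(mtx);
                tasks.push_back(t);
                last_task = std::chrono::steady_clock::now();
            }
            cv.notify_all();
        }

        void stop() {
            {
                std::lock_guard<std::mutex> lock(mtx);
                running = false;
            }
            cv.notify_all();
        }
    };

    int main() {
        idle_queue q;
        std::thread worker([&]{ q.loop(std::chrono::milliseconds(50)); });
        q.post(1);
        std::this_thread::sleep_for(std::chrono::milliseconds(200)); // long enough to trigger sleep
        q.post(2);                                                   // wakes the sleeping worker
        std::this_thread::sleep_for(std::chrono::milliseconds(20));
        q.stop();
        worker.join();
    }

As in the diff, the key detail is that the wait is time-bounded: a plain wait() could never notice that the queue has been idle for idle_sleep_ms, because no notification arrives when nothing happens.
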
@@ -271,23 +325,25 @@ void server_response::terminate() {
 // server_response_reader
 //
 
-void server_response_reader::post_task(server_task && task) {
+void server_response_reader::post_task(server_task && task, bool front) {
     GGML_ASSERT(id_tasks.empty() && "post_task() can only be called once per reader");
+    task.index = 0;
     id_tasks.insert(task.id);
     states.push_back(task.create_state());
     queue_results.add_waiting_task_id(task.id);
-    queue_tasks.post(std::move(task));
+    queue_tasks.post(std::move(task), front);
 }
 
-void server_response_reader::post_tasks(std::vector<server_task> && tasks) {
+void server_response_reader::post_tasks(std::vector<server_task> && tasks, bool front) {
     GGML_ASSERT(id_tasks.empty() && "post_tasks() can only be called once per reader");
     id_tasks = server_task::get_list_id(tasks);
     states.reserve(tasks.size());
     for (size_t i = 0; i < tasks.size(); i++) {
+        tasks[i].index = i;
         states.push_back(tasks[i].create_state());
     }
     queue_results.add_waiting_tasks(tasks);
-    queue_tasks.post(std::move(tasks));
+    queue_tasks.post(std::move(tasks), front);
 }
 
 bool server_response_reader::has_next() const {

@@ -313,7 +369,7 @@ server_task_result_ptr server_response_reader::next(const std::function<bool()> & should_stop) {
     }
     if (!states.empty()) {
         // update the generation state if needed
-        size_t idx = result->get_index();
+        const size_t idx = result->index;
         GGML_ASSERT(idx < states.size());
         result->update(states[idx]);
     }

@@ -329,6 +385,7 @@ server_task_result_ptr server_response_reader::next(const std::function<bool()> & should_stop) {
 
 server_response_reader::batch_response server_response_reader::wait_for_all(const std::function<bool()> & should_stop) {
     batch_response batch_res;
+    batch_res.results.clear();
     batch_res.results.resize(id_tasks.size());
     while (has_next()) {
         auto res = next(should_stop);

@@ -340,7 +397,7 @@ server_response_reader::batch_response server_response_reader::wait_for_all(const std::function<bool()> & should_stop) {
             batch_res.error = std::move(res);
             return batch_res;
         }
-        const size_t idx = res->get_index();
+        const size_t idx = res->index;
         GGML_ASSERT(idx < batch_res.results.size() && "index out of range");
         GGML_ASSERT(batch_res.results[idx] == nullptr && "duplicate result received");
         batch_res.results[idx] = std::move(res);
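
The reader changes above implement a small scatter-gather scheme: post_tasks() stamps each task with its position in the batch, and wait_for_all() stores each arriving result at results[res->index], so results can complete in any order and still come back in request order. A self-contained sketch of the same idea (illustrative only):

    #include <cassert>
    #include <cstddef>
    #include <future>
    #include <memory>
    #include <vector>

    // Each task carries its batch position; results are written back by index,
    // mirroring the post_tasks()/wait_for_all() pair above.
    struct result {
        size_t index; // position of the originating task in the batch
        int    value;
    };

    int main() {
        const size_t n = 4;

        // scatter: launch tasks tagged with their batch index
        std::vector<std::future<result>> futures;
        for (size_t i = 0; i < n; i++) {
            futures.push_back(std::async(std::launch::async, [i] {
                return result{i, static_cast<int>(i * 10)}; // any completion order
            }));
        }

        // gather: place each result at its index, as wait_for_all() does
        std::vector<std::unique_ptr<result>> results(n);
        for (auto & f : futures) {
            auto res = std::make_unique<result>(f.get());
            const size_t idx = res->index;
            assert(idx < results.size() && "index out of range");
            assert(results[idx] == nullptr && "duplicate result received");
            results[idx] = std::move(res);
        }

        for (size_t i = 0; i < n; i++) {
            assert(results[i]->index == i && results[i]->value == static_cast<int>(i) * 10);
        }
    }
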
--- tools/server/server-queue.h
+++ tools/server/server-queue.h

@@ -5,6 +5,7 @@
 #include <condition_variable>
 #include <deque>
 #include <mutex>
+#include <vector>
 #include <unordered_set>
 
 // struct for managing server tasks

@@ -12,7 +13,10 @@
 struct server_queue {
 private:
     int id = 0;
-    bool running;
+    bool running = false;
+    bool sleeping = false;
+    bool req_stop_sleeping = false;
+    int64_t time_last_task = 0;
 
     // queues
     std::deque<server_task> queue_tasks;

@@ -24,6 +28,7 @@ private:
     // callback functions
     std::function<void(server_task &&)> callback_new_task;
     std::function<void(void)> callback_update_slots;
+    std::function<void(bool)> callback_sleeping_state;
 
 public:
     // Add a new task to the end of the queue

@@ -38,15 +43,18 @@ public:
     // Get the next id for creating a new task
     int get_new_id();
 
-    // Register function to process a new task
-    void on_new_task(std::function<void(server_task &&)> callback);
-
-    // Register the function to be called when all slots data is ready to be processed
-    void on_update_slots(std::function<void(void)> callback);
-
     // Call when the state of one slot is changed, it will move one task from deferred to main queue
     void pop_deferred_task();
 
+    // if sleeping, request exiting sleep state and wait until it is done
+    // returns immediately if not sleeping
+    void wait_until_no_sleep();
+
+    bool is_sleeping() {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        return sleeping;
+    }
+
     // end the start_loop routine
     void terminate();
 

@@ -56,8 +64,15 @@ public:
      * - Process the task (i.e. maybe copy data into slot)
      * - Check if multitask is finished
      * - Update all slots
+     *
+     * Sleeping procedure (disabled if idle_sleep_ms < 0):
+     * - If there is no task after idle_sleep_ms, enter sleeping state
+     * - Call callback_sleeping_state(true)
+     * - Wait until req_stop_sleeping is set to true
+     * - Call callback_sleeping_state(false)
+     * - Exit sleeping state
      */
-    void start_loop();
+    void start_loop(int64_t idle_sleep_ms = -1);
 
     // for metrics
     size_t queue_tasks_deferred_size() {

@@ -65,6 +80,27 @@ public:
         return queue_tasks_deferred.size();
     }
 
+    //
+    // Functions below are not thread-safe, must only be used before start_loop() is called
+    //
+
+    // Register function to process a new task
+    void on_new_task(std::function<void(server_task &&)> callback) {
+        callback_new_task = std::move(callback);
+    }
+
+    // Register the function to be called when all slots data is ready to be processed
+    void on_update_slots(std::function<void(void)> callback) {
+        callback_update_slots = std::move(callback);
+    }
+
+    // Register callback for sleeping state change
+    // note: when entering sleeping state, the callback is called AFTER sleeping is set to true
+    //       when leaving sleeping state, the callback is called BEFORE sleeping is set to false
+    void on_sleeping_state(std::function<void(bool)> callback) {
+        callback_sleeping_state = std::move(callback);
+    }
+
 private:
     void cleanup_pending_task(int id_target);
 };
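
The wait_until_no_sleep()/is_sleeping() additions above describe a two-flag handshake on a single condition variable: a request thread raises req_stop_sleeping and waits for !sleeping, while the main loop wakes on req_stop_sleeping, clears sleeping, and notifies everyone back. A minimal, self-contained sketch of that handshake (standalone code, not the server's implementation):

    #include <condition_variable>
    #include <cstdio>
    #include <mutex>
    #include <thread>

    std::mutex              mtx;
    std::condition_variable cv;
    bool sleeping          = true;
    bool req_stop_sleeping = false;

    void wait_until_no_sleep() {       // called from a request thread
        std::unique_lock<std::mutex> lock(mtx);
        if (!sleeping) {
            return;
        }
        req_stop_sleeping = true;      // ask the main loop to wake up
        cv.notify_one();
        cv.wait(lock, []{ return !sleeping; });
    }

    void sleeper() {                   // stands in for the sleeping main loop
        std::unique_lock<std::mutex> lock(mtx);
        cv.wait(lock, []{ return req_stop_sleeping; });
        std::puts("waking up (e.g. reload the model here)");
        req_stop_sleeping = false;
        sleeping = false;
        cv.notify_all();               // release every wait_until_no_sleep() caller
    }

    int main() {
        std::thread main_loop(sleeper);
        std::thread request(wait_until_no_sleep);
        request.join();                // returns only after sleep has ended
        main_loop.join();
    }

Note the ordering contract spelled out in the on_sleeping_state() comment above: the state-change callback runs after sleeping flips to true on entry, and before it flips back to false on exit, so observers never see a "not sleeping" state while resources are still released.
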
@@ -138,8 +174,10 @@ struct server_response_reader {
     int get_new_id() {
         return queue_tasks.get_new_id();
     }
-    void post_task(server_task && task);
-    void post_tasks(std::vector<server_task> && tasks);
+
+    // if front = true, the task will be posted to the front of the queue (high priority)
+    void post_task(server_task && task, bool front = false);
+    void post_tasks(std::vector<server_task> && tasks, bool front = false);
     bool has_next() const;
 
     // return nullptr if should_stop() is true before receiving a result
--- tools/server/server-task.cpp
+++ tools/server/server-task.cpp

@@ -32,8 +32,8 @@ json task_params::to_json(bool only_metrics) const {
     }
 
     json lora = json::array();
-    for (size_t i = 0; i < this->lora.size(); ++i) {
-        lora.push_back({{"id", i}, {"scale", this->lora[i].scale}});
+    for (auto & it : this->lora) {
+        lora.push_back({{"id", it.first}, {"scale", it.second}});
     }
 
     if (only_metrics) {

@@ -78,6 +78,7 @@ json task_params::to_json(bool only_metrics) const {
         {"speculative.p_min", speculative.p_min},
         {"timings_per_token", timings_per_token},
         {"post_sampling_probs", post_sampling_probs},
+        {"backend_sampling", sampling.backend_sampling},
         {"lora", lora},
     };
 }

@@ -136,6 +137,7 @@ json task_params::to_json(bool only_metrics) const {
         {"speculative.p_min", speculative.p_min},
         {"timings_per_token", timings_per_token},
         {"post_sampling_probs", post_sampling_probs},
+        {"backend_sampling", sampling.backend_sampling},
         {"lora", lora},
     };
 }

@@ -145,12 +147,10 @@ json task_params::to_json(bool only_metrics) const {
 //
 
 task_params server_task::params_from_json_cmpl(
-        const llama_context * ctx,
+        const llama_vocab * vocab,
         const common_params & params_base,
+        const int n_ctx_slot,
         const json & data) {
-    const llama_model * model = llama_get_model(ctx);
-    const llama_vocab * vocab = llama_model_get_vocab(model);
-
     task_params params;
 
     // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)

@@ -206,6 +206,7 @@ task_params server_task::params_from_json_cmpl(
     params.sampling.seed = json_value(data, "seed", defaults.sampling.seed);
     params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs);
     params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep);
+    params.sampling.backend_sampling = json_value(data, "backend_sampling", defaults.sampling.backend_sampling);
     params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs);
 
     params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min);

@@ -223,12 +224,12 @@ task_params server_task::params_from_json_cmpl(
 
     if (data.contains("lora")) {
         if (data.at("lora").is_array()) {
-            params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora"));
+            params.lora = parse_lora_request(data.at("lora"));
         } else {
             throw std::runtime_error("Error: 'lora' must be an array of objects with 'id' and 'scale' fields");
         }
     } else {
-        params.lora = params_base.lora_adapters;
+        params.lora = {};
     }
 
     // TODO: add more sanity checks for the input parameters
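
Taken together with the header change below (std::map<int, float> lora), these hunks switch per-request LoRA configuration from a full adapter list to a sparse adapter-ID -> scale map, so a request only overrides the adapters it names. A sketch of parsing a request's "lora" array into that representation, assuming the nlohmann::json type used throughout these files (parse_lora_scales is a hypothetical helper, not necessarily the body of parse_lora_request):

    #include <map>
    #include <stdexcept>
    #include <nlohmann/json.hpp>

    using json = nlohmann::json;

    // Hypothetical parser: turns [{"id": 0, "scale": 0.5}, ...] into a sparse map.
    static std::map<int, float> parse_lora_scales(const json & arr) {
        std::map<int, float> lora;
        for (const auto & entry : arr) {
            if (!entry.contains("id") || !entry.contains("scale")) {
                throw std::runtime_error("'lora' entries need 'id' and 'scale' fields");
            }
            lora[entry.at("id").get<int>()] = entry.at("scale").get<float>();
        }
        return lora;
    }

    int main() {
        const json req = json::parse(R"({"lora": [{"id": 0, "scale": 0.5}, {"id": 2, "scale": 1.0}]})");
        const auto scales = parse_lora_scales(req.at("lora"));
        // scales == {{0, 0.5f}, {2, 1.0f}}; unlisted adapters keep their defaults
        return scales.size() == 2 ? 0 : 1;
    }
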
@@ -243,11 +244,11 @@ task_params server_task::params_from_json_cmpl(
 
     if (params.sampling.penalty_last_n == -1) {
         // note: should be the slot's context and not the full context, but it's ok
-        params.sampling.penalty_last_n = llama_n_ctx(ctx);
+        params.sampling.penalty_last_n = n_ctx_slot;
     }
 
     if (params.sampling.dry_penalty_last_n == -1) {
-        params.sampling.dry_penalty_last_n = llama_n_ctx(ctx);
+        params.sampling.dry_penalty_last_n = n_ctx_slot;
     }
 
     if (params.sampling.dry_base < 1.0f) {

@@ -1153,7 +1154,7 @@ json server_task_result_rerank::to_json() {
 json server_task_result_cmpl_partial::to_json_anthropic() {
     json events = json::array();
     bool first = (n_decoded == 1);
-
+    bool text_block_started = false;
 
     if (first) {
         text_block_started = false;

@@ -1324,6 +1325,30 @@ json server_task_result_slot_erase::to_json() {
     };
 }
 
+//
+// server_task_result_get_lora
+//
+
+json server_task_result_get_lora::to_json() {
+    json result = json::array();
+    for (size_t i = 0; i < loras.size(); ++i) {
+        auto & lora = loras[i];
+        json entry = {
+            {"id", i},
+            {"path", lora.info.path},
+            {"scale", lora.info.scale},
+            {"task_name", lora.info.task_name},
+            {"prompt_prefix", lora.info.prompt_prefix},
+        };
+        if (!lora.alora_invocation_tokens.empty()) {
+            entry["alora_invocation_string"] = lora.alora_invocation_string;
+            entry["alora_invocation_tokens"] = lora.alora_invocation_tokens;
+        }
+        result.push_back(std::move(entry));
+    }
+    return result;
+}
+
 //
 // server_task_result_apply_lora
 //
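
For reference, the snippet below builds the same JSON shape that server_task_result_get_lora::to_json() above produces, one entry per adapter with the aLoRA fields present only when invocation tokens exist. It is a self-contained illustration with made-up sample values (path, task name, and tokens are not real), assuming nlohmann::json:

    #include <iostream>
    #include <vector>
    #include <nlohmann/json.hpp>

    using json = nlohmann::json;

    int main() {
        json result = json::array();

        json entry = {
            {"id", 0},
            {"path", "/models/adapters/example.gguf"}, // hypothetical path
            {"scale", 0.5},
            {"task_name", "example-task"},             // hypothetical name
            {"prompt_prefix", ""},
        };
        // only present when the adapter has aLoRA invocation tokens
        entry["alora_invocation_string"] = "<invoke>";            // illustrative
        entry["alora_invocation_tokens"] = std::vector<int>{1, 2, 3};
        result.push_back(std::move(entry));

        std::cout << result.dump(2) << std::endl;
    }
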
--- tools/server/server-task.h
+++ tools/server/server-task.h

@@ -6,6 +6,7 @@
 #include <string>
 #include <unordered_set>
 #include <list>
+#include <map>
 
 // TODO: prevent including the whole server-common.h as we only use server_tokens
 #include "server-common.h"

@@ -23,6 +24,7 @@ enum server_task_type {
     SERVER_TASK_TYPE_SLOT_SAVE,
     SERVER_TASK_TYPE_SLOT_RESTORE,
     SERVER_TASK_TYPE_SLOT_ERASE,
+    SERVER_TASK_TYPE_GET_LORA,
     SERVER_TASK_TYPE_SET_LORA,
 };
 

@@ -60,7 +62,7 @@ struct task_params {
     int64_t t_max_prompt_ms = -1; // TODO: implement
     int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
 
-    std::vector<common_adapter_lora_info> lora;
+    std::map<int, float> lora; // mapping adapter ID -> scale
 
     std::vector<std::string> antiprompt;
     std::vector<std::string> response_fields;

@@ -105,8 +107,10 @@ struct task_result_state {
 };
 
 struct server_task {
-    int id    = -1; // to be filled by server_queue
-    int index = -1; // used when there are multiple prompts (batch request)
+    int id = -1; // to be filled by server_queue
+
+    // TODO @ngxson : remove this field and implement a mapping task_id -> idx in the response_reader
+    size_t index = 0; // used when there are multiple prompts (batch request)
 
     // used by SERVER_TASK_TYPE_CANCEL
     int id_target = -1;

@@ -138,7 +142,7 @@ struct server_task {
     bool metrics_reset_bucket = false;
 
     // used by SERVER_TASK_TYPE_SET_LORA
-    std::vector<common_adapter_lora_info> set_lora;
+    std::map<int, float> set_lora; // mapping adapter ID -> scale
 
     server_task() = default;
 

@@ -149,9 +153,10 @@ struct server_task {
     }
 
     static task_params params_from_json_cmpl(
-            const llama_context * ctx,
-            const common_params & params_base,
-            const json & data);
+            const llama_vocab * vocab,
+            const common_params & params_base,
+            const int n_ctx_slot,
+            const json & data);
 
     // utility function
     static std::unordered_set<int> get_list_id(const std::vector<server_task> & tasks) {

@@ -162,10 +167,9 @@ struct server_task {
         return ids;
     }
 
-    server_task create_child(int id_parent, int id_child, int idx) const {
+    server_task create_child(int id_parent, int id_child) const {
         server_task copy;
         copy.id = id_child;
-        copy.index = idx;
         copy.id_parent = id_parent;
         copy.params = params;
         copy.type = type;

@@ -212,6 +216,10 @@ struct result_prompt_progress {
 struct server_task_result {
     int id = -1;
     int id_slot = -1;
+
+    // TODO @ngxson : remove this field and implement a mapping task_id -> idx in the response_reader
+    size_t index = 0; // to be used for batched tasks
+
     virtual bool is_error() {
         // only used by server_task_result_error
         return false;

@@ -220,9 +228,6 @@ struct server_task_result {
         // only used by server_task_result_cmpl_*
         return true;
     }
-    virtual int get_index() {
-        return -1;
-    }
     virtual void update(task_result_state &) {
         // only used by server_task_result_cmpl_*
     }

@@ -255,8 +260,6 @@ struct completion_token_output {
 };
 
 struct server_task_result_cmpl_final : server_task_result {
-    int index = 0;
-
     std::string content;
     llama_tokens tokens;
 

@@ -289,10 +292,6 @@ struct server_task_result_cmpl_final : server_task_result {
     std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
     bool is_updated = false;
 
-    virtual int get_index() override {
-        return index;
-    }
-
     virtual bool is_stop() override {
         return true; // in stream mode, final responses are considered stop
     }

@@ -318,8 +317,6 @@ struct server_task_result_cmpl_final : server_task_result {
 };
 
 struct server_task_result_cmpl_partial : server_task_result {
-    int index = 0;
-
     std::string content;
     llama_tokens tokens;
 

@@ -340,10 +337,6 @@ struct server_task_result_cmpl_partial : server_task_result {
     std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
     bool is_updated = false;
 
-    virtual int get_index() override {
-        return index;
-    }
-
     virtual bool is_stop() override {
         return false; // in stream mode, partial responses are not considered stop
     }

@@ -365,7 +358,6 @@ struct server_task_result_cmpl_partial : server_task_result {
 };
 
 struct server_task_result_embd : server_task_result {
-    int index = 0;
     std::vector<std::vector<float>> embedding;
 
     int32_t n_tokens;

@@ -373,10 +365,6 @@ struct server_task_result_embd : server_task_result {
     // response formatting
     task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
 
-    virtual int get_index() override {
-        return index;
-    }
-
     virtual json to_json() override;
 
     json to_json_non_oaicompat();

@@ -385,20 +373,14 @@ struct server_task_result_embd : server_task_result {
 };
 
 struct server_task_result_rerank : server_task_result {
-    int index = 0;
     float score = -1e6;
 
     int32_t n_tokens;
 
-    virtual int get_index() override {
-        return index;
-    }
-
     virtual json to_json() override;
 };
 
 struct server_task_result_error : server_task_result {
-    int index = 0;
     error_type err_type = ERROR_TYPE_SERVER;
     std::string err_msg;
 

@@ -460,6 +442,17 @@ struct server_task_result_slot_erase : server_task_result {
     virtual json to_json() override;
 };
 
+struct server_task_result_get_lora : server_task_result {
+    struct lora {
+        common_adapter_lora_info info;
+        std::string alora_invocation_string;
+        llama_tokens alora_invocation_tokens;
+    };
+    std::vector<lora> loras;
+
+    virtual json to_json() override;
+};
+
 struct server_task_result_apply_lora : server_task_result {
     virtual json to_json() override;
 };
--- tools/server/server.cpp
+++ tools/server/server.cpp

@@ -66,7 +66,7 @@ static server_http_context::handler_t ex_wrapper(server_http_context::handler_t
     };
 }
 
-int main(int argc, char ** argv, char ** envp) {
+int main(int argc, char ** argv) {
     // own arguments required by this example
     common_params params;
 

@@ -119,14 +119,14 @@ int main(int argc, char ** argv, char ** envp) {
     //
 
     // register API routes
-    server_routes routes(params, ctx_server
+    server_routes routes(params, ctx_server);
 
     bool is_router_server = params.model.path.empty();
     std::optional<server_models_routes> models_routes{};
     if (is_router_server) {
         // setup server instances manager
         try {
-            models_routes.emplace(params, argc, argv, envp);
+            models_routes.emplace(params, argc, argv);
         } catch (const std::exception & e) {
             LOG_ERR("%s: failed to initialize router models: %s\n", __func__, e.what());
             return 1;

@@ -252,7 +252,7 @@ int main(int argc, char ** argv, char ** envp) {
         return 1;
     }
 
-
+    routes.update_meta(ctx_server);
     ctx_http.is_ready.store(true);
 
     LOG_INF("%s: model loaded\n", __func__);

@@ -309,7 +309,11 @@ int main(int argc, char ** argv, char ** envp) {
         if (monitor_thread.joinable()) {
            monitor_thread.join();
         }
-
+
+        auto * ll_ctx = ctx_server.get_llama_context();
+        if (ll_ctx != nullptr) {
+            llama_memory_breakdown_print(ll_ctx);
+        }
     }
 
     return 0;