llama-cpp-pydist 0.19.0-py3-none-any.whl → 0.21.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llama_cpp/binaries/{llama-b7488-bin-win-cpu-x64.zip → llama-b7631-bin-win-cpu-x64.zip} +0 -0
- llama_cpp_pydist-0.21.0.dist-info/METADATA +4684 -0
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/RECORD +240 -222
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/WHEEL +1 -1
- vendor_llama_cpp_pydist/llama.cpp/.devops/cuda-new.Dockerfile +95 -0
- vendor_llama_cpp_pydist/llama.cpp/.gemini/settings.json +1 -0
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +2 -1
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/011-bug-results.yml +13 -2
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/019-bug-misc.yml +13 -2
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/build.yml +18 -6
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/docker.yml +25 -13
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/release.yml +9 -5
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server.yml +18 -0
- vendor_llama_cpp_pydist/llama.cpp/AGENTS.md +81 -0
- vendor_llama_cpp_pydist/llama.cpp/CLAUDE.md +1 -0
- vendor_llama_cpp_pydist/llama.cpp/CONTRIBUTING.md +34 -5
- vendor_llama_cpp_pydist/llama.cpp/ci/run.sh +2 -1
- vendor_llama_cpp_pydist/llama.cpp/common/CMakeLists.txt +4 -3
- vendor_llama_cpp_pydist/llama.cpp/common/arg.cpp +46 -14
- vendor_llama_cpp_pydist/llama.cpp/common/arg.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/common/chat-parser.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/common/chat.cpp +36 -7
- vendor_llama_cpp_pydist/llama.cpp/common/chat.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/common/common.cpp +42 -23
- vendor_llama_cpp_pydist/llama.cpp/common/common.h +7 -2
- vendor_llama_cpp_pydist/llama.cpp/common/llguidance.cpp +10 -6
- vendor_llama_cpp_pydist/llama.cpp/common/regex-partial.cpp +13 -13
- vendor_llama_cpp_pydist/llama.cpp/common/sampling.cpp +58 -14
- vendor_llama_cpp_pydist/llama.cpp/common/sampling.h +3 -1
- vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py +424 -103
- vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf_update.py +5 -0
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/CANN.md +4 -0
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/OPENCL.md +51 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/SYCL.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/README.md +5 -5
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/developer.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/build.md +21 -2
- vendor_llama_cpp_pydist/llama.cpp/docs/development/parsing.md +2 -2
- vendor_llama_cpp_pydist/llama.cpp/docs/ops/Metal.csv +360 -322
- vendor_llama_cpp_pydist/llama.cpp/docs/ops.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/CMakeLists.txt +13 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml-backend.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/CMakeLists.txt +23 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-backend.cpp +11 -11
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +303 -19
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +17 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/common.h +153 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +51 -158
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +66 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cu +48 -27
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cuh +16 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/common.cuh +45 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cpy.cu +117 -103
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cumsum.cu +105 -35
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +3 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +83 -33
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mean.cu +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mma.cuh +21 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cu +34 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +168 -13
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cu +151 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cuh +14 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/softmax.cu +203 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cu +96 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cuh +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +17 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +6 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +224 -758
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +316 -164
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +5 -11
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +46 -15
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +9 -3
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +2 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +20 -20
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-impl.h +0 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +57 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +5 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +20 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +71 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +73 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +134 -13
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +21 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +13 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +14 -7
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +42 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +742 -315
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +28 -14
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +1 -7
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +2 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +17 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +42 -24
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +115 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +10 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +29 -18
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +19 -16
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +10 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +8 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +11 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +57 -22
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +312 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +54 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/constants.py +99 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/gguf_writer.py +38 -2
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/tensor_mapping.py +26 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/utility.py +0 -8
- vendor_llama_cpp_pydist/llama.cpp/grammars/README.md +3 -0
- vendor_llama_cpp_pydist/llama.cpp/include/llama.h +99 -12
- vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-cli.sh +9 -9
- vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-completion.sh +53 -0
- vendor_llama_cpp_pydist/llama.cpp/scripts/sync-ggml.last +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/CMakeLists.txt +4 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.cpp +12 -3
- vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.h +7 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.cpp +76 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.h +7 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-context.cpp +625 -40
- vendor_llama_cpp_pydist/llama.cpp/src/llama-context.h +43 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.cpp +40 -13
- vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.cpp +166 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.h +71 -6
- vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.h +6 -5
- vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.cpp +11 -4
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.cpp +23 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp +329 -26
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model.h +13 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.cpp +1259 -186
- vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.h +19 -7
- vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.cpp +101 -33
- vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama.cpp +53 -38
- vendor_llama_cpp_pydist/llama.cpp/src/models/afmoe.cpp +9 -5
- vendor_llama_cpp_pydist/llama.cpp/src/models/bert.cpp +4 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/cogvlm.cpp +5 -3
- vendor_llama_cpp_pydist/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/deepseek2.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma-embedding.cpp +2 -6
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3.cpp +3 -4
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
- vendor_llama_cpp_pydist/llama.cpp/src/models/llama-iswa.cpp +6 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/llama.cpp +19 -6
- vendor_llama_cpp_pydist/llama.cpp/src/models/maincoder.cpp +117 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/models.h +18 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/modern-bert.cpp +116 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/plamo3.cpp +128 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/smallthinker.cpp +11 -5
- vendor_llama_cpp_pydist/llama.cpp/src/unicode.cpp +23 -14
- vendor_llama_cpp_pydist/llama.cpp/tests/CMakeLists.txt +12 -2
- vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-ops.cpp +286 -65
- vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-sampler.cpp +1237 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-chat.cpp +29 -3
- vendor_llama_cpp_pydist/llama.cpp/tests/test-grammar-llguidance.cpp +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-regex-partial.cpp +14 -14
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-0.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-bpe.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-spm.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/batched-bench/batched-bench.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/cli/README.md +187 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/cli/cli.cpp +1 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/completion/README.md +179 -7
- vendor_llama_cpp_pydist/llama.cpp/tools/completion/completion.cpp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/fit-params.cpp +3 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/llama-bench/llama-bench.cpp +18 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/CMakeLists.txt +1 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-impl.h +12 -7
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-model.h +3 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp +118 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/models.h +10 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/siglip.cpp +9 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/whisper-enc.cpp +9 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/youtuvl.cpp +179 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.cpp +5 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.h +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/quantize/quantize.cpp +6 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/CMakeLists.txt +0 -8
- vendor_llama_cpp_pydist/llama.cpp/tools/server/README-dev.md +2 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/README.md +27 -14
- vendor_llama_cpp_pydist/llama.cpp/tools/server/public/index.html.gz +0 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.cpp +22 -24
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.h +2 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.cpp +453 -267
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.h +52 -15
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-http.cpp +16 -10
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.cpp +174 -62
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.h +14 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.cpp +78 -21
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.h +48 -10
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.cpp +36 -11
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.h +28 -35
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server.cpp +9 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_chat_completion.py +11 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_sleep.py +39 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/utils.py +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte +25 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte +66 -13
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte +5 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/constants/settings-config.ts +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts +125 -11
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/chat.ts +15 -8
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts +12 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/settings.svelte.ts +4 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/api.d.ts +5 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/settings.d.ts +2 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/clipboard.ts +1 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/routes/+layout.svelte +1 -1
- llama_cpp_pydist-0.19.0.dist-info/METADATA +0 -2506
- vendor_llama_cpp_pydist/llama.cpp/.github/copilot-instructions.md +0 -262
- {llama_cpp_pydist-0.19.0.dist-info/licenses → llama_cpp_pydist-0.21.0.dist-info}/LICENSE +0 -0
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/top_level.txt +0 -0
vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_chat_completion.py
CHANGED

@@ -434,8 +434,8 @@ def test_context_size_exceeded_stream():
 @pytest.mark.parametrize(
     "n_batch,batch_count,reuse_cache",
     [
-        (64,
-        (64,
+        (64, 4, False),
+        (64, 2, True),
     ]
 )
 def test_return_progress(n_batch, batch_count, reuse_cache):

@@ -462,10 +462,18 @@ def test_return_progress(n_batch, batch_count, reuse_cache):
     res = make_cmpl_request()
     last_progress = None
     total_batch_count = 0
+
     for data in res:
         cur_progress = data.get("prompt_progress", None)
         if cur_progress is None:
             continue
+        if total_batch_count == 0:
+            # first progress report must have n_cache == n_processed
+            assert cur_progress["total"] > 0
+            assert cur_progress["cache"] == cur_progress["processed"]
+            if reuse_cache:
+                # when reusing cache, we expect some cached tokens
+                assert cur_progress["cache"] > 0
         if last_progress is not None:
             assert cur_progress["total"] == last_progress["total"]
             assert cur_progress["cache"] == last_progress["cache"]

@@ -473,6 +481,7 @@ def test_return_progress(n_batch, batch_count, reuse_cache):
         total_batch_count += 1
         last_progress = cur_progress

+    # last progress should indicate completion (all tokens processed)
    assert last_progress is not None
    assert last_progress["total"] > 0
    assert last_progress["processed"] == last_progress["total"]
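The hunks above extend test_return_progress, which relies on the server emitting a prompt_progress object (total, cache, processed) in streamed completion chunks when return_progress is requested. As a rough companion sketch outside the test harness (host/port, SSE framing, and the requests usage are assumptions; the field names come from the test above):

# Minimal sketch, assuming llama-server is listening on localhost:8080 and
# streams SSE lines prefixed with "data: "; field names follow the test above.
import json
import requests

payload = {
    "prompt": "Hello " * 512,
    "n_predict": 16,
    "stream": True,
    "return_progress": True,
}

with requests.post("http://localhost:8080/completion", json=payload, stream=True) as resp:
    for line in resp.iter_lines():
        if not line or not line.startswith(b"data: "):
            continue
        chunk = line[len(b"data: "):]
        if chunk == b"[DONE]":  # some endpoints end the stream with a sentinel
            break
        data = json.loads(chunk)
        progress = data.get("prompt_progress")
        if progress is not None:
            # "processed" and "total" include cached tokens; "cache" is the reused prefix
            done = progress["processed"] - progress["cache"]
            total = progress["total"] - progress["cache"]
            print(f"prompt processing: {done}/{total} new tokens")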
vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_sleep.py
ADDED

@@ -0,0 +1,39 @@
+import pytest
+import time
+from utils import *
+
+server = ServerPreset.tinyllama2()
+
+
+@pytest.fixture(autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+
+
+def test_server_sleep():
+    global server
+    server.sleep_idle_seconds = 1
+    server.start()
+
+    # wait a bit so that server can go to sleep
+    time.sleep(2)
+
+    # make sure these endpoints are still responsive after sleep
+    res = server.make_request("GET", "/health")
+    assert res.status_code == 200
+    res = server.make_request("GET", "/props")
+    assert res.status_code == 200
+    assert res.body["is_sleeping"] == True
+
+    # make a generation request to wake up the server
+    res = server.make_request("POST", "/completion", data={
+        "n_predict": 1,
+        "prompt": "Hello",
+    })
+    assert res.status_code == 200
+
+    # it should no longer be sleeping
+    res = server.make_request("GET", "/props")
+    assert res.status_code == 200
+    assert res.body["is_sleeping"] == False
vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/utils.py
CHANGED

@@ -100,6 +100,7 @@ class ServerProcess:
     server_path: str | None = None
     mmproj_url: str | None = None
     media_path: str | None = None
+    sleep_idle_seconds: int | None = None

     # session variables
     process: subprocess.Popen | None = None

@@ -230,6 +231,8 @@ class ServerProcess:
             server_args.extend(["--mmproj-url", self.mmproj_url])
         if self.media_path:
             server_args.extend(["--media-path", self.media_path])
+        if self.sleep_idle_seconds is not None:
+            server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds])

         args = [str(arg) for arg in [server_path, *server_args]]
         print(f"tests: starting server with: {' '.join(args)}")
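Together with test_sleep.py above, this wires the new --sleep-idle-seconds flag into the test harness. A hand-run equivalent might look roughly like the sketch below; the binary name, model path, port, and sleep durations are placeholders, while the flag and the is_sleeping property come straight from the diffs above:

# Rough sketch: start the server with an idle-sleep timeout and watch /props.
# Assumptions: a llama-server binary on PATH and a placeholder local model;
# --sleep-idle-seconds and "is_sleeping" are taken from the diffs above.
import subprocess
import time
import requests

proc = subprocess.Popen([
    "llama-server",
    "-m", "model.gguf",            # placeholder model path
    "--port", "8080",
    "--sleep-idle-seconds", "5",
])
try:
    time.sleep(15)  # wait for the model to load, then for the idle timeout
    print(requests.get("http://localhost:8080/props").json().get("is_sleeping"))

    # any generation request should wake the server back up
    requests.post("http://localhost:8080/completion",
                  json={"prompt": "Hello", "n_predict": 1})
    print(requests.get("http://localhost:8080/props").json().get("is_sleeping"))
finally:
    proc.terminate()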
vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte
CHANGED

@@ -89,6 +89,7 @@
 	const fallbackToolCalls = $derived(typeof toolCallContent === 'string' ? toolCallContent : null);

 	const processingState = useProcessingState();
+
 	let currentConfig = $derived(config());
 	let isRouter = $derived(isRouterMode());
 	let displayedModel = $derived((): string | null => {

@@ -116,6 +117,12 @@
 		}
 	});

+	$effect(() => {
+		if (isLoading() && !message?.content?.trim()) {
+			processingState.startMonitoring();
+		}
+	});
+
 	function formatToolCallBadge(toolCall: ApiChatCompletionToolCall, index: number) {
 		const callNumber = index + 1;
 		const functionName = toolCall.function?.name?.trim();

@@ -186,7 +193,7 @@
 	<div class="mt-6 w-full max-w-[48rem]" in:fade>
 		<div class="processing-container">
 			<span class="processing-text">
-				{processingState.getProcessingMessage()}
+				{processingState.getPromptProgressText() ?? processingState.getProcessingMessage()}
 			</span>
 		</div>
 	</div>

@@ -263,6 +270,23 @@
 				predictedTokens={message.timings.predicted_n}
 				predictedMs={message.timings.predicted_ms}
 			/>
+		{:else if isLoading() && currentConfig.showMessageStats}
+			{@const liveStats = processingState.getLiveProcessingStats()}
+			{@const genStats = processingState.getLiveGenerationStats()}
+			{@const promptProgress = processingState.processingState?.promptProgress}
+			{@const isStillProcessingPrompt =
+				promptProgress && promptProgress.processed < promptProgress.total}
+
+			{#if liveStats || genStats}
+				<ChatMessageStatistics
+					isLive={true}
+					isProcessingPrompt={!!isStillProcessingPrompt}
+					promptTokens={liveStats?.tokensProcessed}
+					promptMs={liveStats?.timeMs}
+					predictedTokens={genStats?.tokensGenerated}
+					predictedMs={genStats?.timeMs}
+				/>
+			{/if}
 		{/if}
 	</div>
 {/if}
vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte
CHANGED

@@ -5,21 +5,64 @@
 	import { ChatMessageStatsView } from '$lib/enums';

 	interface Props {
-		predictedTokens
-		predictedMs
+		predictedTokens?: number;
+		predictedMs?: number;
 		promptTokens?: number;
 		promptMs?: number;
+		// Live mode: when true, shows stats during streaming
+		isLive?: boolean;
+		// Whether prompt processing is still in progress
+		isProcessingPrompt?: boolean;
+		// Initial view to show (defaults to READING in live mode)
+		initialView?: ChatMessageStatsView;
 	}

-	let {
+	let {
+		predictedTokens,
+		predictedMs,
+		promptTokens,
+		promptMs,
+		isLive = false,
+		isProcessingPrompt = false,
+		initialView = ChatMessageStatsView.GENERATION
+	}: Props = $props();

-	let activeView: ChatMessageStatsView = $state(
+	let activeView: ChatMessageStatsView = $state(initialView);
+	let hasAutoSwitchedToGeneration = $state(false);

-
-
+	// In live mode: auto-switch to GENERATION tab when prompt processing completes
+	$effect(() => {
+		if (isLive) {
+			// Auto-switch to generation tab only when prompt processing is done (once)
+			if (
+				!hasAutoSwitchedToGeneration &&
+				!isProcessingPrompt &&
+				predictedTokens &&
+				predictedTokens > 0
+			) {
+				activeView = ChatMessageStatsView.GENERATION;
+				hasAutoSwitchedToGeneration = true;
+			} else if (!hasAutoSwitchedToGeneration) {
+				// Stay on READING while prompt is still being processed
+				activeView = ChatMessageStatsView.READING;
+			}
+		}
+	});
+
+	let hasGenerationStats = $derived(
+		predictedTokens !== undefined &&
+			predictedTokens > 0 &&
+			predictedMs !== undefined &&
+			predictedMs > 0
+	);
+
+	let tokensPerSecond = $derived(hasGenerationStats ? (predictedTokens! / predictedMs!) * 1000 : 0);
+	let timeInSeconds = $derived(
+		predictedMs !== undefined ? (predictedMs / 1000).toFixed(2) : '0.00'
+	);

 	let promptTokensPerSecond = $derived(
-		promptTokens !== undefined && promptMs !== undefined
+		promptTokens !== undefined && promptMs !== undefined && promptMs > 0
 			? (promptTokens / promptMs) * 1000
 			: undefined
 	);

@@ -34,11 +77,14 @@
 		promptTokensPerSecond !== undefined &&
 			promptTimeInSeconds !== undefined
 	);
+
+	// In live mode, generation tab is disabled until we have generation stats
+	let isGenerationDisabled = $derived(isLive && !hasGenerationStats);
 </script>

 <div class="inline-flex items-center text-xs text-muted-foreground">
 	<div class="inline-flex items-center rounded-sm bg-muted-foreground/15 p-0.5">
-		{#if hasPromptStats}
+		{#if hasPromptStats || isLive}
 			<Tooltip.Root>
 				<Tooltip.Trigger>
 					<button

@@ -65,25 +111,32 @@
 						class="inline-flex h-5 w-5 items-center justify-center rounded-sm transition-colors {activeView ===
 						ChatMessageStatsView.GENERATION
 							? 'bg-background text-foreground shadow-sm'
-							:
-
+							: isGenerationDisabled
+								? 'cursor-not-allowed opacity-40'
+								: 'hover:text-foreground'}"
+						onclick={() => !isGenerationDisabled && (activeView = ChatMessageStatsView.GENERATION)}
+						disabled={isGenerationDisabled}
 					>
 						<Sparkles class="h-3 w-3" />
 						<span class="sr-only">Generation</span>
 					</button>
 				</Tooltip.Trigger>
 				<Tooltip.Content>
-					<p>
+					<p>
+						{isGenerationDisabled
+							? 'Generation (waiting for tokens...)'
+							: 'Generation (token output)'}
+					</p>
 				</Tooltip.Content>
 			</Tooltip.Root>
 		</div>

 		<div class="flex items-center gap-1 px-2">
-			{#if activeView === ChatMessageStatsView.GENERATION}
+			{#if activeView === ChatMessageStatsView.GENERATION && hasGenerationStats}
 				<BadgeChatStatistic
 					class="bg-transparent"
 					icon={WholeWord}
-					value="{predictedTokens} tokens"
+					value="{predictedTokens?.toLocaleString()} tokens"
 					tooltipLabel="Generated tokens"
 				/>
 				<BadgeChatStatistic
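Both statistic views derive their speed the same way: tokens divided by elapsed milliseconds, scaled by 1000. A quick worked check of that arithmetic with made-up numbers:

# Illustrative numbers only: 256 generated tokens in 3200 ms.
predicted_n, predicted_ms = 256, 3200
tokens_per_second = (predicted_n / predicted_ms) * 1000  # 80.0 tok/s
time_in_seconds = predicted_ms / 1000                     # 3.2 s
print(f"{tokens_per_second:.1f} tok/s over {time_in_seconds:.2f} s")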
vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/constants/settings-config.ts
CHANGED

@@ -21,6 +21,7 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> =
 	autoMicOnEmpty: false,
 	// make sure these default values are in sync with `common.h`
 	samplers: 'top_k;typ_p;top_p;min_p;temperature',
+	backend_sampling: false,
 	temperature: 0.8,
 	dynatemp_range: 0.0,
 	dynatemp_exponent: 1.0,

@@ -57,6 +58,8 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
 		'When copying a message with text attachments, combine them into a single plain text string instead of a special format that can be pasted back as attachments.',
 	samplers:
 		'The order at which samplers are applied, in simplified way. Default is "top_k;typ_p;top_p;min_p;temperature": top_k->typ_p->top_p->min_p->temperature',
+	backend_sampling:
+		'Enable backend-based samplers. When enabled, supported samplers run on the accelerator backend for faster sampling.',
 	temperature:
 		'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
 	dynatemp_range:
vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts
CHANGED

@@ -1,10 +1,27 @@
 import { activeProcessingState } from '$lib/stores/chat.svelte';
 import { config } from '$lib/stores/settings.svelte';

+export interface LiveProcessingStats {
+	tokensProcessed: number;
+	totalTokens: number;
+	timeMs: number;
+	tokensPerSecond: number;
+	etaSecs?: number;
+}
+
+export interface LiveGenerationStats {
+	tokensGenerated: number;
+	timeMs: number;
+	tokensPerSecond: number;
+}
+
 export interface UseProcessingStateReturn {
 	readonly processingState: ApiProcessingState | null;
 	getProcessingDetails(): string[];
 	getProcessingMessage(): string;
+	getPromptProgressText(): string | null;
+	getLiveProcessingStats(): LiveProcessingStats | null;
+	getLiveGenerationStats(): LiveGenerationStats | null;
 	shouldShowDetails(): boolean;
 	startMonitoring(): void;
 	stopMonitoring(): void;

@@ -29,6 +46,7 @@ export interface UseProcessingStateReturn {
 export function useProcessingState(): UseProcessingStateReturn {
 	let isMonitoring = $state(false);
 	let lastKnownState = $state<ApiProcessingState | null>(null);
+	let lastKnownProcessingStats = $state<LiveProcessingStats | null>(null);

 	// Derive processing state reactively from chatStore's direct state
 	const processingState = $derived.by(() => {

@@ -46,6 +64,34 @@ export function useProcessingState(): UseProcessingStateReturn {
 		}
 	});

+	// Track last known processing stats for when promptProgress disappears
+	$effect(() => {
+		if (processingState?.promptProgress) {
+			const { processed, total, time_ms, cache } = processingState.promptProgress;
+			const actualProcessed = processed - cache;
+			const actualTotal = total - cache;
+
+			if (actualProcessed > 0 && time_ms > 0) {
+				const tokensPerSecond = actualProcessed / (time_ms / 1000);
+				lastKnownProcessingStats = {
+					tokensProcessed: actualProcessed,
+					totalTokens: actualTotal,
+					timeMs: time_ms,
+					tokensPerSecond
+				};
+			}
+		}
+	});
+
+	function getETASecs(done: number, total: number, elapsedMs: number): number | undefined {
+		const elapsedSecs = elapsedMs / 1000;
+		const progressETASecs =
+			done === 0 || elapsedSecs < 0.5
+				? undefined // can be the case for the 0% progress report
+				: elapsedSecs * (total / done - 1);
+		return progressETASecs;
+	}
+
 	function startMonitoring(): void {
 		if (isMonitoring) return;
 		isMonitoring = true;

@@ -59,28 +105,25 @@ export function useProcessingState(): UseProcessingStateReturn {
 		const currentConfig = config();
 		if (!currentConfig.keepStatsVisible) {
 			lastKnownState = null;
+			lastKnownProcessingStats = null;
 		}
 	}

 	function getProcessingMessage(): string {
-
-		if (!state) {
+		if (!processingState) {
 			return 'Processing...';
 		}

-		switch (
+		switch (processingState.status) {
 			case 'initializing':
 				return 'Initializing...';
 			case 'preparing':
-				if (
-					return `Processing (${
+				if (processingState.progressPercent !== undefined) {
+					return `Processing (${processingState.progressPercent}%)`;
 				}
 				return 'Preparing response...';
 			case 'generating':
-
-				return `Generating... (${state.tokensDecoded} tokens)`;
-				}
-				return 'Generating...';
+				return '';
 			default:
 				return 'Processing...';
 		}

@@ -131,8 +174,76 @@ export function useProcessingState(): UseProcessingStateReturn {
 	}

 	function shouldShowDetails(): boolean {
-
-
+		return processingState !== null && processingState.status !== 'idle';
+	}
+
+	/**
+	 * Returns a short progress message with percent
+	 */
+	function getPromptProgressText(): string | null {
+		if (!processingState?.promptProgress) return null;
+
+		const { processed, total, cache } = processingState.promptProgress;
+
+		const actualProcessed = processed - cache;
+		const actualTotal = total - cache;
+		const percent = Math.round((actualProcessed / actualTotal) * 100);
+		const eta = getETASecs(actualProcessed, actualTotal, processingState.promptProgress.time_ms);
+
+		if (eta !== undefined) {
+			const etaSecs = Math.ceil(eta);
+			return `Processing ${percent}% (ETA: ${etaSecs}s)`;
+		}
+
+		return `Processing ${percent}%`;
+	}
+
+	/**
+	 * Returns live processing statistics for display (prompt processing phase)
+	 * Returns last known stats when promptProgress becomes unavailable
+	 */
+	function getLiveProcessingStats(): LiveProcessingStats | null {
+		if (processingState?.promptProgress) {
+			const { processed, total, time_ms, cache } = processingState.promptProgress;
+
+			const actualProcessed = processed - cache;
+			const actualTotal = total - cache;
+
+			if (actualProcessed > 0 && time_ms > 0) {
+				const tokensPerSecond = actualProcessed / (time_ms / 1000);
+
+				return {
+					tokensProcessed: actualProcessed,
+					totalTokens: actualTotal,
+					timeMs: time_ms,
+					tokensPerSecond
+				};
+			}
+		}
+
+		// Return last known stats if promptProgress is no longer available
+		return lastKnownProcessingStats;
+	}
+
+	/**
+	 * Returns live generation statistics for display (token generation phase)
+	 */
+	function getLiveGenerationStats(): LiveGenerationStats | null {
+		if (!processingState) return null;
+
+		const { tokensDecoded, tokensPerSecond } = processingState;
+
+		if (tokensDecoded <= 0) return null;
+
+		// Calculate time from tokens and speed
+		const timeMs =
+			tokensPerSecond && tokensPerSecond > 0 ? (tokensDecoded / tokensPerSecond) * 1000 : 0;
+
+		return {
+			tokensGenerated: tokensDecoded,
+			timeMs,
+			tokensPerSecond: tokensPerSecond || 0
+		};
 	}

 	return {

@@ -141,6 +252,9 @@ export function useProcessingState(): UseProcessingStateReturn {
 		},
 		getProcessingDetails,
 		getProcessingMessage,
+		getPromptProgressText,
+		getLiveProcessingStats,
+		getLiveGenerationStats,
 		shouldShowDetails,
 		startMonitoring,
 		stopMonitoring
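The new getETASecs helper extrapolates linearly: if done of total tokens took elapsedMs, the remainder is expected to take elapsed * (total / done - 1) seconds, and the estimate is suppressed for the very first (0%) report. The same formula, re-expressed in Python with illustrative numbers:

# Same estimate as getETASecs above, re-expressed with made-up numbers.
def eta_secs(done: int, total: int, elapsed_ms: float) -> float | None:
    elapsed_secs = elapsed_ms / 1000
    if done == 0 or elapsed_secs < 0.5:  # too early to extrapolate (the 0% report)
        return None
    return elapsed_secs * (total / done - 1)

# 2048 of 8192 non-cached prompt tokens processed in 1.5 s -> 4.5 s remaining
print(eta_secs(2048, 8192, 1500))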
vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/chat.ts
CHANGED

@@ -86,6 +86,7 @@ export class ChatService {
 			dry_penalty_last_n,
 			// Other parameters
 			samplers,
+			backend_sampling,
 			custom,
 			timings_per_token,
 			// Config options

@@ -117,7 +118,8 @@
 				role: msg.role,
 				content: msg.content
 			})),
-			stream
+			stream,
+			return_progress: stream ? true : undefined
 		};

 		// Include model in request if provided (required in ROUTER mode)

@@ -158,6 +160,8 @@
 				: samplers;
 		}

+		if (backend_sampling !== undefined) requestBody.backend_sampling = backend_sampling;
+
 		if (timings_per_token !== undefined) requestBody.timings_per_token = timings_per_token;

 		if (custom) {

@@ -271,7 +275,7 @@
 		onReasoningChunk?: (chunk: string) => void,
 		onToolCallChunk?: (chunk: string) => void,
 		onModel?: (model: string) => void,
-		onTimings?: (timings
+		onTimings?: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void,
 		conversationId?: string,
 		abortSignal?: AbortSignal
 	): Promise<void> {

@@ -366,11 +370,13 @@
 					onModel?.(chunkModel);
 				}

-				if (
+				if (promptProgress) {
+					ChatService.notifyTimings(undefined, promptProgress, onTimings);
+				}
+
+				if (timings) {
 					ChatService.notifyTimings(timings, promptProgress, onTimings);
-
-					lastTimings = timings;
-				}
+					lastTimings = timings;
 				}

 				if (content) {

@@ -768,10 +774,11 @@
 		timings: ChatMessageTimings | undefined,
 		promptProgress: ChatMessagePromptProgress | undefined,
 		onTimingsCallback:
-			| ((timings
+			| ((timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void)
 			| undefined
 	): void {
-		if (!
+		if (!onTimingsCallback || (!timings && !promptProgress)) return;
+
 		onTimingsCallback(timings, promptProgress);
 	}
 }
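The service only attaches return_progress to streamed requests and forwards backend_sampling when the setting is defined. A hedged sketch of the resulting request body, sent as a plain HTTP call instead of through the webui service (host, port, and message content are assumptions; the field names are taken from the diffs):

# Sketch of the body the webui builds for a streamed chat request; hypothetical
# values, sent with requests against an assumed local llama-server instance.
import requests

body = {
    "messages": [{"role": "user", "content": "Hello"}],
    "stream": True,
    "return_progress": True,   # only set for streamed requests (see diff above)
    "backend_sampling": True,  # forwarded only when the setting is enabled
}
resp = requests.post("http://localhost:8080/v1/chat/completions", json=body, stream=True)
resp.raise_for_status()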
vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts
CHANGED

@@ -303,11 +303,17 @@ class ChatStore {
 		const currentConfig = config();
 		const outputTokensMax = currentConfig.max_tokens || -1;

+		// Note: for timings data, the n_prompt does NOT include cache tokens
 		const contextUsed = promptTokens + cacheTokens + predictedTokens;
 		const outputTokensUsed = predictedTokens;

+		// Note: for prompt progress, the "processed" DOES include cache tokens
+		// we need to exclude them to get the real prompt tokens processed count
+		const progressCache = promptProgress?.cache || 0;
+		const progressActualDone = (promptProgress?.processed ?? 0) - progressCache;
+		const progressActualTotal = (promptProgress?.total ?? 0) - progressCache;
 		const progressPercent = promptProgress
-			? Math.round((
+			? Math.round((progressActualDone / progressActualTotal) * 100)
 			: undefined;

 		return {

@@ -324,6 +330,7 @@
 			topP: currentConfig.top_p ?? 0.95,
 			speculative: false,
 			progressPercent,
+			promptProgress,
 			promptTokens,
 			promptMs,
 			cacheTokens

@@ -534,7 +541,7 @@
 				conversationsStore.updateMessageAtIndex(idx, { toolCalls: streamedToolCallContent });
 			},
 			onModel: (modelName: string) => recordModel(modelName),
-			onTimings: (timings
+			onTimings: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => {
 				const tokensPerSecond =
 					timings?.predicted_ms && timings?.predicted_n
 						? (timings.predicted_n / timings.predicted_ms) * 1000

@@ -1032,7 +1039,7 @@
 			});
 			},

-			onTimings: (timings
+			onTimings: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => {
 				const tokensPerSecond =
 					timings?.predicted_ms && timings?.predicted_n
 						? (timings.predicted_n / timings.predicted_ms) * 1000

@@ -1454,6 +1461,8 @@
 		if (hasValue(currentConfig.dry_penalty_last_n))
 			apiOptions.dry_penalty_last_n = Number(currentConfig.dry_penalty_last_n);
 		if (currentConfig.samplers) apiOptions.samplers = currentConfig.samplers;
+		if (currentConfig.backend_sampling)
+			apiOptions.backend_sampling = currentConfig.backend_sampling;
 		if (currentConfig.custom) apiOptions.custom = currentConfig.custom;

 		return apiOptions;
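The store's progressPercent deliberately subtracts cached tokens from both numerator and denominator, so a request that reuses a long cached prefix does not start at an inflated percentage. Worked through with illustrative numbers:

# Illustrative only: a 2048-token prompt with 512 tokens already cached and
# 1536 tokens reported as processed so far (cache included, as in the diff).
processed, total, cache = 1536, 2048, 512
percent = round((processed - cache) / (total - cache) * 100)  # 1024/1536 -> 67%
print(percent)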
vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/settings.svelte.ts
CHANGED

@@ -294,15 +294,14 @@ class SettingsStore {
 	 * This sets up the default values from /props endpoint
 	 */
 	syncWithServerDefaults(): void {
-		const
-
-
+		const propsDefaults = this.getServerDefaults();
+
+		if (Object.keys(propsDefaults).length === 0) {
+			console.warn('No server defaults available for initialization');

 			return;
 		}

-		const propsDefaults = this.getServerDefaults();
-
 		for (const [key, propsValue] of Object.entries(propsDefaults)) {
 			const currentValue = getConfigValue(this.config, key);

vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/api.d.ts
CHANGED

@@ -149,6 +149,7 @@ export interface ApiLlamaCppServerProps {
 	reasoning_in_content: boolean;
 	thinking_forced_open: boolean;
 	samplers: string[];
+	backend_sampling: boolean;
 	'speculative.n_max': number;
 	'speculative.n_min': number;
 	'speculative.p_min': number;

@@ -186,6 +187,7 @@ export interface ApiChatCompletionRequest {
 	}>;
 	stream?: boolean;
 	model?: string;
+	return_progress?: boolean;
 	// Reasoning parameters
 	reasoning_format?: string;
 	// Generation parameters

@@ -211,6 +213,7 @@ export interface ApiChatCompletionRequest {
 	dry_penalty_last_n?: number;
 	// Sampler configuration
 	samplers?: string[];
+	backend_sampling?: boolean;
 	// Custom parameters (JSON string)
 	custom?: Record<string, unknown>;
 	timings_per_token?: boolean;

@@ -311,6 +314,7 @@ export interface ApiSlotData {
 	reasoning_in_content: boolean;
 	thinking_forced_open: boolean;
 	samplers: string[];
+	backend_sampling: boolean;
 	'speculative.n_max': number;
 	'speculative.n_min': number;
 	'speculative.p_min': number;

@@ -341,6 +345,7 @@ export interface ApiProcessingState {
 	tokensPerSecond?: number;
 	// Progress information from prompt_progress
 	progressPercent?: number;
+	promptProgress?: ChatMessagePromptProgress;
 	promptTokens?: number;
 	promptMs?: number;
 	cacheTokens?: number;