llama-cpp-pydist 0.19.0__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llama_cpp/binaries/{llama-b7488-bin-win-cpu-x64.zip → llama-b7631-bin-win-cpu-x64.zip} +0 -0
- llama_cpp_pydist-0.21.0.dist-info/METADATA +4684 -0
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/RECORD +240 -222
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/WHEEL +1 -1
- vendor_llama_cpp_pydist/llama.cpp/.devops/cuda-new.Dockerfile +95 -0
- vendor_llama_cpp_pydist/llama.cpp/.gemini/settings.json +1 -0
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +2 -1
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/011-bug-results.yml +13 -2
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/019-bug-misc.yml +13 -2
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/build.yml +18 -6
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/docker.yml +25 -13
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/release.yml +9 -5
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server.yml +18 -0
- vendor_llama_cpp_pydist/llama.cpp/AGENTS.md +81 -0
- vendor_llama_cpp_pydist/llama.cpp/CLAUDE.md +1 -0
- vendor_llama_cpp_pydist/llama.cpp/CONTRIBUTING.md +34 -5
- vendor_llama_cpp_pydist/llama.cpp/ci/run.sh +2 -1
- vendor_llama_cpp_pydist/llama.cpp/common/CMakeLists.txt +4 -3
- vendor_llama_cpp_pydist/llama.cpp/common/arg.cpp +46 -14
- vendor_llama_cpp_pydist/llama.cpp/common/arg.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/common/chat-parser.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/common/chat.cpp +36 -7
- vendor_llama_cpp_pydist/llama.cpp/common/chat.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/common/common.cpp +42 -23
- vendor_llama_cpp_pydist/llama.cpp/common/common.h +7 -2
- vendor_llama_cpp_pydist/llama.cpp/common/llguidance.cpp +10 -6
- vendor_llama_cpp_pydist/llama.cpp/common/regex-partial.cpp +13 -13
- vendor_llama_cpp_pydist/llama.cpp/common/sampling.cpp +58 -14
- vendor_llama_cpp_pydist/llama.cpp/common/sampling.h +3 -1
- vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py +424 -103
- vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf_update.py +5 -0
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/CANN.md +4 -0
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/OPENCL.md +51 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/SYCL.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/README.md +5 -5
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/developer.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/build.md +21 -2
- vendor_llama_cpp_pydist/llama.cpp/docs/development/parsing.md +2 -2
- vendor_llama_cpp_pydist/llama.cpp/docs/ops/Metal.csv +360 -322
- vendor_llama_cpp_pydist/llama.cpp/docs/ops.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/CMakeLists.txt +13 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml-backend.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/CMakeLists.txt +23 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-backend.cpp +11 -11
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +303 -19
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +17 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/common.h +153 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +51 -158
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +66 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cu +48 -27
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cuh +16 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/common.cuh +45 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cpy.cu +117 -103
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cumsum.cu +105 -35
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +3 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +83 -33
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mean.cu +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mma.cuh +21 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cu +34 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +168 -13
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cu +151 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cuh +14 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/softmax.cu +203 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cu +96 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cuh +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +17 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +6 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +224 -758
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +316 -164
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +5 -11
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +46 -15
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +9 -3
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +2 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +20 -20
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-impl.h +0 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +57 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +5 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +20 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +71 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +73 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +134 -13
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +21 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +13 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +14 -7
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +42 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +742 -315
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +28 -14
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +1 -7
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +2 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +17 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +42 -24
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +115 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +10 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +29 -18
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +19 -16
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +10 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +8 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +11 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +57 -22
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +312 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +54 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/constants.py +99 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/gguf_writer.py +38 -2
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/tensor_mapping.py +26 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/utility.py +0 -8
- vendor_llama_cpp_pydist/llama.cpp/grammars/README.md +3 -0
- vendor_llama_cpp_pydist/llama.cpp/include/llama.h +99 -12
- vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-cli.sh +9 -9
- vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-completion.sh +53 -0
- vendor_llama_cpp_pydist/llama.cpp/scripts/sync-ggml.last +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/CMakeLists.txt +4 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.cpp +12 -3
- vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.h +7 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.cpp +76 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.h +7 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-context.cpp +625 -40
- vendor_llama_cpp_pydist/llama.cpp/src/llama-context.h +43 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.cpp +40 -13
- vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.cpp +166 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.h +71 -6
- vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.h +6 -5
- vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.cpp +11 -4
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.cpp +23 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp +329 -26
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model.h +13 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.cpp +1259 -186
- vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.h +19 -7
- vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.cpp +101 -33
- vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama.cpp +53 -38
- vendor_llama_cpp_pydist/llama.cpp/src/models/afmoe.cpp +9 -5
- vendor_llama_cpp_pydist/llama.cpp/src/models/bert.cpp +4 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/cogvlm.cpp +5 -3
- vendor_llama_cpp_pydist/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/deepseek2.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma-embedding.cpp +2 -6
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3.cpp +3 -4
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
- vendor_llama_cpp_pydist/llama.cpp/src/models/llama-iswa.cpp +6 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/llama.cpp +19 -6
- vendor_llama_cpp_pydist/llama.cpp/src/models/maincoder.cpp +117 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/models.h +18 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/modern-bert.cpp +116 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/plamo3.cpp +128 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/smallthinker.cpp +11 -5
- vendor_llama_cpp_pydist/llama.cpp/src/unicode.cpp +23 -14
- vendor_llama_cpp_pydist/llama.cpp/tests/CMakeLists.txt +12 -2
- vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-ops.cpp +286 -65
- vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-sampler.cpp +1237 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-chat.cpp +29 -3
- vendor_llama_cpp_pydist/llama.cpp/tests/test-grammar-llguidance.cpp +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-regex-partial.cpp +14 -14
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-0.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-bpe.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-spm.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/batched-bench/batched-bench.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/cli/README.md +187 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/cli/cli.cpp +1 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/completion/README.md +179 -7
- vendor_llama_cpp_pydist/llama.cpp/tools/completion/completion.cpp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/fit-params.cpp +3 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/llama-bench/llama-bench.cpp +18 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/CMakeLists.txt +1 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-impl.h +12 -7
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-model.h +3 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp +118 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/models.h +10 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/siglip.cpp +9 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/whisper-enc.cpp +9 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/youtuvl.cpp +179 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.cpp +5 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.h +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/quantize/quantize.cpp +6 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/CMakeLists.txt +0 -8
- vendor_llama_cpp_pydist/llama.cpp/tools/server/README-dev.md +2 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/README.md +27 -14
- vendor_llama_cpp_pydist/llama.cpp/tools/server/public/index.html.gz +0 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.cpp +22 -24
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.h +2 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.cpp +453 -267
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.h +52 -15
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-http.cpp +16 -10
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.cpp +174 -62
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.h +14 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.cpp +78 -21
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.h +48 -10
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.cpp +36 -11
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.h +28 -35
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server.cpp +9 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_chat_completion.py +11 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_sleep.py +39 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/utils.py +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte +25 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte +66 -13
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte +5 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/constants/settings-config.ts +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts +125 -11
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/chat.ts +15 -8
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts +12 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/settings.svelte.ts +4 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/api.d.ts +5 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/settings.d.ts +2 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/clipboard.ts +1 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/routes/+layout.svelte +1 -1
- llama_cpp_pydist-0.19.0.dist-info/METADATA +0 -2506
- vendor_llama_cpp_pydist/llama.cpp/.github/copilot-instructions.md +0 -262
- {llama_cpp_pydist-0.19.0.dist-info/licenses → llama_cpp_pydist-0.21.0.dist-info}/LICENSE +0 -0
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
#include "models.h"
|
|
2
|
+
|
|
3
|
+
template <bool iswa>
|
|
4
|
+
llm_build_plamo3<iswa>::llm_build_plamo3(const llama_model & model, const llm_graph_params & params) :
|
|
5
|
+
llm_graph_context(params) {
|
|
6
|
+
const int64_t head_dim_q = hparams.n_embd_head_k;
|
|
7
|
+
const int64_t head_dim_v = hparams.n_embd_head_v;
|
|
8
|
+
|
|
9
|
+
ggml_tensor * cur;
|
|
10
|
+
ggml_tensor * inpL = build_inp_embd(model.tok_embd);
|
|
11
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
12
|
+
|
|
13
|
+
using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
|
|
14
|
+
inp_attn_type * inp_attn = nullptr;
|
|
15
|
+
|
|
16
|
+
if constexpr (iswa) {
|
|
17
|
+
inp_attn = build_attn_inp_kv_iswa();
|
|
18
|
+
} else {
|
|
19
|
+
inp_attn = build_attn_inp_kv();
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
23
|
+
|
|
24
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
25
|
+
ggml_tensor * residual = inpL;
|
|
26
|
+
|
|
27
|
+
float freq_base_l = 0.0f;
|
|
28
|
+
float freq_scale_l = 0.0f;
|
|
29
|
+
if constexpr (iswa) {
|
|
30
|
+
freq_base_l = model.get_rope_freq_base (cparams, il);
|
|
31
|
+
freq_scale_l = model.get_rope_freq_scale(cparams, il);
|
|
32
|
+
} else {
|
|
33
|
+
freq_base_l = freq_base;
|
|
34
|
+
freq_scale_l = freq_scale;
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
|
38
|
+
cb(cur, "attn_norm", il);
|
|
39
|
+
|
|
40
|
+
ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
|
|
41
|
+
cb(cur, "wqkv", il);
|
|
42
|
+
|
|
43
|
+
const int32_t n_head = hparams.n_head(il);
|
|
44
|
+
const int32_t n_head_kv = hparams.n_head_kv(il);
|
|
45
|
+
|
|
46
|
+
const int64_t q_offset = 0;
|
|
47
|
+
const int64_t k_offset = head_dim_q * n_head;
|
|
48
|
+
const int64_t v_offset = k_offset + head_dim_q * n_head_kv;
|
|
49
|
+
|
|
50
|
+
ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, head_dim_q, n_head, n_tokens,
|
|
51
|
+
head_dim_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
|
|
52
|
+
ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, head_dim_q, n_head_kv, n_tokens,
|
|
53
|
+
head_dim_q * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
|
|
54
|
+
ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, head_dim_v, n_head_kv, n_tokens,
|
|
55
|
+
head_dim_v * sizeof(float), qkv->nb[1], v_offset * ggml_element_size(qkv));
|
|
56
|
+
|
|
57
|
+
cb(Qcur, "Qcur", il);
|
|
58
|
+
cb(Kcur, "Kcur", il);
|
|
59
|
+
cb(Vcur, "Vcur", il);
|
|
60
|
+
|
|
61
|
+
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
|
62
|
+
cb(Qcur, "attn_q_norm", il);
|
|
63
|
+
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
|
64
|
+
cb(Kcur, "attn_k_norm", il);
|
|
65
|
+
|
|
66
|
+
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr,
|
|
67
|
+
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
|
|
68
|
+
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
69
|
+
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr,
|
|
70
|
+
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
|
|
71
|
+
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
72
|
+
|
|
73
|
+
const float attn_scale = 1.0f / sqrtf(float(head_dim_q));
|
|
74
|
+
|
|
75
|
+
cur = build_attn(inp_attn,
|
|
76
|
+
model.layers[il].wo, NULL,
|
|
77
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, attn_scale, il);
|
|
78
|
+
cb(cur, "attn_out", il);
|
|
79
|
+
|
|
80
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
81
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
82
|
+
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
|
|
86
|
+
cb(cur, "attn_post_norm", il);
|
|
87
|
+
|
|
88
|
+
cur = ggml_add(ctx0, cur, residual);
|
|
89
|
+
cb(cur, "attn_residual", il);
|
|
90
|
+
|
|
91
|
+
residual = cur;
|
|
92
|
+
|
|
93
|
+
cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
|
|
94
|
+
cb(cur, "ffn_norm", il);
|
|
95
|
+
|
|
96
|
+
cur = build_ffn(cur,
|
|
97
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
98
|
+
NULL, NULL, NULL,
|
|
99
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
100
|
+
NULL,
|
|
101
|
+
LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
|
|
102
|
+
cb(cur, "ffn_out", il);
|
|
103
|
+
|
|
104
|
+
cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
|
|
105
|
+
cb(cur, "ffn_post_norm", il);
|
|
106
|
+
|
|
107
|
+
cur = ggml_add(ctx0, cur, residual);
|
|
108
|
+
cb(cur, "ffn_residual", il);
|
|
109
|
+
|
|
110
|
+
cur = build_cvec(cur, il);
|
|
111
|
+
cb(cur, "l_out", il);
|
|
112
|
+
inpL = cur;
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
cur = inpL;
|
|
116
|
+
|
|
117
|
+
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
|
|
118
|
+
res->t_embd = cur;
|
|
119
|
+
|
|
120
|
+
cur = build_lora_mm(model.output, cur);
|
|
121
|
+
res->t_logits = cur;
|
|
122
|
+
|
|
123
|
+
ggml_build_forward_expand(gf, cur);
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
// Explicit template instantiations
|
|
127
|
+
template struct llm_build_plamo3<false>;
|
|
128
|
+
template struct llm_build_plamo3<true>;
|
|
@@ -26,10 +26,16 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
|
|
|
26
26
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
27
27
|
|
|
28
28
|
for (int il = 0; il < n_layer; ++il) {
|
|
29
|
+
const float freq_base_l = model.get_rope_freq_base (cparams, il);
|
|
30
|
+
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
|
|
31
|
+
|
|
29
32
|
ggml_tensor * inpSA = inpL;
|
|
30
|
-
ggml_tensor * probs = nullptr;
|
|
31
33
|
|
|
32
|
-
|
|
34
|
+
// This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
|
|
35
|
+
const bool use_rope = hparams.n_no_rope_layer_step == n_layer ||
|
|
36
|
+
il % hparams.n_no_rope_layer_step != 0;
|
|
37
|
+
|
|
38
|
+
ggml_tensor * probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
|
|
33
39
|
cb(probs, "ffn_moe_logits", il);
|
|
34
40
|
|
|
35
41
|
// norm
|
|
@@ -52,11 +58,11 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
|
|
|
52
58
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
53
59
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
54
60
|
|
|
55
|
-
if (
|
|
56
|
-
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
|
61
|
+
if (use_rope) {
|
|
62
|
+
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
|
|
57
63
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
58
64
|
|
|
59
|
-
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
|
65
|
+
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
|
|
60
66
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
61
67
|
}
|
|
62
68
|
cb(Qcur, "Qcur", il);
|
|
@@ -964,6 +964,11 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
|
|
964
964
|
{ "\\p{P}", unicode_cpt_flags::PUNCTUATION },
|
|
965
965
|
{ "\\p{M}", unicode_cpt_flags::ACCENT_MARK },
|
|
966
966
|
{ "\\p{S}", unicode_cpt_flags::SYMBOL },
|
|
967
|
+
{ "\\p{Lu}", unicode_cpt_flags::LETTER }, // Uppercase letter
|
|
968
|
+
{ "\\p{Ll}", unicode_cpt_flags::LETTER }, // Lowercase letter
|
|
969
|
+
{ "\\p{Lt}", unicode_cpt_flags::LETTER }, // Titlecase letter
|
|
970
|
+
{ "\\p{Lm}", unicode_cpt_flags::LETTER }, // Modifier letter
|
|
971
|
+
{ "\\p{Lo}", unicode_cpt_flags::LETTER }, // Other letter
|
|
967
972
|
};
|
|
968
973
|
|
|
969
974
|
static const std::map<int, int> k_ucat_cpt = {
|
|
@@ -1074,22 +1079,26 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
|
|
1074
1079
|
continue;
|
|
1075
1080
|
}
|
|
1076
1081
|
|
|
1077
|
-
|
|
1082
|
+
// Match \p{...} Unicode properties of varying lengths
|
|
1083
|
+
if (regex_expr[i + 0] == '\\' && i + 3 < regex_expr.size() &&
|
|
1078
1084
|
regex_expr[i + 1] == 'p' &&
|
|
1079
|
-
regex_expr[i + 2] == '{'
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
if (
|
|
1083
|
-
|
|
1084
|
-
|
|
1085
|
+
regex_expr[i + 2] == '{') {
|
|
1086
|
+
// Find the closing brace
|
|
1087
|
+
size_t closing_brace = regex_expr.find('}', i + 3);
|
|
1088
|
+
if (closing_brace != std::string::npos && closing_brace <= i + 10) { // reasonable limit
|
|
1089
|
+
const std::string pat = regex_expr.substr(i, closing_brace - i + 1);
|
|
1090
|
+
if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
|
|
1091
|
+
if (!inside) {
|
|
1092
|
+
regex_expr_collapsed += '[';
|
|
1093
|
+
}
|
|
1094
|
+
regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
|
|
1095
|
+
regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
|
|
1096
|
+
if (!inside) {
|
|
1097
|
+
regex_expr_collapsed += ']';
|
|
1098
|
+
}
|
|
1099
|
+
i = closing_brace;
|
|
1100
|
+
continue;
|
|
1085
1101
|
}
|
|
1086
|
-
regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
|
|
1087
|
-
regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
|
|
1088
|
-
if (!inside) {
|
|
1089
|
-
regex_expr_collapsed += ']';
|
|
1090
|
-
}
|
|
1091
|
-
i += 4;
|
|
1092
|
-
continue;
|
|
1093
1102
|
}
|
|
1094
1103
|
}
|
|
1095
1104
|
|
|
@@ -219,8 +219,18 @@ endif()
|
|
|
219
219
|
llama_build_and_test(test-gguf.cpp)
|
|
220
220
|
llama_build_and_test(test-backend-ops.cpp)
|
|
221
221
|
|
|
222
|
-
llama_build_and_test(test-model-load-cancel.cpp
|
|
223
|
-
llama_build_and_test(test-autorelease.cpp
|
|
222
|
+
llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
|
|
223
|
+
llama_build_and_test(test-autorelease.cpp LABEL "model")
|
|
224
|
+
llama_build_and_test(test-backend-sampler.cpp LABEL "model")
|
|
225
|
+
|
|
226
|
+
llama_test(test-backend-sampler NAME test-backend-sampler-greedy ARGS --test greedy)
|
|
227
|
+
llama_test(test-backend-sampler NAME test-backend-sampler-temp ARGS --test temp)
|
|
228
|
+
llama_test(test-backend-sampler NAME test-backend-sampler-top_k ARGS --test top_k)
|
|
229
|
+
llama_test(test-backend-sampler NAME test-backend-sampler-dist ARGS --test dist)
|
|
230
|
+
llama_test(test-backend-sampler NAME test-backend-sampler-dist-and-cpu ARGS --test dist_and_cpu)
|
|
231
|
+
llama_test(test-backend-sampler NAME test-backend-sampler-logit-bias ARGS --test logit_bias)
|
|
232
|
+
llama_test(test-backend-sampler NAME test-backend-sampler-mul_seq ARGS --test multi_sequence)
|
|
233
|
+
llama_test(test-backend-sampler NAME test-backend-sampler-set-sampler ARGS --test set_sampler)
|
|
224
234
|
|
|
225
235
|
# Test for state restore with fragmented KV cache
|
|
226
236
|
# Requires a model, uses same args pattern as test-thread-safety
|