llama-cpp-pydist 0.19.0-py3-none-any.whl → 0.21.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llama_cpp/binaries/{llama-b7488-bin-win-cpu-x64.zip → llama-b7631-bin-win-cpu-x64.zip} +0 -0
- llama_cpp_pydist-0.21.0.dist-info/METADATA +4684 -0
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/RECORD +240 -222
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/WHEEL +1 -1
- vendor_llama_cpp_pydist/llama.cpp/.devops/cuda-new.Dockerfile +95 -0
- vendor_llama_cpp_pydist/llama.cpp/.gemini/settings.json +1 -0
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +2 -1
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/011-bug-results.yml +13 -2
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/019-bug-misc.yml +13 -2
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/build.yml +18 -6
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/docker.yml +25 -13
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/release.yml +9 -5
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server.yml +18 -0
- vendor_llama_cpp_pydist/llama.cpp/AGENTS.md +81 -0
- vendor_llama_cpp_pydist/llama.cpp/CLAUDE.md +1 -0
- vendor_llama_cpp_pydist/llama.cpp/CONTRIBUTING.md +34 -5
- vendor_llama_cpp_pydist/llama.cpp/ci/run.sh +2 -1
- vendor_llama_cpp_pydist/llama.cpp/common/CMakeLists.txt +4 -3
- vendor_llama_cpp_pydist/llama.cpp/common/arg.cpp +46 -14
- vendor_llama_cpp_pydist/llama.cpp/common/arg.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/common/chat-parser.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/common/chat.cpp +36 -7
- vendor_llama_cpp_pydist/llama.cpp/common/chat.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/common/common.cpp +42 -23
- vendor_llama_cpp_pydist/llama.cpp/common/common.h +7 -2
- vendor_llama_cpp_pydist/llama.cpp/common/llguidance.cpp +10 -6
- vendor_llama_cpp_pydist/llama.cpp/common/regex-partial.cpp +13 -13
- vendor_llama_cpp_pydist/llama.cpp/common/sampling.cpp +58 -14
- vendor_llama_cpp_pydist/llama.cpp/common/sampling.h +3 -1
- vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py +424 -103
- vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf_update.py +5 -0
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/CANN.md +4 -0
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/OPENCL.md +51 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/SYCL.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/README.md +5 -5
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/developer.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/build.md +21 -2
- vendor_llama_cpp_pydist/llama.cpp/docs/development/parsing.md +2 -2
- vendor_llama_cpp_pydist/llama.cpp/docs/ops/Metal.csv +360 -322
- vendor_llama_cpp_pydist/llama.cpp/docs/ops.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/CMakeLists.txt +13 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml-backend.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/CMakeLists.txt +23 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-backend.cpp +11 -11
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +303 -19
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +17 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/common.h +153 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +51 -158
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +66 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cu +48 -27
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cuh +16 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/common.cuh +45 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cpy.cu +117 -103
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cumsum.cu +105 -35
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +3 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +83 -33
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mean.cu +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mma.cuh +21 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cu +34 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +168 -13
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cu +151 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cuh +14 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/softmax.cu +203 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cu +96 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cuh +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +17 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +6 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +224 -758
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +316 -164
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +5 -11
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +46 -15
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +9 -3
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +2 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +20 -20
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-impl.h +0 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +57 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +5 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +20 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +71 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +73 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +134 -13
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +21 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +13 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +14 -7
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +42 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +742 -315
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +28 -14
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +1 -7
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +2 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +17 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +42 -24
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +115 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +10 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +29 -18
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +19 -16
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +10 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +8 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +11 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +57 -22
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +312 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +54 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/constants.py +99 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/gguf_writer.py +38 -2
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/tensor_mapping.py +26 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/utility.py +0 -8
- vendor_llama_cpp_pydist/llama.cpp/grammars/README.md +3 -0
- vendor_llama_cpp_pydist/llama.cpp/include/llama.h +99 -12
- vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-cli.sh +9 -9
- vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-completion.sh +53 -0
- vendor_llama_cpp_pydist/llama.cpp/scripts/sync-ggml.last +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/CMakeLists.txt +4 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.cpp +12 -3
- vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.h +7 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.cpp +76 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.h +7 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-context.cpp +625 -40
- vendor_llama_cpp_pydist/llama.cpp/src/llama-context.h +43 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.cpp +40 -13
- vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.cpp +166 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.h +71 -6
- vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.h +6 -5
- vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.cpp +11 -4
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.cpp +23 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp +329 -26
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model.h +13 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.cpp +1259 -186
- vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.h +19 -7
- vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.cpp +101 -33
- vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama.cpp +53 -38
- vendor_llama_cpp_pydist/llama.cpp/src/models/afmoe.cpp +9 -5
- vendor_llama_cpp_pydist/llama.cpp/src/models/bert.cpp +4 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/cogvlm.cpp +5 -3
- vendor_llama_cpp_pydist/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/deepseek2.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma-embedding.cpp +2 -6
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3.cpp +3 -4
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
- vendor_llama_cpp_pydist/llama.cpp/src/models/llama-iswa.cpp +6 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/llama.cpp +19 -6
- vendor_llama_cpp_pydist/llama.cpp/src/models/maincoder.cpp +117 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/models.h +18 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/modern-bert.cpp +116 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/plamo3.cpp +128 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/smallthinker.cpp +11 -5
- vendor_llama_cpp_pydist/llama.cpp/src/unicode.cpp +23 -14
- vendor_llama_cpp_pydist/llama.cpp/tests/CMakeLists.txt +12 -2
- vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-ops.cpp +286 -65
- vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-sampler.cpp +1237 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-chat.cpp +29 -3
- vendor_llama_cpp_pydist/llama.cpp/tests/test-grammar-llguidance.cpp +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-regex-partial.cpp +14 -14
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-0.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-bpe.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-spm.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/batched-bench/batched-bench.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/cli/README.md +187 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/cli/cli.cpp +1 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/completion/README.md +179 -7
- vendor_llama_cpp_pydist/llama.cpp/tools/completion/completion.cpp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/fit-params.cpp +3 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/llama-bench/llama-bench.cpp +18 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/CMakeLists.txt +1 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-impl.h +12 -7
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-model.h +3 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp +118 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/models.h +10 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/siglip.cpp +9 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/whisper-enc.cpp +9 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/youtuvl.cpp +179 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.cpp +5 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.h +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/quantize/quantize.cpp +6 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/CMakeLists.txt +0 -8
- vendor_llama_cpp_pydist/llama.cpp/tools/server/README-dev.md +2 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/README.md +27 -14
- vendor_llama_cpp_pydist/llama.cpp/tools/server/public/index.html.gz +0 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.cpp +22 -24
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.h +2 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.cpp +453 -267
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.h +52 -15
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-http.cpp +16 -10
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.cpp +174 -62
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.h +14 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.cpp +78 -21
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.h +48 -10
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.cpp +36 -11
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.h +28 -35
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server.cpp +9 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_chat_completion.py +11 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_sleep.py +39 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/utils.py +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte +25 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte +66 -13
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte +5 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/constants/settings-config.ts +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts +125 -11
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/chat.ts +15 -8
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts +12 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/settings.svelte.ts +4 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/api.d.ts +5 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/settings.d.ts +2 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/clipboard.ts +1 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/routes/+layout.svelte +1 -1
- llama_cpp_pydist-0.19.0.dist-info/METADATA +0 -2506
- vendor_llama_cpp_pydist/llama.cpp/.github/copilot-instructions.md +0 -262
- {llama_cpp_pydist-0.19.0.dist-info/licenses → llama_cpp_pydist-0.21.0.dist-info}/LICENSE +0 -0
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/top_level.txt +0 -0
vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp +118 -4

@@ -818,6 +818,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         case PROJECTOR_TYPE_VOXTRAL:
         case PROJECTOR_TYPE_QWEN2A:
         case PROJECTOR_TYPE_GLMA:
+        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
             {
                 builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
             } break;
@@ -845,6 +846,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 builder = std::make_unique<clip_graph_glm4v>(ctx, img);
             } break;
+        case PROJECTOR_TYPE_YOUTUVL:
+            {
+                builder = std::make_unique<clip_graph_youtuvl>(ctx, img);
+            } break;
         default:
             GGML_ABORT("missing cgraph builder");
     }
@@ -1158,6 +1163,20 @@ struct clip_model_loader {
                     LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
                 }
             } break;
+        case PROJECTOR_TYPE_YOUTUVL:
+            {
+                hparams.n_merge = 2;
+                get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
+                get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
+                std::vector<int> wa_layer_indexes_vec;
+                get_arr_int(KEY_WIN_ATTN_LAYER_INDEXES, wa_layer_indexes_vec, true);
+                for (auto & layer : wa_layer_indexes_vec) {
+                    hparams.wa_layer_indexes.insert(layer);
+                }
+                // support max_height * max_width = 8000 * 8000. 8000/16/2 = 250 image tokens
+                hparams.set_limit_image_tokens(1, 62500);
+                hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup
+            } break;
         case PROJECTOR_TYPE_GLM4V:
             {
                 hparams.rope_theta = 10000.0f;
@@ -1176,6 +1195,7 @@ struct clip_model_loader {
         case PROJECTOR_TYPE_QWEN2A:
         case PROJECTOR_TYPE_GLMA:
         case PROJECTOR_TYPE_VOXTRAL:
+        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
             {
                 bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
                                      model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
@@ -1225,7 +1245,14 @@ struct clip_model_loader {
     LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector);
     LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
     LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge);
-    LOG_INF("%s: n_wa_pattern:
+    LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
+    if (!hparams.wa_layer_indexes.empty()) {
+        LOG_INF("%s: wa_layer_indexes: ", __func__);
+        for (auto & layer : hparams.wa_layer_indexes) {
+            LOG_INF("%d ", layer);
+        }
+        LOG_INF("\n");
+    }
     if (hparams.image_min_pixels > 0) {
         LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : "");
     }
@@ -1493,6 +1520,14 @@ struct clip_model_loader {
                 model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
                 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
             } break;
+        case PROJECTOR_TYPE_YOUTUVL:
+            {
+                model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); // merger.ln_q (RMS norm)
+                model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); // merger.mlp.0
+                model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
+                model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); // merger.mlp.2
+                model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
+            } break;
         case PROJECTOR_TYPE_GLM4V:
             {
                 model.projection = get_tensor(TN_MM_PROJECTOR);
@@ -1517,6 +1552,14 @@ struct clip_model_loader {
                 model.projection = get_tensor(TN_MM_PROJECTOR);
             } break;
         case PROJECTOR_TYPE_LFM2:
+            {
+                model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
+                model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B, false);
+                model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
+                model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
+                model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
+                model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
+            } break;
         case PROJECTOR_TYPE_KIMIVL:
             {
                 model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
@@ -1576,6 +1619,17 @@ struct clip_model_loader {
                 model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
                 model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
             } break;
+        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
+            {
+                model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
+                model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
+                model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
+                model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
+                model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
+                model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
+                model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
+                model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
+            } break;
         case PROJECTOR_TYPE_INTERNVL:
             {
                 model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
@@ -2684,6 +2738,57 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                 // res_imgs->data[0] = *res;
                 res_imgs->entries.push_back(std::move(img_f32));
             } break;
+        case PROJECTOR_TYPE_YOUTUVL:
+            {
+                const int patch_size = params.patch_size;       // typically 16
+                const int merge_size = params.n_merge;          // typically 2
+                const int align_size = patch_size * merge_size; // 32
+
+                const int max_num_patches = params.image_max_pixels > 0 ?
+                    params.image_max_pixels / (patch_size * patch_size) : 256;
+
+                // Linear search for optimal scale to fit within max_num_patches
+                float scale = 1.0f;
+                int target_height = original_size.height;
+                int target_width = original_size.width;
+
+                auto get_scaled_image_size = [align_size](float scale, int size) -> int {
+                    float scaled_size = size * scale;
+                    // Round up to nearest multiple of align_size
+                    int aligned = static_cast<int>(std::ceil(scaled_size / align_size)) * align_size;
+                    // Ensure at least one patch
+                    return std::max(align_size, aligned);
+                };
+
+                // Linear search with 0.02 step size
+                while (scale > 0.0f) {
+                    target_height = get_scaled_image_size(scale, original_size.height);
+                    target_width = get_scaled_image_size(scale, original_size.width);
+
+                    int num_patches_h = target_height / patch_size;
+                    int num_patches_w = target_width / patch_size;
+                    int num_patches = num_patches_h * num_patches_w;
+
+                    if (num_patches > max_num_patches) {
+                        scale -= 0.02f;
+                    } else {
+                        break;
+                    }
+                }
+
+                clip_image_size new_size = {target_width, target_height};
+
+                // Resize the image
+                clip_image_u8 resized;
+                img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
+
+                // Normalize to float32
+                clip_image_f32_ptr img_f32(clip_image_f32_init());
+                normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
+
+                // Add to results
+                res_imgs->entries.push_back(std::move(img_f32));
+            } break;
 
         case PROJECTOR_TYPE_IDEFICS3:
             {
@@ -2916,6 +3021,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
         case PROJECTOR_TYPE_QWEN25VL:
         case PROJECTOR_TYPE_QWEN3VL:
         case PROJECTOR_TYPE_GLM4V:
+        case PROJECTOR_TYPE_YOUTUVL:
             return (img->nx / params.patch_size) / 2;
         default:
             break;
@@ -2931,6 +3037,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
         case PROJECTOR_TYPE_QWEN25VL:
         case PROJECTOR_TYPE_QWEN3VL:
         case PROJECTOR_TYPE_GLM4V:
+        case PROJECTOR_TYPE_YOUTUVL:
             return (img->ny / params.patch_size) / 2;
         default:
             break;
@@ -2991,6 +3098,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_QWEN25VL:
         case PROJECTOR_TYPE_QWEN3VL:
         case PROJECTOR_TYPE_GLM4V:
+        case PROJECTOR_TYPE_YOUTUVL:
             {
                 // dynamic size (2 conv, so double patch size)
                 int x_patch = img->nx / (params.patch_size * 2);
@@ -3031,6 +3139,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_VOXTRAL:
         case PROJECTOR_TYPE_ULTRAVOX:
         case PROJECTOR_TYPE_QWEN2A:
+        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
             {
                 n_patches = img->nx;
 
@@ -3117,7 +3226,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const int pos_w = image_size_width / patch_size;
     const int pos_h = image_size_height / patch_size;
 
-    const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
 
     auto get_inp_tensor = [&gf](const char * name) {
         ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
@@ -3266,9 +3374,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             set_input_i32("positions", positions);
         } break;
         case PROJECTOR_TYPE_QWEN25VL:
+        case PROJECTOR_TYPE_YOUTUVL:
             {
                 // pw * ph = number of tokens output by ViT after apply patch merger
                 // ipw * ipw = number of vision token been processed inside ViT
+                const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layer_indexes.empty();
                 const int merge_ratio = 2;
                 const int pw = image_size_width / patch_size / merge_ratio;
                 const int ph = image_size_height / patch_size / merge_ratio;
@@ -3279,7 +3389,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 std::vector<int> inv_idx(ph * pw);
 
                 if (use_window_attn) {
-                    const int attn_window_size = 112;
+                    const int attn_window_size = hparams.attn_window_size > 0 ? hparams.attn_window_size : 112;
                     const int grid_window = attn_window_size / patch_size / merge_ratio;
                     int dst = 0;
                     // [num_vision_tokens, num_vision_tokens] attention mask tensor
@@ -3403,6 +3513,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_ULTRAVOX:
         case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_VOXTRAL:
+        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
         case PROJECTOR_TYPE_JANUS_PRO:
         case PROJECTOR_TYPE_COGVLM:
             {
@@ -3516,6 +3627,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         case PROJECTOR_TYPE_QWEN2VL:
         case PROJECTOR_TYPE_QWEN25VL:
         case PROJECTOR_TYPE_JANUS_PRO:
+        case PROJECTOR_TYPE_YOUTUVL:
            return ctx->model.mm_1_b->ne[0];
         case PROJECTOR_TYPE_QWEN3VL:
            // main path + deepstack paths
@@ -3526,6 +3638,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
            return ctx->model.projection->ne[1];
         case PROJECTOR_TYPE_ULTRAVOX:
         case PROJECTOR_TYPE_VOXTRAL:
+        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
            return ctx->model.mm_2_w->ne[1];
         case PROJECTOR_TYPE_INTERNVL:
            return ctx->model.mm_3_w->ne[1];
@@ -3587,7 +3700,8 @@ bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
     return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
         || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
         || ctx->proj_type() == PROJECTOR_TYPE_GLMA
-        || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
+        || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL
+        || ctx->proj_type() == PROJECTOR_TYPE_MUSIC_FLAMINGO;
 }
 
 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
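The most involved part of the clip.cpp change above is the resize search in the new PROJECTOR_TYPE_YOUTUVL preprocessing: target dimensions are rounded up to a multiple of patch_size * n_merge and the scale is reduced in 0.02 steps until the patch count fits the budget. Below is a standalone sketch of that loop, assuming the "typical" values the diff itself notes (16-pixel patches, 2x2 merge) and the 256-patch fallback; the input resolution is made up for illustration.

```cpp
// Standalone sketch of the YOUTUVL resize search from the clip.cpp hunk above.
// patch_size/merge_size mirror the "typical" values noted in the diff; the
// 256-patch budget is the fallback used when image_max_pixels is unset.
#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
    const int patch_size      = 16;
    const int merge_size      = 2;
    const int align_size      = patch_size * merge_size; // 32
    const int max_num_patches = 256;                     // fallback budget

    // round a scaled dimension up to a multiple of align_size, at least one unit
    auto scaled = [&](float scale, int size) -> int {
        int aligned = static_cast<int>(std::ceil(size * scale / align_size)) * align_size;
        return std::max(align_size, aligned);
    };

    int w = 1920, h = 1080; // example input resolution (assumption)
    float scale = 1.0f;
    int tw = w, th = h;
    while (scale > 0.0f) {
        th = scaled(scale, h);
        tw = scaled(scale, w);
        if ((th / patch_size) * (tw / patch_size) > max_num_patches) {
            scale -= 0.02f; // shrink in 2% steps until the patch count fits
        } else {
            break;
        }
    }
    std::printf("resize %dx%d -> %dx%d (%d patches)\n",
                w, h, tw, th, (tw / patch_size) * (th / patch_size));
    return 0;
}
```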
vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/models.h +10 -0

@@ -2,6 +2,11 @@
 
 #include "../clip-graph.h"
 
+/*
+ * IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
+ * We encourage human contributors to ensure the quality and reliability of the codebase.
+ */
+
 struct clip_graph_siglip : clip_graph {
     clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     ggml_cgraph * build() override;
@@ -22,6 +27,11 @@ struct clip_graph_qwen3vl : clip_graph {
     ggml_cgraph * build() override;
 };
 
+struct clip_graph_youtuvl : clip_graph {
+    clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+};
+
 struct clip_graph_minicpmv : clip_graph {
     clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     ggml_cgraph * build() override;
vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/siglip.cpp +9 -4

@@ -50,10 +50,15 @@ ggml_cgraph * clip_graph_siglip::build() {
     const int scale_factor = model.hparams.n_merge;
     cur = build_patch_merge_permute(cur, scale_factor);
 
-    // projection
-
-
-
+    // projection, in LFM2-VL input norm is optional
+    if (model.mm_input_norm_w) {
+        cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
+        cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
+    }
+
+    if (model.mm_input_norm_b) {
+        cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
+    }
 
     cur = build_ffn(cur,
         model.mm_1_w, model.mm_1_b,
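For context, the optional input norm added above is a LayerNorm split into its ggml primitives: ggml_norm normalizes over the embedding dimension, ggml_mul applies the weight, and ggml_add applies the (optional) bias. A toy check of that decomposition, with made-up weight and bias values and the same eps = 1e-5:

```cpp
// Toy check that normalize -> scale -> shift reproduces a standard LayerNorm.
// The weight/bias values are assumptions for illustration only.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    std::vector<float> x = {1.0f, 2.0f, 3.0f, 4.0f};
    std::vector<float> w = {0.5f, 1.0f, 1.5f, 2.0f};   // stand-in for mm_input_norm_w
    std::vector<float> b = {0.1f, 0.0f, -0.1f, 0.2f};  // stand-in for mm_input_norm_b

    float mean = 0.0f, var = 0.0f;
    for (float v : x) mean += v;
    mean /= x.size();
    for (float v : x) var += (v - mean) * (v - mean);
    var /= x.size();

    const float eps = 1e-5f;
    for (size_t i = 0; i < x.size(); ++i) {
        float normed = (x[i] - mean) / std::sqrt(var + eps); // ggml_norm
        float out    = normed * w[i] + b[i];                 // ggml_mul + ggml_add
        std::printf("%.4f ", out);
    }
    std::printf("\n");
    return 0;
}
```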
vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/whisper-enc.cpp +9 -0

@@ -86,6 +86,15 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
             FFN_GELU_ERF,
             -1);
 
+    } else if (proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
+        // projector
+        cur = build_ffn(cur,
+            model.mm_1_w, model.mm_1_b,
+            nullptr, nullptr,
+            model.mm_2_w, model.mm_2_b,
+            FFN_GELU_ERF,
+            -1);
+
     } else if (proj_type == PROJECTOR_TYPE_GLMA) {
         cur = ggml_norm(ctx0, cur, hparams.eps);
         cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/youtuvl.cpp +179 -0 (new file)

@@ -0,0 +1,179 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_youtuvl::build() {
+    GGML_ASSERT(model.class_embedding == nullptr);
+    const int batch_size = 1;
+    const bool use_window_attn = !hparams.wa_layer_indexes.empty();
+    const int n_pos = n_patches;
+    const int num_position_ids = n_pos * 4;
+    const int m = 2;
+    const int Wp = n_patches_x;
+    const int Hp = n_patches_y;
+    const int Hm = Hp / m;
+    const int Wm = Wp / m;
+    norm_type norm_t = NORM_TYPE_NORMAL;
+
+    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
+
+    ggml_tensor * inp = build_inp_raw();
+
+    // change conv3d to linear
+    // reshape and permute to get patches, permute from (patch_size, m, Wm, patch_size, m, Hm, C) to (C, patch_size, patch_size, m, m, Wm, Hm)
+    {
+        inp = ggml_reshape_4d(
+            ctx0, inp,
+            Wm * m * patch_size, m * patch_size, Hm, 3);
+        inp = ggml_permute(ctx0, inp, 1, 2, 3, 0);
+        inp = ggml_cont_4d(
+            ctx0, inp,
+            m * patch_size * 3, Wm, m * patch_size, Hm);
+
+        inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
+        inp = ggml_cont_4d(
+            ctx0, inp,
+            m * patch_size * 3, patch_size, m, Hm * Wm);
+
+        inp = ggml_permute(ctx0, inp, 1, 0, 2, 3);
+        inp = ggml_cont_4d(
+            ctx0, inp,
+            patch_size, 3, patch_size, Hm * Wm * m * m);
+
+        inp = ggml_permute(ctx0, inp, 2, 0, 1, 3);
+        inp = ggml_cont_3d(
+            ctx0, inp,
+            3*patch_size* patch_size, Hm * Wm * m * m, 1);
+    }
+    inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
+
+    if (model.patch_bias) {
+        inp = ggml_add(ctx0, inp, model.patch_bias);
+    }
+
+    inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
+
+    ggml_tensor * inpL = inp;
+    ggml_tensor * window_mask = nullptr;
+    ggml_tensor * window_idx = nullptr;
+    ggml_tensor * inv_window_idx = nullptr;
+
+    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
+    ggml_set_name(positions, "positions");
+    ggml_set_input(positions);
+
+    // pre-layernorm
+    if (model.pre_ln_w) {
+        inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
+    }
+    if (use_window_attn) {
+        inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
+        ggml_set_name(inv_window_idx, "inv_window_idx");
+        ggml_set_input(inv_window_idx);
+        // mask for window attention
+        window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
+        ggml_set_name(window_mask, "window_mask");
+        ggml_set_input(window_mask);
+
+        // if flash attn is used, we need to pad the mask and cast to f16
+        if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
+            window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
+        }
+
+        // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
+        GGML_ASSERT(batch_size == 1);
+        inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
+        inpL = ggml_get_rows(ctx0, inpL, inv_window_idx);
+        inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
+    }
+
+    // loop over layers
+    for (int il = 0; il < n_layer; il++) {
+        const auto & layer = model.layers[il];
+        const bool full_attn = use_window_attn ? hparams.wa_layer_indexes.count(il) > 0 : true;
+
+        ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
+
+        // layernorm1
+        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
+        // self-attention
+        {
+            ggml_tensor * Qcur = ggml_add(ctx0,
+                ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
+            ggml_tensor * Kcur = ggml_add(ctx0,
+                ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
+            ggml_tensor * Vcur = ggml_add(ctx0,
+                ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
+
+            Qcur = ggml_rope_multi(
+                ctx0, Qcur, positions, nullptr,
+                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+            Kcur = ggml_rope_multi(
+                ctx0, Kcur, positions, nullptr,
+                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+
+            ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;
+
+            cur = build_attn(layer.o_w, layer.o_b,
+                Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
+        }
+        // re-add the layer input, e.g., residual
+        cur = ggml_add(ctx0, cur, inpL);
+
+        inpL = cur; // inpL = residual, cur = hidden_states
+
+        // layernorm2
+        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
+
+        // ffn
+        cur = build_ffn(cur,
+            layer.ff_up_w, layer.ff_up_b,
+            nullptr, nullptr,
+            layer.ff_down_w, layer.ff_down_b,
+            hparams.ffn_op, il);
+
+        // residual 2
+        cur = ggml_add(ctx0, inpL, cur);
+
+        inpL = cur;
+    }
+
+    ggml_tensor * embeddings = inpL;
+    if (use_window_attn) {
+        const int spatial_merge_unit = 4;
+        window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / spatial_merge_unit);
+        ggml_set_name(window_idx, "window_idx");
+        ggml_set_input(window_idx);
+        GGML_ASSERT(batch_size == 1);
+        embeddings = ggml_reshape_2d(ctx0, embeddings, n_embd * spatial_merge_unit, n_patches / spatial_merge_unit);
+        embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
+        embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd, n_patches, batch_size);
+        cb(embeddings, "window_order_restored", -1);
+    }
+
+    // post-layernorm (part of Siglip2VisionTransformer, applied after encoder)
+    if (model.post_ln_w) {
+        embeddings = build_norm(embeddings, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
+    }
+
+    // Now apply merger (VLPatchMerger):
+    // 1. Apply RMS norm (ln_q in VLPatchMerger)
+    embeddings = build_norm(embeddings, model.mm_input_norm_w, nullptr, NORM_TYPE_RMS, 1e-6, -1);
+    cb(embeddings, "merger_normed", -1);
+
+    // 2. First reshape for spatial merge (merge 2x2 patches)
+    embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
+    cb(embeddings, "merger_reshaped", -1);
+
+    embeddings = build_ffn(embeddings,
+        model.mm_0_w, model.mm_0_b,
+        nullptr, nullptr,
+        model.mm_1_w, model.mm_1_b,
+        FFN_GELU,
+        -1);
+    ggml_build_forward_expand(gf, embeddings);
+
+    return gf;
+}
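The window-attention path in this new graph hinges on two index tensors: inv_window_idx regroups the 2x2 merge units so that units belonging to the same attention window sit next to each other before the encoder runs, and window_idx undoes that permutation afterwards (the two ggml_get_rows calls above). Below is a toy, ggml-free illustration of that gather/restore round trip; the 8-unit permutation and the unit values are hypothetical.

```cpp
// Toy illustration (not the llama.cpp API) of the window reorder/restore
// pattern implemented with ggml_get_rows in the graph above.
#include <cstdio>
#include <vector>

int main() {
    // hypothetical layout: 8 merge units grouped into two windows
    std::vector<int> inv_window_idx = {0, 1, 4, 5, 2, 3, 6, 7};
    std::vector<int> window_idx(inv_window_idx.size());
    for (size_t dst = 0; dst < inv_window_idx.size(); ++dst) {
        window_idx[inv_window_idx[dst]] = static_cast<int>(dst); // inverse permutation
    }

    std::vector<int> units = {10, 11, 12, 13, 14, 15, 16, 17};   // stand-ins for embeddings
    std::vector<int> reordered(units.size()), restored(units.size());
    for (size_t i = 0; i < units.size(); ++i) reordered[i] = units[inv_window_idx[i]]; // before encoder
    for (size_t i = 0; i < units.size(); ++i) restored[i]  = reordered[window_idx[i]]; // after encoder

    for (int v : restored) std::printf("%d ", v); // prints 10..17: original order recovered
    std::printf("\n");
    return 0;
}
```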
vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.cpp +5 -1

@@ -283,7 +283,7 @@ struct mtmd_context {
         // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
         img_end = "[IMG_END]";
 
-    } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL) {
+    } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL || proj == PROJECTOR_TYPE_YOUTUVL) {
         // <|vision_start|> ... (image embeddings) ... <|vision_end|>
         img_beg = "<|vision_start|>";
         img_end = "<|vision_end|>";
@@ -330,6 +330,7 @@ struct mtmd_context {
             case PROJECTOR_TYPE_ULTRAVOX:
             case PROJECTOR_TYPE_VOXTRAL:
             case PROJECTOR_TYPE_GLMA:
+            case PROJECTOR_TYPE_MUSIC_FLAMINGO:
                 audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
                 break;
             case PROJECTOR_TYPE_LFM2A:
@@ -352,6 +353,9 @@ struct mtmd_context {
             // [BEGIN_AUDIO] ... (embeddings) ...
             aud_beg = "[BEGIN_AUDIO]";
 
+        } else if (proj == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
+            // <sound> ... (embeddings) ...
+            aud_beg = "<sound>";
         }
     }
 
vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.h +3 -0

@@ -27,6 +27,9 @@
  * - Make sure the C API is aligned with the libllama C API (as in llama.h)
  * - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead
  * - Keep the API minimal, do not expose internal details unless necessary
+ *
+ * IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
+ * We encourage human contributors to ensure the quality and reliability of the codebase.
  */
 
 #ifdef LLAMA_SHARED
vendor_llama_cpp_pydist/llama.cpp/tools/quantize/quantize.cpp +6 -0

@@ -12,6 +12,7 @@
 #include <cmath>
 #include <cctype>
 #include <algorithm>
+#include <filesystem>
 
 struct quant_option {
     std::string name;
@@ -643,6 +644,11 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    if (std::error_code ec; std::filesystem::equivalent(fname_inp, fname_out, ec)) {
+        fprintf(stderr, "%s: error: input and output files are the same: '%s'\n", __func__, fname_inp.c_str());
+        return 1;
+    }
+
     print_build_info();
 
     fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
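The new guard in quantize.cpp relies on std::filesystem::equivalent, which compares the files the two paths resolve to rather than the path strings, so it also catches the same file reached through a different spelling; the error_code overload keeps it from throwing when a path does not exist. A minimal sketch with hypothetical paths:

```cpp
// Minimal sketch of the same-file guard added to quantize.cpp above.
// The paths are hypothetical; equivalent() returns true only if both paths
// resolve to the same underlying file, and the error_code overload does not throw.
#include <cstdio>
#include <filesystem>
#include <string>
#include <system_error>

int main() {
    std::string fname_inp = "models/model-f16.gguf";                    // hypothetical input
    std::string fname_out = "./models/../models/model-f16.gguf";        // same file, different spelling

    std::error_code ec;
    if (std::filesystem::equivalent(fname_inp, fname_out, ec)) {
        std::fprintf(stderr, "error: input and output files are the same: '%s'\n", fname_inp.c_str());
        return 1;
    }
    if (ec) {
        // e.g. one of the paths does not exist; the check simply does not trigger
        std::fprintf(stderr, "note: could not compare paths: %s\n", ec.message().c_str());
    }
    return 0;
}
```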
vendor_llama_cpp_pydist/llama.cpp/tools/server/CMakeLists.txt +0 -8

@@ -38,14 +38,6 @@ set(TARGET_SRCS
     server-http.h
     server-models.cpp
     server-models.h
-    server-task.cpp
-    server-task.h
-    server-queue.cpp
-    server-queue.h
-    server-common.cpp
-    server-common.h
-    server-context.cpp
-    server-context.h
 )
 set(PUBLIC_ASSETS
     index.html.gz
vendor_llama_cpp_pydist/llama.cpp/tools/server/README-dev.md +2 -0

@@ -107,6 +107,8 @@ For detailed instructions, see the [test documentation](./tests/README.md).
 - Large-scale code base split into smaller files: https://github.com/ggml-org/llama.cpp/pull/17362
 - Introduction of router mode: https://github.com/ggml-org/llama.cpp/pull/17470
 - Speculative decoding: https://github.com/ggml-org/llama.cpp/pull/17808 and rework in https://github.com/ggml-org/llama.cpp/pull/17808
+- INI presets: https://github.com/ggml-org/llama.cpp/pull/17859 (+ refactoring: https://github.com/ggml-org/llama.cpp/pull/18169)
+- Sleeping mode: https://github.com/ggml-org/llama.cpp/pull/18228
 