llama-cpp-pydist 0.19.0__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llama_cpp/binaries/{llama-b7488-bin-win-cpu-x64.zip → llama-b7631-bin-win-cpu-x64.zip} +0 -0
- llama_cpp_pydist-0.21.0.dist-info/METADATA +4684 -0
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/RECORD +240 -222
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/WHEEL +1 -1
- vendor_llama_cpp_pydist/llama.cpp/.devops/cuda-new.Dockerfile +95 -0
- vendor_llama_cpp_pydist/llama.cpp/.gemini/settings.json +1 -0
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +2 -1
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/011-bug-results.yml +13 -2
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/019-bug-misc.yml +13 -2
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/build.yml +18 -6
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/docker.yml +25 -13
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/release.yml +9 -5
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server.yml +18 -0
- vendor_llama_cpp_pydist/llama.cpp/AGENTS.md +81 -0
- vendor_llama_cpp_pydist/llama.cpp/CLAUDE.md +1 -0
- vendor_llama_cpp_pydist/llama.cpp/CONTRIBUTING.md +34 -5
- vendor_llama_cpp_pydist/llama.cpp/ci/run.sh +2 -1
- vendor_llama_cpp_pydist/llama.cpp/common/CMakeLists.txt +4 -3
- vendor_llama_cpp_pydist/llama.cpp/common/arg.cpp +46 -14
- vendor_llama_cpp_pydist/llama.cpp/common/arg.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/common/chat-parser.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/common/chat.cpp +36 -7
- vendor_llama_cpp_pydist/llama.cpp/common/chat.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/common/common.cpp +42 -23
- vendor_llama_cpp_pydist/llama.cpp/common/common.h +7 -2
- vendor_llama_cpp_pydist/llama.cpp/common/llguidance.cpp +10 -6
- vendor_llama_cpp_pydist/llama.cpp/common/regex-partial.cpp +13 -13
- vendor_llama_cpp_pydist/llama.cpp/common/sampling.cpp +58 -14
- vendor_llama_cpp_pydist/llama.cpp/common/sampling.h +3 -1
- vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py +424 -103
- vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf_update.py +5 -0
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/CANN.md +4 -0
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/OPENCL.md +51 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/SYCL.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/README.md +5 -5
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/developer.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/build.md +21 -2
- vendor_llama_cpp_pydist/llama.cpp/docs/development/parsing.md +2 -2
- vendor_llama_cpp_pydist/llama.cpp/docs/ops/Metal.csv +360 -322
- vendor_llama_cpp_pydist/llama.cpp/docs/ops.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/CMakeLists.txt +13 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml-backend.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/CMakeLists.txt +23 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-backend.cpp +11 -11
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +303 -19
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +17 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/common.h +153 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +51 -158
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +66 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cu +48 -27
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cuh +16 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/common.cuh +45 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cpy.cu +117 -103
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cumsum.cu +105 -35
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +3 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +83 -33
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mean.cu +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mma.cuh +21 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cu +34 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +168 -13
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cu +151 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cuh +14 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/softmax.cu +203 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cu +96 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cuh +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +17 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +6 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +224 -758
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +316 -164
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +5 -11
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +46 -15
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +9 -3
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +2 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +20 -20
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-impl.h +0 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +57 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +5 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +20 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +71 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +73 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +134 -13
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +21 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +13 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +14 -7
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +42 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +742 -315
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +28 -14
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +1 -7
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +2 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +17 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +42 -24
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +115 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +10 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +29 -18
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +19 -16
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +10 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +8 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +11 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +57 -22
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +312 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +54 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/constants.py +99 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/gguf_writer.py +38 -2
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/tensor_mapping.py +26 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/utility.py +0 -8
- vendor_llama_cpp_pydist/llama.cpp/grammars/README.md +3 -0
- vendor_llama_cpp_pydist/llama.cpp/include/llama.h +99 -12
- vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-cli.sh +9 -9
- vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-completion.sh +53 -0
- vendor_llama_cpp_pydist/llama.cpp/scripts/sync-ggml.last +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/CMakeLists.txt +4 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.cpp +12 -3
- vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.h +7 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.cpp +76 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.h +7 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-context.cpp +625 -40
- vendor_llama_cpp_pydist/llama.cpp/src/llama-context.h +43 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.cpp +40 -13
- vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.cpp +166 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.h +71 -6
- vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.h +6 -5
- vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.cpp +11 -4
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.cpp +23 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp +329 -26
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model.h +13 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.cpp +1259 -186
- vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.h +19 -7
- vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.cpp +101 -33
- vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama.cpp +53 -38
- vendor_llama_cpp_pydist/llama.cpp/src/models/afmoe.cpp +9 -5
- vendor_llama_cpp_pydist/llama.cpp/src/models/bert.cpp +4 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/cogvlm.cpp +5 -3
- vendor_llama_cpp_pydist/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/deepseek2.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma-embedding.cpp +2 -6
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3.cpp +3 -4
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
- vendor_llama_cpp_pydist/llama.cpp/src/models/llama-iswa.cpp +6 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/llama.cpp +19 -6
- vendor_llama_cpp_pydist/llama.cpp/src/models/maincoder.cpp +117 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/models.h +18 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/modern-bert.cpp +116 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/plamo3.cpp +128 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/smallthinker.cpp +11 -5
- vendor_llama_cpp_pydist/llama.cpp/src/unicode.cpp +23 -14
- vendor_llama_cpp_pydist/llama.cpp/tests/CMakeLists.txt +12 -2
- vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-ops.cpp +286 -65
- vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-sampler.cpp +1237 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-chat.cpp +29 -3
- vendor_llama_cpp_pydist/llama.cpp/tests/test-grammar-llguidance.cpp +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-regex-partial.cpp +14 -14
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-0.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-bpe.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-spm.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/batched-bench/batched-bench.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/cli/README.md +187 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/cli/cli.cpp +1 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/completion/README.md +179 -7
- vendor_llama_cpp_pydist/llama.cpp/tools/completion/completion.cpp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/fit-params.cpp +3 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/llama-bench/llama-bench.cpp +18 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/CMakeLists.txt +1 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-impl.h +12 -7
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-model.h +3 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp +118 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/models.h +10 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/siglip.cpp +9 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/whisper-enc.cpp +9 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/youtuvl.cpp +179 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.cpp +5 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.h +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/quantize/quantize.cpp +6 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/CMakeLists.txt +0 -8
- vendor_llama_cpp_pydist/llama.cpp/tools/server/README-dev.md +2 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/README.md +27 -14
- vendor_llama_cpp_pydist/llama.cpp/tools/server/public/index.html.gz +0 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.cpp +22 -24
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.h +2 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.cpp +453 -267
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.h +52 -15
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-http.cpp +16 -10
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.cpp +174 -62
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.h +14 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.cpp +78 -21
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.h +48 -10
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.cpp +36 -11
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.h +28 -35
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server.cpp +9 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_chat_completion.py +11 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_sleep.py +39 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/utils.py +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte +25 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte +66 -13
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte +5 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/constants/settings-config.ts +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts +125 -11
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/chat.ts +15 -8
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts +12 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/settings.svelte.ts +4 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/api.d.ts +5 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/settings.d.ts +2 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/clipboard.ts +1 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/routes/+layout.svelte +1 -1
- llama_cpp_pydist-0.19.0.dist-info/METADATA +0 -2506
- vendor_llama_cpp_pydist/llama.cpp/.github/copilot-instructions.md +0 -262
- {llama_cpp_pydist-0.19.0.dist-info/licenses → llama_cpp_pydist-0.21.0.dist-info}/LICENSE +0 -0
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/top_level.txt +0 -0
vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py

```diff
@@ -141,16 +141,24 @@ class ModelBase:
         self.model_name = model_name
         self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py

-        # Apply heuristics to figure out typical tensor encoding based on first
+        # Apply heuristics to figure out typical tensor encoding based on first tensor's dtype
+        # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
         if self.ftype == gguf.LlamaFileType.GUESSED:
-
-
-
-
-
+            for _, tensor in self.get_tensors():
+                if tensor.dim() < 2:
+                    continue
+
+                if tensor.dtype == torch.bfloat16:
+                    self.ftype = gguf.LlamaFileType.MOSTLY_BF16
+                    logger.info("heuristics detected bfloat16 tensor dtype, setting --outtype bf16")
+                    break
+                elif tensor.dtype == torch.float16:
+                    self.ftype = gguf.LlamaFileType.MOSTLY_F16
+                    logger.info("heuristics detected float16 tensor dtype, setting --outtype f16")
+                    break
             else:
-
-
+                self.ftype = gguf.LlamaFileType.MOSTLY_F16
+                logger.info("heuristics unable to detect tensor dtype, defaulting to --outtype f16")

         self.dequant_model()

@@ -763,9 +771,14 @@ class TextModel(ModelBase):

         self.rope_parameters = self.hparams.get("rope_parameters", self.hparams.get("rope_scaling")) or {}

+        rope_theta = self.find_hparam(["rope_theta", "global_rope_theta", "rotary_emb_base"], optional=True)
+        local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "swa_rope_theta", "rope_local_base_freq"], optional=True)
+
         # Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters
         if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
-            if
+            if local_rope_theta is not None:
+                self.rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta}
+            if "rope_theta" not in self.rope_parameters and rope_theta is not None:
                 self.rope_parameters["rope_theta"] = rope_theta
             if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None:
                 self.rope_parameters["rope_type"] = rope_type
@@ -831,6 +844,7 @@ class TextModel(ModelBase):
         self.gguf_writer.add_head_count_kv(n_head_kv)
         logger.info(f"gguf: key-value head count = {n_head_kv}")

+        # TODO: Handle "sliding_attention" similarly when models start implementing it
         rope_params = self.rope_parameters.get("full_attention", self.rope_parameters)
         if (rope_type := rope_params.get("rope_type")) is not None:
             rope_factor = rope_params.get("factor")
@@ -877,6 +891,9 @@ class TextModel(ModelBase):
         if (rope_theta := rope_params.get("rope_theta")) is not None:
             self.gguf_writer.add_rope_freq_base(rope_theta)
             logger.info(f"gguf: rope theta = {rope_theta}")
+        if (local_rope_theta := self.rope_parameters.get("sliding_attention", {}).get("rope_theta")) is not None:
+            self.gguf_writer.add_rope_freq_base_swa(local_rope_theta)
+            logger.info(f"gguf: rope theta swa = {local_rope_theta}")
         if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None:
             self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
             logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
@@ -1054,6 +1071,9 @@ class TextModel(ModelBase):
         if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273":
             # ref: https://huggingface.co/alvarobartt/grok-2-tokenizer
             res = "grok-2"
+        if chkhsh == "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df":
+            # ref: https://huggingface.co/aari1995/German_Semantic_V3
+            res = "jina-v2-de"
         if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
             res = "llama-bpe"
@@ -1204,6 +1224,9 @@ class TextModel(ModelBase):
         if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
             # ref: https://huggingface.co/JetBrains/Mellum-4b-base
             res = "mellum"
+        if chkhsh == "a0b64b4385f123663873756336c085744376d015ff328bb1d901598f63c44152":
+            # ref: https://huggingface.co/answerdotai/ModernBERT-base
+            res = "modern-bert"
         if chkhsh == "49fc0303c9e0d2c2c565c510f64b2d9b271276acdcdadff733249eda9f7d59df":
             # ref: https://huggingface.co/arcee-ai/Trinity-Tokenizer
             res = "afmoe"
@@ -1219,6 +1242,12 @@ class TextModel(ModelBase):
         if chkhsh == "4a2e2abae11ca2b86d570fc5b44be4d5eb5e72cc8f22dd136a94b37da83ab665":
             # ref: https://huggingface.co/KORMo-Team/KORMo-tokenizer
             res = "kormo"
+        if chkhsh == "9d70134b369a70e5735009b6de918f7581b5211f7c074d1f89f753aea8248af1":
+            # ref: https://huggingface.co/tencent/Youtu-LLM-2B
+            res = "youtu"
+        if chkhsh == "16389f0a1f51ee53e562ffd51c371dc508639ab0e4261502071836e50e223e91":
+            # ref: https://huggingface.co/upstage/Solar-Open-100B
+            res = "solar-open"

         if res is None:
             logger.warning("\n")
@@ -1685,6 +1714,84 @@ class TextModel(ModelBase):
         if template is not None:
             self.gguf_writer.add_chat_template(template)

+    def _set_vocab_plamo(self):
+        # PLaMo models use a custom tokenizer with a .jsonl file
+        tokenizer_jsonl_path = self.dir_model / "tokenizer.jsonl"
+        tokenizer_config_path = self.dir_model / "tokenizer_config.json"
+
+        if not tokenizer_jsonl_path.is_file():
+            raise FileNotFoundError(f"PLaMo tokenizer file not found: {tokenizer_jsonl_path}")
+
+        # Load tokenizer config
+        with open(tokenizer_config_path, "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+
+        # Load tokens from JSONL file (actually a list format)
+        tokens = []
+        scores = []
+        toktypes = []
+
+        with open(tokenizer_jsonl_path, "r", encoding="utf-8") as f:
+            for line_num, line in enumerate(f):
+                if line.strip():
+                    token_data = json.loads(line)
+                    # Format: [token, score, type, ?, ?, ?, ?]
+                    token = token_data[0].encode("utf-8")
+                    score = float(token_data[1])
+                    token_type_str = token_data[2] if len(token_data) > 2 else "NORMAL"
+
+                    tokens.append(token)
+                    scores.append(score)
+
+                    if token_type_str == "UNKNOWN":
+                        toktypes.append(gguf.TokenType.UNKNOWN)
+                    elif token_type_str == "CONTROL":
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    elif token_type_str == "BYTE":
+                        toktypes.append(gguf.TokenType.BYTE)
+                    else:
+                        token_str = token_data[0]
+                        if token_str.startswith("<|plamo:") and token_str.endswith("|>"):
+                            toktypes.append(gguf.TokenType.CONTROL)
+                        else:
+                            toktypes.append(gguf.TokenType.NORMAL)
+
+        vocab_size = self.hparams["vocab_size"]
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(gguf.TokenType.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("plamo2")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] is not None:
+            token_id = tokens.index(tokenizer_config["bos_token"].encode("utf-8"))
+            self.gguf_writer.add_bos_token_id(token_id)
+        if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] is not None:
+            token_id = tokens.index(tokenizer_config["eos_token"].encode("utf-8"))
+            self.gguf_writer.add_eos_token_id(token_id)
+        if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] is not None:
+            token_id = tokens.index(tokenizer_config["pad_token"].encode("utf-8"))
+            self.gguf_writer.add_pad_token_id(token_id)
+        if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] is not None:
+            token_id = tokens.index(tokenizer_config["sep_token"].encode("utf-8"))
+            self.gguf_writer.add_sep_token_id(token_id)
+        if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] is not None:
+            token_id = tokens.index(tokenizer_config["unk_token"].encode("utf-8"))
+            self.gguf_writer.add_unk_token_id(token_id)
+
+        # Add <|plamo:op|> as EOT to ensure appropriate end of generation
+        self.gguf_writer.add_eot_token_id(4)
+
+        self.gguf_writer.add_add_space_prefix(False)
+

 class MmprojModel(ModelBase):
     model_type = ModelType.MMPROJ
@@ -2397,6 +2504,7 @@ class StableLMModel(TextModel):
     "VLlama3ForCausalLM",
     "LlavaForConditionalGeneration",
     "VoxtralForConditionalGeneration",
+    "IQuestCoderForCausalLM",
     "LlamaModel")
 class LlamaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
@@ -3414,7 +3522,7 @@ class QwenModel(TextModel):
         self._set_vocab_qwen()


-@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM")
+@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM", "AudioFlamingo3ForConditionalGeneration")
 class Qwen2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.QWEN2

@@ -4787,87 +4895,7 @@ class Plamo2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.PLAMO2

     def set_vocab(self):
-
-        # We need to handle this specially
-        tokenizer_jsonl_path = self.dir_model / "tokenizer.jsonl"
-        tokenizer_config_path = self.dir_model / "tokenizer_config.json"
-
-        if not tokenizer_jsonl_path.is_file():
-            raise FileNotFoundError(f"PLaMo 2 tokenizer file not found: {tokenizer_jsonl_path}")
-
-        # Load tokenizer config
-        with open(tokenizer_config_path, 'r', encoding='utf-8') as f:
-            tokenizer_config = json.load(f)
-
-        # Load tokens from JSONL file (actually a list format)
-        tokens = []
-        scores = []
-        toktypes = []
-
-        with open(tokenizer_jsonl_path, 'r', encoding='utf-8') as f:
-            for line_num, line in enumerate(f):
-                if line.strip():
-                    token_data = json.loads(line)
-                    # Format: [token, score, type, ?, ?, ?, ?]
-                    token = token_data[0].encode("utf-8")
-                    score = float(token_data[1])
-                    token_type_str = token_data[2] if len(token_data) > 2 else "NORMAL"
-
-                    tokens.append(token)
-                    scores.append(score)
-
-                    # Map token type strings to GGUF token types
-                    if token_type_str == "UNKNOWN":
-                        toktypes.append(gguf.TokenType.UNKNOWN)
-                    elif token_type_str == "CONTROL":
-                        toktypes.append(gguf.TokenType.CONTROL)
-                    elif token_type_str == "BYTE":
-                        toktypes.append(gguf.TokenType.BYTE)
-                    else:
-                        # Check for PLaMo-2 special tokens
-                        token_str = token_data[0]
-                        if token_str.startswith("<|plamo:") and token_str.endswith("|>"):
-                            toktypes.append(gguf.TokenType.CONTROL)
-                        else:
-                            toktypes.append(gguf.TokenType.NORMAL)
-
-        vocab_size = self.hparams["vocab_size"]
-        if vocab_size > len(tokens):
-            pad_count = vocab_size - len(tokens)
-            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
-            for i in range(1, pad_count + 1):
-                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
-                scores.append(-1000.0)
-                toktypes.append(gguf.TokenType.UNUSED)
-
-        # Use "plamo2" tokenizer type for PLaMo-2's custom Aho-Corasick tokenizer
-        self.gguf_writer.add_tokenizer_model("plamo2")
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        # Add special tokens from config
-        if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] is not None:
-            token_id = tokens.index(tokenizer_config["bos_token"].encode("utf-8"))
-            self.gguf_writer.add_bos_token_id(token_id)
-        if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] is not None:
-            token_id = tokens.index(tokenizer_config["eos_token"].encode("utf-8"))
-            self.gguf_writer.add_eos_token_id(token_id)
-        if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] is not None:
-            token_id = tokens.index(tokenizer_config["pad_token"].encode("utf-8"))
-            self.gguf_writer.add_pad_token_id(token_id)
-        if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] is not None:
-            token_id = tokens.index(tokenizer_config["sep_token"].encode("utf-8"))
-            self.gguf_writer.add_sep_token_id(token_id)
-        if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] is not None:
-            token_id = tokens.index(tokenizer_config["unk_token"].encode("utf-8"))
-            self.gguf_writer.add_unk_token_id(token_id)
-
-        # Add <|plamo:op|> as EOT to ensure appropriate end of generation
-        self.gguf_writer.add_eot_token_id(4)
-
-        self.gguf_writer.add_add_space_prefix(False)
+        self._set_vocab_plamo()

     def set_gguf_parameters(self):
         hparams = self.hparams
@@ -4955,6 +4983,55 @@ class Plamo2Model(TextModel):
         return [(new_name, data_torch)]


+@ModelBase.register("Plamo3ForCausalLM", "PLaMo3ForCausalLM")
+class Plamo3Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.PLAMO3
+
+    def set_vocab(self):
+        self._set_vocab_plamo()
+
+        tokenizer_config_path = self.dir_model / "tokenizer_config.json"
+        tokenizer_config = {}
+
+        if tokenizer_config_path.is_file():
+            with open(tokenizer_config_path, encoding="utf-8") as f:
+                tokenizer_config = json.load(f)
+
+        chat_template = tokenizer_config.get("chat_template")
+        chat_template_jinja = self.dir_model / "chat_template.jinja"
+
+        if chat_template_jinja.is_file():
+            with open(chat_template_jinja, encoding="utf-8") as f:
+                chat_template = f.read()
+
+        if chat_template:
+            self.gguf_writer.add_chat_template(chat_template)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+        if (sliding_window := self.find_hparam(["window_size", "sliding_window"], optional=True)) is not None:
+            self.gguf_writer.add_sliding_window(sliding_window)
+            self.gguf_writer.add_sliding_window_pattern(self.hparams["sliding_window_pattern"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+
+        if name.endswith(".pre_mixer_norm.weight"):
+            data_torch = data_torch + 1.0
+        elif name.endswith(".post_mixer_norm.weight"):
+            data_torch = data_torch + 1.0 / 5
+        elif name.endswith(".pre_mlp_norm.weight"):
+            data_torch = data_torch + 1.0
+        elif name.endswith(".post_mlp_norm.weight"):
+            data_torch = data_torch + 1.0 / (5**1.5)
+        elif name.endswith((".mixer.q_norm.weight", ".mixer.k_norm.weight")):
+            data_torch = data_torch + 1.0
+        elif name.endswith(".norm.weight"):
+            data_torch = data_torch + 1.0
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 @ModelBase.register("CodeShellForCausalLM")
 class CodeShellModel(TextModel):
     model_arch = gguf.MODEL_ARCH.CODESHELL
@@ -5225,13 +5302,14 @@ class BertModel(TextModel):
         self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))

         # convert to phantom space vocab
-        def phantom(tok):
-            if
+        def phantom(tok, toktype):
+            if toktype == gguf.TokenType.CONTROL:
                 return tok
             if tok.startswith("##"):
                 return tok[2:]
             return "\u2581" + tok
-        tokens
+        assert len(tokens) == len(toktypes)
+        tokens = list(map(phantom, tokens, toktypes))

         # add vocab to gguf
         self.gguf_writer.add_tokenizer_model("bert")
@@ -6345,6 +6423,17 @@ class ARwkv7Model(Rwkv7Model):
         self.gguf_writer.add_head_count(0)


+@ModelBase.register("MaincoderForCausalLM")
+class MaincoderModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.MAINCODER
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        if (head_dim := self.hparams.get("head_dim")) is not None:
+            self.gguf_writer.add_rope_dimension_count(head_dim)
+
+
 @ModelBase.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
 class MambaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.MAMBA
@@ -7122,6 +7211,7 @@ class DeepseekModel(TextModel):
     "DeepseekV2ForCausalLM",
     "DeepseekV3ForCausalLM",
     "KimiVLForConditionalGeneration",
+    "YoutuForCausalLM",
 )
 class DeepseekV2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2
@@ -7188,7 +7278,15 @@ class DeepseekV2Model(TextModel):
         super().set_gguf_parameters()
         hparams = self.hparams

-
+        # first_k_dense_replace: number of leading layers using dense FFN instead of MoE
+        # For non-MoE models (like Youtu), set to n_layer to use dense FFN for all layers
+        # For MoE models (like DeepSeek-V2), this is the number of leading non-MoE layers
+        has_moe = hparams.get("n_routed_experts") is not None
+        first_k_dense_replace = hparams.get("first_k_dense_replace")
+        if first_k_dense_replace is None:
+            # Default: if no MoE, all layers are dense; if MoE, none are dense
+            first_k_dense_replace = hparams["num_hidden_layers"] if not has_moe else 0
+        self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
         if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
             self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
@@ -7200,11 +7298,24 @@ class DeepseekV2Model(TextModel):
         self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
         self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])

-
-
-        self.
-        self.gguf_writer.
-
+        # MoE parameters (required by C++ code for DEEPSEEK2 arch)
+        # For non-MoE models like Youtu, use intermediate_size as expert_feed_forward_length
+        moe_intermediate_size = self.find_hparam(["moe_intermediate_size", "intermediate_size"], optional=False)
+        self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+
+        if (n_routed_experts := hparams.get("n_routed_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_routed_experts)
+
+        # expert_shared_count is required by C++ code, default to 0 for non-MoE models
+        n_shared_experts = hparams.get("n_shared_experts", 0)
+        self.gguf_writer.add_expert_shared_count(n_shared_experts)
+
+        # When not set, C++ code will use scale_w = false to skip the no-op scaling
+        if (routed_scaling_factor := hparams.get("routed_scaling_factor")) is not None:
+            self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
+
+        if (norm_topk_prob := hparams.get("norm_topk_prob")) is not None and norm_topk_prob:
+            self.gguf_writer.add_expert_weights_norm(norm_topk_prob)

         self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])

@@ -7220,10 +7331,17 @@ class DeepseekV2Model(TextModel):
         # skip vision tensors and remove "language_model." for Kimi-VL
         if "vision_tower" in name or "multi_modal_projector" in name:
             return []
-
+        if name.startswith("siglip2.") or name.startswith("merger."):
+            return []
         if name.startswith("language_model."):
             name = name.replace("language_model.", "")

+        # skip lm_head.weight if tie_word_embeddings is True
+        if self.hparams.get("tie_word_embeddings", False):
+            if name == "lm_head.weight" or name == "model.lm_head.weight":
+                logger.info("Skipping tied output layer 'lm_head.weight' (will use token_embd.weight)")
+                return []
+
         # rename e_score_correction_bias tensors
         if name.endswith("e_score_correction_bias"):
             name = name.replace("e_score_correction_bias", "e_score_correction.bias")
@@ -7351,6 +7469,89 @@ class MiniMaxM2Model(TextModel):
         return super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("MiMoV2FlashForCausalLM")
+class MimoV2Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.MIMO2
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        assert self.hparams["swa_head_dim"] == self.hparams["head_dim"]
+        assert self.hparams["swa_num_attention_heads"] == self.hparams["num_attention_heads"]
+        assert self.hparams["swa_v_head_dim"] == self.hparams["v_head_dim"]
+        assert self.hparams["topk_method"] == "noaux_tc"
+
+        n_head_kv = self.hparams["num_key_value_heads"]
+        n_head_kv_swa = self.hparams["swa_num_key_value_heads"]
+        n_head_kv_arr = [n_head_kv_swa if use_swa == 1 else n_head_kv for use_swa in self.hparams["hybrid_layer_pattern"]]
+        self.gguf_writer.add_head_count_kv(n_head_kv_arr)
+
+        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
+        self.gguf_writer.add_sliding_window_pattern(self.hparams["hybrid_layer_pattern"])
+        self.gguf_writer.add_value_length(self.hparams["v_head_dim"])
+        self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
+        self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
+
+        rope_dim = int(self.hparams["head_dim"] * self.hparams["partial_rotary_factor"])
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5))
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch, name, bid):
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+        if "attention_sink" in name and not name.endswith(".weight"):
+            name += ".weight"
+
+        # TODO: mimo v2 does not indicate the number of next-token-prediction layers, therefore we cannot do the same way as GLM4_MOE
+        if "model.mtp." in name:
+            return []
+
+        # process the experts separately
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["n_routed_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["gate_proj", "up_proj", "down_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename_to_retrieve])
+                        del self._experts[bid][ename_to_retrieve]
+
+                    data_torch = torch.stack(datas, dim=0)
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+                    new_name = self.map_tensor_name(merged_name)
+                    tensors.append((new_name, data_torch))
+
+                return tensors
+            else:
+                return []
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @ModelBase.register("PanguEmbeddedForCausalLM")
 class PanguEmbeddedModel(TextModel):
     model_arch = gguf.MODEL_ARCH.PANGU_EMBED
@@ -8684,6 +8885,11 @@ class NemotronHModel(GraniteHybridModel):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@ModelBase.register("LlamaBidirectionalModel")
+class LlamaEmbedNemotronModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.LLAMA_EMBED
+
+
 @ModelBase.register("BailingMoeForCausalLM")
 class BailingMoeModel(TextModel):
     model_arch = gguf.MODEL_ARCH.BAILINGMOE
@@ -9144,6 +9350,19 @@ class VoxtralWhisperEncoderModel(WhisperEncoderModel):
         self.gguf_writer.add_audio_stack_factor(4)  # == intermediate_size // hidden_size


+@ModelBase.register("AudioFlamingo3ForConditionalGeneration")
+class AudioFlamingo3WhisperEncoderModel(WhisperEncoderModel):
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MUSIC_FLAMINGO)
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        if ".conv" in name and ".weight" in name:
+            # Was trained in BF16, being safe, avoiding quantizing to FP16
+            return gguf.GGMLQuantizationType.F32
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+
 @ModelBase.register("FalconH1ForCausalLM")
 class FalconH1Model(Mamba2Model):
     model_arch = gguf.MODEL_ARCH.FALCON_H1
@@ -9991,6 +10210,35 @@ class SmallThinkerModel(TextModel):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@ModelBase.register("ModernBertModel", "ModernBertForMaskedLM", "ModernBertForSequenceClassification")
+class ModernBertModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.MODERN_BERT
+
+    def set_vocab(self):
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(True)
+        self.gguf_writer.add_add_sep_token(True)
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_sliding_window(self.hparams["local_attention"])
+        if (sliding_window_pattern := self.hparams.get("global_attn_every_n_layers")) is not None:
+            self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # these layers act as MLM head, so we don't need them
+        if name.startswith("decoder."):
+            return []
+
+        if name.startswith("model."):
+            name = name[6:]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("ApertusForCausalLM")
 class ApertusModel(LlamaModel):
     model_arch = gguf.MODEL_ARCH.APERTUS
@@ -10426,6 +10674,79 @@ class JanusProVisionModel(MmprojModel):
         return []


+@ModelBase.register("YOUTUVLForConditionalGeneration", "YOUTUVLForCausalLM")
+class YOUTUVLVisionModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.YOUTUVL)
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
+
+        # Handle activation function
+        hidden_act = str(self.hparams.get("hidden_act", "gelu_pytorch_tanh")).lower()
+        if hidden_act in ("gelu", "gelu_pytorch_tanh", "gelu_fast", "gelu_new", "gelu_accurate"):
+            self.gguf_writer.add_vision_use_gelu(True)
+        elif hidden_act == "silu":
+            self.gguf_writer.add_vision_use_silu(True)
+        else:
+            raise ValueError(f"Unsupported activation function for YOUTUVL: {hidden_act}")
+
+        self.gguf_writer.add_vision_spatial_merge_size(self.hparams.get("spatial_merge_size", 2))
+
+        window_size = self.hparams.get("window_size")
+        if window_size is not None:
+            self.gguf_writer.add_vision_window_size(window_size)
+            # fullatt_block_indexes contains explicit layer indices that use full attention
+            # e.g., [2, 5, 8, 11] means layers 2, 5, 8, 11 use full attention
+            # All other layers use window attention
+            fullatt_block_indexes = self.hparams.get("fullatt_block_indexes")
+            assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for youtuvl"
+            # Store the explicit layer indices for YoutuVL (irregular pattern approach)
+            self.gguf_writer.add_vision_wa_layer_indexes(layers=fullatt_block_indexes)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # Skip language model tensors
+        skip_prefixes = ('lm_head.', 'model.layers.', 'model.embed_tokens.', 'model.norm.')
+        if name.startswith(skip_prefixes):
+            return []
+
+        # Try to map the tensor using TensorNameMap (handles vision encoder and projector)
+        try:
+            new_name = self.map_tensor_name(name)
+            return [(new_name, data_torch)]
+        except ValueError:
+            # If mapping fails, log warning and skip
+            logger.warning(f"Cannot map tensor: {name}")
+            return []
+
+
+@ModelBase.register("SolarOpenForCausalLM")
+class SolarOpenModel(Glm4MoeModel):
+    model_arch = gguf.MODEL_ARCH.GLM4_MOE
+
+    def set_vocab(self):
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<unk>"])
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|startoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+
 ###### CONVERSION LOGIC ######

@@ -10557,8 +10878,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="auto",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type",
    )
    parser.add_argument(
        "--bigendian", action="store_true",
```