@novastera-oss/llamarn 0.4.0 → 0.4.3-beta4
This diff reflects the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- package/RNLlamaCpp.podspec +4 -1
- package/android/CMakeLists.txt +13 -3
- package/android/src/main/cpp/include/llama.h +44 -21
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +2 -10
- package/cpp/SystemUtils.cpp +3 -7
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +12 -0
- package/cpp/llama.cpp/CODEOWNERS +116 -10
- package/cpp/llama.cpp/CONTRIBUTING.md +30 -3
- package/cpp/llama.cpp/README.md +13 -5
- package/cpp/llama.cpp/build-xcframework.sh +5 -0
- package/cpp/llama.cpp/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
- package/cpp/llama.cpp/common/CMakeLists.txt +12 -2
- package/cpp/llama.cpp/common/arg.cpp +303 -795
- package/cpp/llama.cpp/common/arg.h +2 -3
- package/cpp/llama.cpp/common/chat-parser-xml-toolcall.cpp +861 -0
- package/cpp/llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
- package/cpp/llama.cpp/common/chat-parser.cpp +156 -15
- package/cpp/llama.cpp/common/chat-parser.h +13 -0
- package/cpp/llama.cpp/common/chat.cpp +1147 -88
- package/cpp/llama.cpp/common/chat.h +16 -3
- package/cpp/llama.cpp/common/common.cpp +70 -15
- package/cpp/llama.cpp/common/common.h +57 -19
- package/cpp/llama.cpp/common/download.cpp +1072 -0
- package/cpp/llama.cpp/common/download.h +55 -0
- package/cpp/llama.cpp/common/http.h +73 -0
- package/cpp/llama.cpp/common/json-partial.cpp +70 -2
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +61 -22
- package/cpp/llama.cpp/common/json-schema-to-grammar.h +2 -0
- package/cpp/llama.cpp/common/log.cpp +59 -2
- package/cpp/llama.cpp/common/log.h +12 -4
- package/cpp/llama.cpp/common/sampling.cpp +84 -8
- package/cpp/llama.cpp/common/sampling.h +3 -1
- package/cpp/llama.cpp/common/speculative.cpp +1 -1
- package/cpp/llama.cpp/convert_hf_to_gguf.py +1608 -233
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +6 -1
- package/cpp/llama.cpp/convert_lora_to_gguf.py +37 -5
- package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -28
- package/cpp/llama.cpp/ggml/include/ggml-backend.h +19 -1
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
- package/cpp/llama.cpp/ggml/include/ggml-metal.h +1 -6
- package/cpp/llama.cpp/ggml/include/ggml-rpc.h +7 -9
- package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +2 -1
- package/cpp/llama.cpp/ggml/include/ggml.h +199 -6
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +38 -0
- package/cpp/llama.cpp/ggml/src/ggml-alloc.c +299 -130
- package/cpp/llama.cpp/ggml/src/ggml-backend-impl.h +4 -4
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +21 -5
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +99 -2
- package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +57 -45
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +138 -47
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +1584 -1773
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +201 -317
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +146 -187
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +771 -713
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +135 -77
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +5 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +16 -17
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +318 -145
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +155 -60
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +8 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +14 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +108 -64
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +14 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +530 -87
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +37 -45
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +349 -127
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +947 -1218
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +143 -29
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +82 -76
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +233 -28
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +326 -66
- package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +12 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/argsort.cu +102 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +110 -76
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +167 -38
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +6 -11
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +12 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +245 -151
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cuh +1 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +341 -289
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cu +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cuh +1233 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec.cuh +586 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +6 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +123 -220
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +41 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +715 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +150 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cuh +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +321 -24
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +93 -351
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +828 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cu +164 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +3 -166
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cu +371 -78
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cuh +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +279 -147
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +97 -85
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad.cu +46 -23
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +63 -54
- package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +12 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +192 -77
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +137 -75
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cu +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cuh +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +40 -19
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +336 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/tsembd.cu +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +105 -11
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +36 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +87 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +28 -12
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/CMakeLists.txt +68 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3807 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +40 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +442 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +40 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +69 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +119 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +156 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +64 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-exp.c +93 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.c +60 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.c +960 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +1032 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +829 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2223 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +418 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +255 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.c +448 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +220 -0
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +8 -13
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +110 -12
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +6 -5
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.m +599 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1662 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +251 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +1527 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +244 -39
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +3844 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +90 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.cpp +723 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +3453 -1907
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +10 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1331 -109
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +126 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +31 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +35 -7
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +31 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +123 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +341 -161
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +6 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +74 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +50 -30
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +10 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +166 -99
- package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.cpp +79 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.hpp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +72 -94
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +67 -49
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +21 -31
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +252 -316
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +6 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +9 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +359 -142
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +80 -60
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +201 -132
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +230 -55
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.hpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.cpp +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.hpp +24 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.cpp +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.cpp +122 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.hpp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +50 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set.cpp +73 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set.hpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +45 -36
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +330 -165
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +12 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +16 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4184 -2159
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +53 -30
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +13 -6
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +138 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +7 -6
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +5 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +52 -14
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +50 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +61 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +54 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +5 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +21 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +28 -18
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +15 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +125 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +229 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +33 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +3 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +3 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +3 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +106 -634
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +118 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +556 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +70 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +77 -214
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +589 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +25 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +55 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +45 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +227 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +5 -52
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +5 -35
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +5 -35
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +5 -41
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +6 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +140 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +5 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +171 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +79 -29
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +471 -196
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1690 -383
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +57 -10
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +25 -912
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/{set_rows.wgsl → set_rows.tmpl.wgsl} +38 -8
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +96 -314
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +440 -17
- package/cpp/llama.cpp/ggml/src/gguf.cpp +104 -29
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +363 -13
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +64 -0
- package/cpp/llama.cpp/gguf-py/gguf/lazy.py +8 -3
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +6 -0
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +156 -18
- package/cpp/llama.cpp/gguf-py/gguf/utility.py +80 -0
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +4 -4
- package/cpp/llama.cpp/include/llama.h +44 -21
- package/cpp/llama.cpp/media/llama1-icon-transparent.png +0 -0
- package/cpp/llama.cpp/media/llama1-icon-transparent.svg +77 -0
- package/cpp/llama.cpp/media/llama1-icon.png +0 -0
- package/cpp/llama.cpp/media/llama1-icon.svg +87 -0
- package/cpp/llama.cpp/requirements/requirements-all.txt +2 -0
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -3
- package/cpp/llama.cpp/requirements/requirements-convert_legacy_llama.txt +3 -1
- package/cpp/llama.cpp/requirements/requirements-tool_bench.txt +1 -1
- package/cpp/llama.cpp/src/CMakeLists.txt +101 -0
- package/cpp/llama.cpp/src/llama-adapter.cpp +33 -0
- package/cpp/llama.cpp/src/llama-adapter.h +3 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +344 -14
- package/cpp/llama.cpp/src/llama-arch.h +50 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +63 -31
- package/cpp/llama.cpp/src/llama-batch.h +13 -2
- package/cpp/llama.cpp/src/llama-chat.cpp +85 -3
- package/cpp/llama.cpp/src/llama-chat.h +4 -0
- package/cpp/llama.cpp/src/llama-context.cpp +300 -45
- package/cpp/llama.cpp/src/llama-context.h +16 -6
- package/cpp/llama.cpp/src/llama-cparams.h +2 -1
- package/cpp/llama.cpp/src/llama-grammar.cpp +17 -9
- package/cpp/llama.cpp/src/llama-graph.cpp +226 -64
- package/cpp/llama.cpp/src/llama-graph.h +27 -5
- package/cpp/llama.cpp/src/llama-hparams.cpp +53 -2
- package/cpp/llama.cpp/src/llama-hparams.h +48 -8
- package/cpp/llama.cpp/src/llama-impl.cpp +3 -3
- package/cpp/llama.cpp/src/llama-impl.h +2 -0
- package/cpp/llama.cpp/src/llama-kv-cache-iswa.cpp +13 -3
- package/cpp/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +120 -62
- package/cpp/llama.cpp/src/llama-kv-cache.h +13 -4
- package/cpp/llama.cpp/src/llama-kv-cells.h +44 -2
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +19 -9
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +2 -0
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +38 -17
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +5 -2
- package/cpp/llama.cpp/src/llama-memory.h +3 -0
- package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1070 -12614
- package/cpp/llama.cpp/src/llama-model.h +40 -4
- package/cpp/llama.cpp/src/llama-quant.cpp +14 -6
- package/cpp/llama.cpp/src/llama-sampling.cpp +243 -136
- package/cpp/llama.cpp/src/llama-vocab.cpp +43 -3
- package/cpp/llama.cpp/src/llama-vocab.h +43 -39
- package/cpp/llama.cpp/src/llama.cpp +69 -10
- package/cpp/llama.cpp/src/models/afmoe.cpp +187 -0
- package/cpp/llama.cpp/src/models/apertus.cpp +125 -0
- package/cpp/llama.cpp/src/models/arcee.cpp +135 -0
- package/cpp/llama.cpp/src/models/arctic.cpp +138 -0
- package/cpp/llama.cpp/src/models/arwkv7.cpp +86 -0
- package/cpp/llama.cpp/src/models/baichuan.cpp +122 -0
- package/cpp/llama.cpp/src/models/bailingmoe.cpp +144 -0
- package/cpp/llama.cpp/src/models/bailingmoe2.cpp +135 -0
- package/cpp/llama.cpp/src/models/bert.cpp +176 -0
- package/cpp/llama.cpp/src/models/bitnet.cpp +160 -0
- package/cpp/llama.cpp/src/models/bloom.cpp +101 -0
- package/cpp/llama.cpp/src/models/chameleon.cpp +178 -0
- package/cpp/llama.cpp/src/models/chatglm.cpp +132 -0
- package/cpp/llama.cpp/src/models/codeshell.cpp +111 -0
- package/cpp/llama.cpp/src/models/cogvlm.cpp +100 -0
- package/cpp/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
- package/cpp/llama.cpp/src/models/command-r.cpp +122 -0
- package/cpp/llama.cpp/src/models/dbrx.cpp +123 -0
- package/cpp/llama.cpp/src/models/deci.cpp +135 -0
- package/cpp/llama.cpp/src/models/deepseek.cpp +144 -0
- package/cpp/llama.cpp/src/models/deepseek2.cpp +237 -0
- package/cpp/llama.cpp/src/models/dots1.cpp +134 -0
- package/cpp/llama.cpp/src/models/dream.cpp +105 -0
- package/cpp/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
- package/cpp/llama.cpp/src/models/ernie4-5.cpp +110 -0
- package/cpp/llama.cpp/src/models/exaone.cpp +114 -0
- package/cpp/llama.cpp/src/models/exaone4.cpp +123 -0
- package/cpp/llama.cpp/src/models/falcon-h1.cpp +113 -0
- package/cpp/llama.cpp/src/models/falcon.cpp +120 -0
- package/cpp/llama.cpp/src/models/gemma-embedding.cpp +120 -0
- package/cpp/llama.cpp/src/models/gemma.cpp +112 -0
- package/cpp/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
- package/cpp/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
- package/cpp/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
- package/cpp/llama.cpp/src/models/glm4-moe.cpp +153 -0
- package/cpp/llama.cpp/src/models/glm4.cpp +127 -0
- package/cpp/llama.cpp/src/models/gpt2.cpp +105 -0
- package/cpp/llama.cpp/src/models/gptneox.cpp +144 -0
- package/cpp/llama.cpp/src/models/granite-hybrid.cpp +196 -0
- package/cpp/llama.cpp/src/models/granite.cpp +211 -0
- package/cpp/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
- package/cpp/llama.cpp/src/models/grok.cpp +159 -0
- package/cpp/llama.cpp/src/models/grovemoe.cpp +141 -0
- package/cpp/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
- package/cpp/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
- package/cpp/llama.cpp/src/models/internlm2.cpp +120 -0
- package/cpp/llama.cpp/src/models/jais.cpp +86 -0
- package/cpp/llama.cpp/src/models/jamba.cpp +106 -0
- package/cpp/llama.cpp/src/models/lfm2.cpp +173 -0
- package/cpp/llama.cpp/src/models/llada-moe.cpp +122 -0
- package/cpp/llama.cpp/src/models/llada.cpp +99 -0
- package/cpp/llama.cpp/src/models/llama-iswa.cpp +174 -0
- package/cpp/llama.cpp/src/models/llama.cpp +155 -0
- package/cpp/llama.cpp/src/models/mamba.cpp +55 -0
- package/cpp/llama.cpp/src/models/minicpm3.cpp +199 -0
- package/cpp/llama.cpp/src/models/minimax-m2.cpp +124 -0
- package/cpp/llama.cpp/src/models/models.h +485 -0
- package/cpp/llama.cpp/src/models/mpt.cpp +126 -0
- package/cpp/llama.cpp/src/models/nemotron-h.cpp +121 -0
- package/cpp/llama.cpp/src/models/nemotron.cpp +122 -0
- package/cpp/llama.cpp/src/models/neo-bert.cpp +104 -0
- package/cpp/llama.cpp/src/models/olmo.cpp +121 -0
- package/cpp/llama.cpp/src/models/olmo2.cpp +150 -0
- package/cpp/llama.cpp/src/models/olmoe.cpp +124 -0
- package/cpp/llama.cpp/src/models/openai-moe-iswa.cpp +124 -0
- package/cpp/llama.cpp/src/models/openelm.cpp +124 -0
- package/cpp/llama.cpp/src/models/orion.cpp +123 -0
- package/cpp/llama.cpp/src/models/pangu-embedded.cpp +121 -0
- package/cpp/llama.cpp/src/models/phi2.cpp +121 -0
- package/cpp/llama.cpp/src/models/phi3.cpp +152 -0
- package/cpp/llama.cpp/src/models/plamo.cpp +110 -0
- package/cpp/llama.cpp/src/models/plamo2.cpp +316 -0
- package/cpp/llama.cpp/src/models/plm.cpp +168 -0
- package/cpp/llama.cpp/src/models/qwen.cpp +108 -0
- package/cpp/llama.cpp/src/models/qwen2.cpp +117 -0
- package/cpp/llama.cpp/src/models/qwen2moe.cpp +151 -0
- package/cpp/llama.cpp/src/models/qwen2vl.cpp +117 -0
- package/cpp/llama.cpp/src/models/qwen3.cpp +117 -0
- package/cpp/llama.cpp/src/models/qwen3moe.cpp +124 -0
- package/cpp/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
- package/cpp/llama.cpp/src/models/qwen3vl.cpp +141 -0
- package/cpp/llama.cpp/src/models/refact.cpp +94 -0
- package/cpp/llama.cpp/src/models/rwkv6-base.cpp +162 -0
- package/cpp/llama.cpp/src/models/rwkv6.cpp +94 -0
- package/cpp/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
- package/cpp/llama.cpp/src/models/rwkv7-base.cpp +135 -0
- package/cpp/llama.cpp/src/models/rwkv7.cpp +90 -0
- package/cpp/llama.cpp/src/models/seed-oss.cpp +124 -0
- package/cpp/llama.cpp/src/models/smallthinker.cpp +120 -0
- package/cpp/llama.cpp/src/models/smollm3.cpp +128 -0
- package/cpp/llama.cpp/src/models/stablelm.cpp +146 -0
- package/cpp/llama.cpp/src/models/starcoder.cpp +100 -0
- package/cpp/llama.cpp/src/models/starcoder2.cpp +121 -0
- package/cpp/llama.cpp/src/models/t5-dec.cpp +166 -0
- package/cpp/llama.cpp/src/models/t5-enc.cpp +96 -0
- package/cpp/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
- package/cpp/llama.cpp/src/models/xverse.cpp +108 -0
- package/cpp/llama.cpp/src/unicode.cpp +77 -0
- package/cpp/llama.cpp/src/unicode.h +43 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/CMakeLists.txt +94 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/httplib.cpp +9339 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +433 -8222
- package/cpp/llama.cpp/vendor/cpp-httplib/patch-boringssl.cmake +6 -0
- package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +4179 -1900
- package/cpp/llama.cpp/vendor/minja/chat-template.hpp +9 -2
- package/cpp/llama.cpp/vendor/minja/minja.hpp +101 -22
- package/cpp/rn-completion.cpp +3 -27
- package/ios/include/chat.h +16 -3
- package/ios/include/common/minja/chat-template.hpp +9 -2
- package/ios/include/common/minja/minja.hpp +101 -22
- package/ios/include/common.h +57 -19
- package/ios/include/json-schema-to-grammar.h +2 -0
- package/ios/include/llama.h +44 -21
- package/ios/include/log.h +12 -4
- package/ios/include/sampling.h +3 -1
- package/ios/libs/llama.xcframework/Info.plist +20 -20
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6399 -5557
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +19 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-metal.h +1 -6
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +199 -6
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +44 -21
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6362 -5520
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4813 -4241
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +19 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +1 -6
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +199 -6
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +44 -21
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +10 -4
- package/cpp/llama.cpp/ggml/src/ggml-cann/Doxyfile +0 -2579
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -371
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -379
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -495
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -486
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +0 -6886
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +0 -154
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +0 -97
- package/cpp/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +0 -46
- package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +0 -171
- package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r-plus-tool_use.jinja +0 -202
- package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja +0 -156
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +0 -124
- package/cpp/llama.cpp/models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja +0 -152
- package/cpp/llama.cpp/models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja +0 -152
- package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +0 -62
- package/cpp/llama.cpp/models/templates/Qwen-Qwen2.5-7B-Instruct.jinja +0 -54
- package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +0 -85
- package/cpp/llama.cpp/models/templates/README.md +0 -25
- package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja +0 -1
- package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja +0 -1
- package/cpp/llama.cpp/models/templates/fireworks-ai-llama-3-firefunction-v2.jinja +0 -57
- package/cpp/llama.cpp/models/templates/google-gemma-2-2b-it.jinja +0 -4
- package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +0 -59
- package/cpp/llama.cpp/models/templates/llama-cpp-deepseek-r1.jinja +0 -76
- package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +0 -34
- package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.1.jinja +0 -58
- package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.2.jinja +0 -287
- package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja +0 -109
- package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja +0 -93
- package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja +0 -109
- package/cpp/llama.cpp/models/templates/microsoft-Phi-3.5-mini-instruct.jinja +0 -8
- package/cpp/llama.cpp/models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja +0 -87
- package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +0 -43
- package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +0 -331
- package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +0 -105
- package/cpp/llama.cpp/prompts/LLM-questions.txt +0 -49
- package/cpp/llama.cpp/prompts/alpaca.txt +0 -1
- package/cpp/llama.cpp/prompts/assistant.txt +0 -31
- package/cpp/llama.cpp/prompts/chat-with-baichuan.txt +0 -4
- package/cpp/llama.cpp/prompts/chat-with-bob.txt +0 -7
- package/cpp/llama.cpp/prompts/chat-with-qwen.txt +0 -1
- package/cpp/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -7
- package/cpp/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -7
- package/cpp/llama.cpp/prompts/chat.txt +0 -28
- package/cpp/llama.cpp/prompts/dan-modified.txt +0 -1
- package/cpp/llama.cpp/prompts/dan.txt +0 -1
- package/cpp/llama.cpp/prompts/mnemonics.txt +0 -93
- package/cpp/llama.cpp/prompts/parallel-questions.txt +0 -43
- package/cpp/llama.cpp/prompts/reason-act.txt +0 -18
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4247
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Resources/Info.plist +0 -32
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Resources/Info.plist +0 -32
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Resources/Info.plist +0 -32
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5561
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Info.plist +0 -35
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4246
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Info.plist +0 -35
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5558
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Info.plist +0 -32
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5520
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4243
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Info.plist +0 -32
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
@@ -29,12 +29,29 @@ if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
 from gguf.vocab import MistralTokenizerType, MistralVocab
-
-
-from mistral_common.tokens.tokenizers.
-from mistral_common.tokens.tokenizers.
-
-
+
+try:
+    from mistral_common.tokens.tokenizers.base import TokenizerVersion  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.tekken import Tekkenizer  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.sentencepiece import (  # pyright: ignore[reportMissingImports]
+        SentencePieceTokenizer,
+    )
+
+    _mistral_common_installed = True
+    _mistral_import_error_msg = ""
+except ImportError:
+    _MISTRAL_COMMON_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+    _MISTRAL_COMMON_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
+
+    _mistral_common_installed = False
+    TokenizerVersion = None
+    Tekkenizer = None
+    SentencePieceTokenizer = None
+    _mistral_import_error_msg = (
+        "Mistral format requires `mistral-common` to be installed. Please run "
+        "`pip install mistral-common[image,audio]` to install it."
+    )
 
 
 logger = logging.getLogger("hf-to-gguf")
@@ -73,10 +90,8 @@ class ModelBase:
     use_temp_file: bool
     lazy: bool
     dry_run: bool
-    part_names: list[str]
-    is_safetensors: bool
     hparams: dict[str, Any]
-
+    model_tensors: dict[str, Callable[[], Tensor]]
     gguf_writer: gguf.GGUFWriter
     model_name: str | None
     metadata_override: Path | None
@@ -93,18 +108,23 @@ class ModelBase:
     # Mistral format specifics
     is_mistral_format: bool = False
     disable_mistral_community_chat_template: bool = False
+    sentence_transformers_dense_modules: bool = False
 
     def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,
                  split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
                  small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
-                 disable_mistral_community_chat_template: bool = False):
+                 disable_mistral_community_chat_template: bool = False,
+                 sentence_transformers_dense_modules: bool = False):
         if type(self) is ModelBase or \
                 type(self) is TextModel or \
                 type(self) is MmprojModel:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
 
+        if self.is_mistral_format and not _mistral_common_installed:
+            raise ImportError(_mistral_import_error_msg)
+
         self.dir_model = dir_model
         self.ftype = ftype
         self.fname_out = fname_out
@@ -114,25 +134,9 @@ class ModelBase:
         self.lazy = not eager or (remote_hf_model_id is not None)
         self.dry_run = dry_run
         self.remote_hf_model_id = remote_hf_model_id
-        if remote_hf_model_id is not None:
-            self.is_safetensors = True
-
-            def get_remote_tensors() -> Iterator[tuple[str, Tensor]]:
-                logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}")
-                remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id)
-                self.tensor_names = set(name for name in remote_tensors.keys())
-                for name, remote_tensor in remote_tensors.items():
-                    yield (name, LazyTorchTensor.from_remote_tensor(remote_tensor))
-
-            self.get_tensors = get_remote_tensors
-        else:
-            prefix = "model" if not self.is_mistral_format else "consolidated"
-            self.part_names = ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors")
-            self.is_safetensors = len(self.part_names) > 0
-            if not self.is_safetensors:
-                self.part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
+        self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
         self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams
-        self.
+        self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id)
         self.metadata_override = metadata_override
         self.model_name = model_name
         self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py
@@ -148,6 +152,8 @@ class ModelBase:
                 logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
                 self.ftype = gguf.LlamaFileType.MOSTLY_BF16
 
+        self.dequant_model()
+
         # Configure GGUF Writer
         self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
                                            split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)
@@ -169,67 +175,292 @@ class ModelBase:
                 return None
         raise KeyError(f"could not find any of: {keys}")
 
-    def
-
+    def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]:
+        tensors: dict[str, Callable[[], Tensor]] = {}
+
+        if remote_hf_model_id is not None:
+            is_safetensors = True
+
+            logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}")
+            remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id)
+            for name, remote_tensor in remote_tensors.items():
+                tensors[name] = lambda r=remote_tensor: LazyTorchTensor.from_remote_tensor(r)
+
+            return tensors
+
+        prefix = "model" if not self.is_mistral_format else "consolidated"
+        part_names: set[str] = set(ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors"))
+        is_safetensors: bool = len(part_names) > 0
+        if not is_safetensors:
+            part_names = set(ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin"))
+
+        tensor_names_from_index: set[str] = set()
 
         if not self.is_mistral_format:
-            index_name = "model.safetensors" if
+            index_name = "model.safetensors" if is_safetensors else "pytorch_model.bin"
             index_name += ".index.json"
             index_file = self.dir_model / index_name
 
             if index_file.is_file():
-                self.tensor_names = set()
                 logger.info(f"gguf: loading model weight map from '{index_name}'")
                 with open(index_file, "r", encoding="utf-8") as f:
                     index: dict[str, Any] = json.load(f)
                     weight_map = index.get("weight_map")
                     if weight_map is None or not isinstance(weight_map, dict):
                         raise ValueError(f"Can't load 'weight_map' from {index_name!r}")
-
+                    tensor_names_from_index.update(weight_map.keys())
+                    part_names |= set(weight_map.values())
             else:
-                self.tensor_names = tensor_names_from_parts
                 weight_map = {}
         else:
-            self.tensor_names = tensor_names_from_parts
             weight_map = {}
 
-        for part_name in
-            logger.info(f"gguf:
+        for part_name in part_names:
+            logger.info(f"gguf: indexing model part '{part_name}'")
             ctx: ContextManager[Any]
-            if
-
-                ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
+            if is_safetensors:
+                ctx = cast(ContextManager[Any], gguf.utility.SafetensorsLocal(self.dir_model / part_name))
             else:
                 ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))
 
             with ctx as model_part:
-
+                assert model_part is not None
 
                 for name in model_part.keys():
-                    if
+                    if is_safetensors:
+                        data: gguf.utility.LocalTensor = model_part[name]
                         if self.lazy:
-
-                            data = LazyTorchTensor.from_safetensors_slice(data)
+                            data_gen = lambda data=data: LazyTorchTensor.from_local_tensor(data)  # noqa: E731
                         else:
-
+                            dtype = LazyTorchTensor._dtype_str_map[data.dtype]
+                            data_gen = lambda data=data, dtype=dtype: torch.from_numpy(data.mmap_bytes()).view(dtype).reshape(data.shape)  # noqa: E731
                     else:
-
+                        data_torch: Tensor = model_part[name]
                         if self.lazy:
-
-
+                            data_gen = lambda data=data_torch: LazyTorchTensor.from_eager(data)  # noqa: E731
+                        else:
+                            data_gen = lambda data=data_torch: data  # noqa: E731
+                    tensors[name] = data_gen
 
         # verify tensor name presence and identify potentially missing files
-        if len(
-
-
-
-
-
-
+        if len(tensor_names_from_index) > 0:
+            tensor_names_from_parts = set(tensors.keys())
+            if len(tensor_names_from_parts.symmetric_difference(tensor_names_from_index)) > 0:
+                missing = sorted(tensor_names_from_index.difference(tensor_names_from_parts))
+                extra = sorted(tensor_names_from_parts.difference(tensor_names_from_index))
+                missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
+                if len(extra) == 0 and len(missing_files) > 0:
+                    raise ValueError(f"Missing or incomplete model files: {missing_files}\n"
+                                     f"Missing tensors: {missing}")
+                else:
+                    raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
+                                     f"Missing tensors: {missing}\n"
+                                     f"Extra tensors: {extra}")
+
+        return tensors
+
+    def dequant_model(self):
+        tensors_to_remove: list[str] = []
+        new_tensors: dict[str, Callable[[], Tensor]] = {}
+
+        if (quant_config := self.hparams.get("quantization_config")) and isinstance(quant_config, dict):
+            quant_method = quant_config.get("quant_method")
+
+            def dequant_bitnet(weight: Tensor, scale: Tensor) -> Tensor:
+                weight = weight.view(torch.uint8)
+                orig_shape = weight.shape
+
+                shift = torch.tensor([0, 2, 4, 6], dtype=torch.uint8).reshape((4, *(1 for _ in range(len(orig_shape)))))
+                data = weight.unsqueeze(0).expand((4, *orig_shape)) >> shift
+                data = data & 3
+                data = (data.float() - 1).reshape((orig_shape[0] * 4, *orig_shape[1:]))
+
+                # The scale is inverted
+                return data / scale.float()
+
+            def dequant_simple(weight: Tensor, scale: Tensor, block_size: Sequence[int] | None = None) -> Tensor:
+                scale = scale.float()
+
+                if block_size is not None:
+                    for i, size in enumerate(block_size):
+                        scale = scale.repeat_interleave(size, i)
+                    # unpad the scale (e.g. when the tensor size isn't a multiple of the block size)
+                    scale = scale[tuple(slice(0, size) for size in weight.shape)]
+
+                return weight.float() * scale
+
+            # ref: https://github.com/ModelCloud/GPTQModel/blob/037c5c0f6c9e33c500d975b038d02e7ca437546d/gptqmodel/nn_modules/qlinear/__init__.py#L437-L476
+            def dequant_gptq(g_idx: Tensor, qweight: Tensor, qzeros: Tensor, scales: Tensor) -> Tensor:
+                bits = quant_config["bits"]
+                assert bits in (2, 3, 4, 8)
+                assert qweight.dtype == qzeros.dtype
+                maxq = (2 ** bits) - 1
+                weight = None
+                zeros = None
+                pack_dtype_bits = qweight.dtype.itemsize * 8
+
+                if bits in [2, 4, 8]:
+                    pack_factor = pack_dtype_bits // bits
+                    wf = torch.tensor(list(range(0, pack_dtype_bits, bits)), dtype=torch.int32).unsqueeze(0)
+                    if self.lazy:
+                        wf = LazyTorchTensor.from_eager(wf)
+
+                    zeros = torch.bitwise_right_shift(
+                        qzeros.unsqueeze(2).expand(-1, -1, pack_factor),
+                        wf.unsqueeze(0)
+                    ).to(torch.int16 if bits == 8 else torch.int8)
+                    zeros = torch.bitwise_and(zeros, maxq).reshape(scales.shape)
+
+                    weight = torch.bitwise_and(
+                        torch.bitwise_right_shift(
+                            qweight.unsqueeze(1).expand(-1, pack_factor, -1),
+                            wf.unsqueeze(-1)
+                        ).to(torch.int16 if bits == 8 else torch.int8),
+                        maxq
+                    )
+                elif bits == 3:
+                    raise NotImplementedError("3-bit gptq dequantization is not yet implemented")
+
+                assert weight is not None
+                assert zeros is not None
+
+                weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2])
+
+                # gptq_v2 doesn't need to offset zeros
+                if quant_config.get("checkpoint_format", "gptq") == "gptq":
+                    zeros += 1
+
+                return (scales[g_idx].float() * (weight - zeros[g_idx]).float()).T
+
+            def dequant_packed(w: Tensor, scale: Tensor, shape_tensor: Tensor, zero_point: Tensor | None, num_bits: int, group_size: int):
+                assert w.dtype == torch.int32
+                shape = tuple(shape_tensor.tolist())
+                assert len(shape) == 2
+                mask = (1 << num_bits) - 1
+
+                shifts = torch.arange(0, 32 - (num_bits - 1), num_bits, dtype=torch.int32)
+                if self.lazy:
+                    shifts = LazyTorchTensor.from_eager(shifts)
+
+                if zero_point is None:
+                    offset = 1 << (num_bits - 1)
+                else:
+                    assert len(zero_point.shape) == 2
+                    offset = (zero_point.unsqueeze(1) >> shifts.reshape(1, -1, 1)) & mask
+                    offset = offset.reshape(-1, zero_point.shape[1])
+                    # trim padding, and prepare for broadcast
+                    # NOTE: the zero-point is packed along dim 0
+                    offset = offset[:shape[0], :].unsqueeze(-1)
+
+                # extract values
+                # NOTE: the weights are packed along dim 1
+                unpacked = (w.unsqueeze(-1) >> shifts.reshape(1, 1, -1)) & mask
+                unpacked = unpacked.reshape(shape[0], -1)
+
+                # trim padding
+                unpacked = unpacked[:, :shape[1]]
+
+                # prepare for broadcast of the scale
+                unpacked = unpacked.reshape(shape[0], (unpacked.shape[-1] + group_size - 1) // group_size, group_size)
+                unpacked = unpacked - offset
+
+                return (unpacked * scale.unsqueeze(-1).float()).reshape(shape)
+
+            if quant_method == "bitnet":
+                for name in self.model_tensors.keys():
+                    if name.endswith(".weight_scale"):
+                        weight_name = name.removesuffix("_scale")
+                        w = self.model_tensors[weight_name]
+                        s = self.model_tensors[name]
+                        self.model_tensors[weight_name] = lambda w=w, s=s: dequant_bitnet(w(), s())
+                        tensors_to_remove.append(name)
+            elif quant_method == "fp8":
+                block_size = quant_config.get("weight_block_size")
+                for name in self.model_tensors.keys():
+                    if name.endswith(".weight_scale_inv"):
+                        weight_name = name.removesuffix("_scale_inv")
+                        w = self.model_tensors[weight_name]
+                        s = self.model_tensors[name]
+                        self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
+                        tensors_to_remove.append(name)
+            elif quant_method == "gptq":
+                for name in self.model_tensors.keys():
+                    if name.endswith(".qweight"):
+                        base_name = name.removesuffix(".qweight")
+                        g_idx = self.model_tensors[base_name + ".g_idx"]
+                        qweight = self.model_tensors[base_name + ".qweight"]
+                        qzeros = self.model_tensors[base_name + ".qzeros"]
+                        scales = self.model_tensors[base_name + ".scales"]
+                        new_tensors[base_name + ".weight"] = (
+                            lambda g=g_idx, z=qzeros, w=qweight, s=scales: dequant_gptq(
+                                g(), w(), z(), s()
+                            )
+                        )
+                        tensors_to_remove += [
+                            base_name + n
+                            for n in (
+                                ".g_idx",
+                                ".qzeros",
+                                ".qweight",
+                                ".scales",
+                            )
+                        ]
+            elif quant_method == "compressed-tensors":
+                quant_format = quant_config["format"]
+                groups = quant_config["config_groups"]
+                if len(groups) > 1:
+                    raise NotImplementedError("Can't handle multiple config groups for compressed-tensors yet")
+                weight_config = tuple(groups.values())[0]["weights"]
+
+                if quant_format == "float-quantized" or quant_format == "int-quantized" or quant_format == "naive-quantized":
+                    block_size = weight_config.get("block_structure", None)
+                    strategy = weight_config.get("strategy")
+                    assert strategy == "channel" or strategy == "block"
+                    assert weight_config.get("group_size") is None  # didn't find a model using this yet
+                    for name in self.model_tensors.keys():
+                        if name.endswith(".weight_scale"):
+                            weight_name = name.removesuffix("_scale")
+                            w = self.model_tensors[weight_name]
+                            s = self.model_tensors[name]
+                            self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), block_size)
+                            tensors_to_remove.append(name)
+                elif quant_format == "pack-quantized":
+                    assert weight_config.get("strategy") == "group"
+                    assert weight_config.get("type", "int") == "int"
+                    num_bits = weight_config.get("num_bits")
+                    group_size = weight_config.get("group_size")
+                    assert isinstance(num_bits, int)
+                    assert isinstance(group_size, int)
+                    for name in self.model_tensors.keys():
+                        if name.endswith(".weight_packed"):
+                            base_name = name.removesuffix("_packed")
+                            w = self.model_tensors[name]
+                            scale = self.model_tensors[base_name + "_scale"]
+                            shape = self.model_tensors[base_name + "_shape"]
+                            zero_point = self.model_tensors.get(base_name + "_zero_point", lambda: None)
+                            new_tensors[base_name] = (
+                                lambda w=w, scale=scale, shape=shape, zero_point=zero_point: dequant_packed(
+                                    w(), scale(), shape(), zero_point(), num_bits, group_size,
+                                )
+                            )
+                            tensors_to_remove += [base_name + n for n in ("_packed", "_shape", "_scale")]
+                            if (base_name + "_zero_point") in self.model_tensors:
+                                tensors_to_remove.append(base_name + "_zero_point")
+                else:
+                    raise NotImplementedError(f"Quant format {quant_format!r} for method {quant_method!r} is not yet supported")
             else:
-                raise
-
-
+                raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}")
+
+        for name in tensors_to_remove:
+            if name in self.model_tensors:
+                del self.model_tensors[name]
+
+        for name, value in new_tensors.items():
+            self.model_tensors[name] = value
+
+    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
+        for name, gen in self.model_tensors.items():
+            yield name, gen()
 
     def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
         if key not in gguf.MODEL_TENSORS[self.model_arch]:
@@ -302,10 +533,6 @@ class ModelBase:
                 # data = data_torch.squeeze().numpy()
                 data = data_torch.numpy()
 
-                # if data ends up empty, it means data_torch was a scalar tensor -> restore
-                if len(data.shape) == 0:
-                    data = data_torch.numpy()
-
                 n_dims = len(data.shape)
                 data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)
 
@@ -592,6 +819,21 @@ class TextModel(ModelBase):
         if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
             self.gguf_writer.add_expert_used_count(n_experts_used)
             logger.info(f"gguf: experts used count = {n_experts_used}")
+        if (n_expert_groups := self.hparams.get("n_group")) is not None:
+            self.gguf_writer.add_expert_group_count(n_expert_groups)
+            logger.info(f"gguf: expert groups count = {n_expert_groups}")
+        if (n_group_used := self.hparams.get("topk_group")) is not None:
+            self.gguf_writer.add_expert_group_used_count(n_group_used)
+            logger.info(f"gguf: expert groups used count = {n_group_used}")
+
+        if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func"], optional=True)) is not None:
+            if score_func == "sigmoid":
+                self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+            elif score_func == "softmax":
+                self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
+            else:
+                raise ValueError(f"Unsupported expert score gating function value: {score_func}")
+            logger.info(f"gguf: expert score gating function = {score_func}")
 
         if (head_dim := self.hparams.get("head_dim")) is not None:
             self.gguf_writer.add_key_length(head_dim)
@@ -739,6 +981,9 @@ class TextModel(ModelBase):
         if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
             # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
             res = "qwen2"
+        if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273":
+            # ref: https://huggingface.co/alvarobartt/grok-2-tokenizer
+            res = "grok-2"
         if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
             res = "llama-bpe"
@@ -889,6 +1134,18 @@ class TextModel(ModelBase):
         if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
             # ref: https://huggingface.co/JetBrains/Mellum-4b-base
             res = "mellum"
+        if chkhsh == "49fc0303c9e0d2c2c565c510f64b2d9b271276acdcdadff733249eda9f7d59df":
+            # ref: https://huggingface.co/arcee-ai/Trinity-Tokenizer
+            res = "afmoe"
+        if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206":
+            # ref: https://huggingface.co/inclusionAI/Ling-mini-base-2.0
+            res = "bailingmoe2"
+        if chkhsh == "53e325976a6e142379c19b09afcae354f2f496f147afa8f9e189a33fe4e3024e":
+            # ref: https://huggingface.co/ibm-granite/granite-docling-258M
+            res = "granite-docling"
+        if chkhsh == "f4f37b6c8eb9ea29b3eac6bb8c8487c5ab7885f8d8022e67edc1c68ce8403e95":
+            # ref: https://huggingface.co/MiniMaxAI/MiniMax-M2
+            res = "minimax-m2"
 
         if res is None:
             logger.warning("\n")
@@ -1323,6 +1580,7 @@ class MmprojModel(ModelBase):
         self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
 
         # load preprocessor config
+        self.preprocessor_config = {}
         if not self.is_mistral_format:
             with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
                 self.preprocessor_config = json.load(f)
@@ -1337,6 +1595,17 @@ class MmprojModel(ModelBase):
     def set_type(self):
         self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
 
+    def prepare_metadata(self, vocab_only: bool):
+        super().prepare_metadata(vocab_only=vocab_only)
+
+        output_type: str = self.ftype.name.partition("_")[2]
+
+        if self.fname_out.is_dir():
+            fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=output_type, model_type=None)
+            self.fname_out = self.fname_out / f"mmproj-{fname_default}.gguf"
+        else:
+            self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
+
     def set_gguf_parameters(self):
         self.gguf_writer.add_file_type(self.ftype)
 
@@ -1345,16 +1614,17 @@ class MmprojModel(ModelBase):
         self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
 
         # vision config
-        self.
+        self.image_size = self.find_vparam(["image_size"])
+        self.gguf_writer.add_vision_image_size(self.image_size)
         self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
         self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
         self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
         self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
-        self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))
+        self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads"]))
 
         # preprocessor config
-        image_mean =
-        image_std =
+        image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
+        image_std = _MISTRAL_COMMON_DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"]
 
         self.gguf_writer.add_vision_image_mean(image_mean)
         self.gguf_writer.add_vision_image_std(image_std)
@@ -1403,11 +1673,9 @@ class GPTNeoXModel(TextModel):
     model_arch = gguf.MODEL_ARCH.GPTNEOX
 
     def set_gguf_parameters(self):
-        block_count = self.hparams["num_hidden_layers"]
-
         self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
         self.gguf_writer.add_rope_dimension_count(
             int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])),
@@ -1465,7 +1733,7 @@ class BloomModel(TextModel):
         self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
         self.gguf_writer.add_embedding_length(n_embed)
         self.gguf_writer.add_feed_forward_length(4 * n_embed)
-        self.gguf_writer.add_block_count(self.
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_head_count(n_head)
         self.gguf_writer.add_head_count_kv(n_head)
         self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
@@ -1528,10 +1796,9 @@ class MPTModel(TextModel):
         self.gguf_writer.add_unk_token_id(0)
 
     def set_gguf_parameters(self):
-        block_count = self.hparams["n_layers"]
         self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
         self.gguf_writer.add_embedding_length(self.hparams["d_model"])
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"])
         self.gguf_writer.add_head_count(self.hparams["n_heads"])
         if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"):
@@ -1564,7 +1831,6 @@ class OrionModel(TextModel):
         self._set_vocab_sentencepiece()
 
     def set_gguf_parameters(self):
-        block_count = self.hparams["num_hidden_layers"]
         head_count = self.hparams["num_attention_heads"]
         head_count_kv = self.hparams.get("num_key_value_heads", head_count)
 
@@ -1582,7 +1848,7 @@ class OrionModel(TextModel):
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
         self.gguf_writer.add_head_count(head_count)
         self.gguf_writer.add_head_count_kv(head_count_kv)
@@ -1599,7 +1865,6 @@ class BaichuanModel(TextModel):
         self._set_vocab_sentencepiece()
 
     def set_gguf_parameters(self):
-        block_count = self.hparams["num_hidden_layers"]
         head_count = self.hparams["num_attention_heads"]
         head_count_kv = self.hparams.get("num_key_value_heads", head_count)
 
@@ -1616,7 +1881,7 @@ class BaichuanModel(TextModel):
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
         self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
         self.gguf_writer.add_head_count(head_count)
@@ -1723,7 +1988,6 @@ class XverseModel(TextModel):
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
-        block_count = self.hparams["num_hidden_layers"]
        head_count = self.hparams["num_attention_heads"]
        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
 
@@ -1740,7 +2004,7 @@ class XverseModel(TextModel):
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
         self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
         self.gguf_writer.add_head_count(head_count)
@@ -1783,10 +2047,6 @@ class FalconModel(TextModel):
     model_arch = gguf.MODEL_ARCH.FALCON
 
     def set_gguf_parameters(self):
-        block_count = self.hparams.get("num_hidden_layers")
-        if block_count is None:
-            block_count = self.hparams["n_layer"]  # old name
-
         n_head = self.hparams.get("num_attention_heads")
         if n_head is None:
             n_head = self.hparams["n_head"]  # old name
@@ -1799,7 +2059,7 @@ class FalconModel(TextModel):
         self.gguf_writer.add_tensor_data_layout("jploski")  # qkv tensor transform
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
         self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_head_count(n_head)
         self.gguf_writer.add_head_count_kv(n_head_kv)
         self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
@@ -1837,12 +2097,10 @@ class StarCoderModel(TextModel):
     model_arch = gguf.MODEL_ARCH.STARCODER
 
    def set_gguf_parameters(self):
-        block_count = self.hparams["n_layer"]
-
         self.gguf_writer.add_context_length(self.hparams["n_positions"])
         self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
         self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_head_count(self.hparams["n_head"])
         self.gguf_writer.add_head_count_kv(1)
         self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
@@ -1872,14 +2130,12 @@ class RefactModel(TextModel):
         multiple_of = 256
         ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
 
-        block_count = self.hparams["n_layer"]
-
         # refact uses Alibi. So this is from config.json which might be used by training.
         self.gguf_writer.add_context_length(self.hparams["n_positions"])
         self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
 
         self.gguf_writer.add_feed_forward_length(ff_dim)
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_head_count(self.hparams["n_head"])
         self.gguf_writer.add_head_count_kv(1)
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
@@ -1926,11 +2182,10 @@ class StableLMModel(TextModel):
 
     def set_gguf_parameters(self):
         hparams = self.hparams
-        block_count = hparams["num_hidden_layers"]
 
         self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
         rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
         self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
@@ -2023,6 +2278,9 @@ class LlamaModel(TextModel):
         self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
 
     def _set_vocab_mistral(self):
+        if not _mistral_common_installed:
+            raise ImportError(_mistral_import_error_msg)
+
         vocab = MistralVocab(self.dir_model)
         logger.info(
             f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."
@@ -2273,24 +2531,93 @@ class ArceeModel(LlamaModel):
             self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
 
 
+@ModelBase.register("AfmoeForCausalLM")
+class AfmoeModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.AFMOE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        # MoE parameters
+        if (n_experts := self.hparams.get("num_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+        if (n_shared_experts := self.hparams.get("num_shared_experts")) is not None:
+            self.gguf_writer.add_expert_shared_count(n_shared_experts)
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+        if (n_dense_layers := self.hparams.get("num_dense_layers")) is not None:
+            self.gguf_writer.add_leading_dense_block_count(n_dense_layers)
+
+        # Route normalization and scaling
+        if (route_norm := self.hparams.get("route_norm")) is not None:
+            self.gguf_writer.add_expert_weights_norm(route_norm)
+        if (route_scale := self.hparams.get("route_scale")) is not None:
+            self.gguf_writer.add_expert_weights_scale(route_scale)
+
+        # Sliding window attention
+        if (sliding_window := self.hparams.get("sliding_window")) is not None:
+            self.gguf_writer.add_sliding_window(sliding_window)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Handle expert weights - they're already merged in the HF format
+        # process the experts separately
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["gate_proj", "up_proj", "down_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename_to_retrieve])
+                        del self._experts[bid][ename_to_retrieve]
+
+                    data_torch = torch.stack(datas, dim=0)
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+                    new_name = self.map_tensor_name(merged_name)
+                    tensors.append((new_name, data_torch))
+
+                return tensors
+            else:
+                return []
+
+        if name.endswith(".expert_bias"):
+            name = name.replace(".expert_bias", ".expert_bias.bias")
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 @ModelBase.register(
     "LlavaForConditionalGeneration", # pixtral
     "Mistral3ForConditionalGeneration", # mistral small 3.1
 )
 class LlavaVisionModel(MmprojModel):
     img_break_tok_id = -1
+    use_break_tok = True
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         if self.hparams.get("model_type") == "pixtral":
             # layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py
             self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5)
-            self.img_break_tok_id = self.get_token_id("[IMG_BREAK]")
+            if self.use_break_tok:
+                self.img_break_tok_id = self.get_token_id("[IMG_BREAK]")
         elif self.is_mistral_format:
             # hparams is already vision config here so norm_eps is only defined in global_config.
             self.hparams["norm_eps"] = self.global_config.get("norm_eps", None)
             assert self.hparams["norm_eps"] is not None, "norm_eps not found in params.json"
-            self.img_break_tok_id = self.find_vparam(["image_break_token_id"])
+            if self.use_break_tok:
+                self.img_break_tok_id = self.find_vparam(["image_break_token_id"])
         else:
             raise ValueError(f"Unsupported model type: {self.hparams['model_type']}")
         logger.info(f"Image break token id: {self.img_break_tok_id}")
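Note on the hunk above: `AfmoeModel.modify_tensors` buffers each expert's 2D projection until a layer has all `n_experts * 3` of them, then merges them per projection with `torch.stack`. A minimal sketch of that merge step, using toy shapes and a hypothetical layer-0 naming scheme:

```python
# Sketch only, not part of the diff: merging per-expert 2D weights into one
# 3D tensor, as AfmoeModel.modify_tensors does above. Shapes are toy values.
import torch

n_experts, n_ff, n_embd = 4, 8, 16
experts = {
    f"model.layers.0.mlp.experts.{xid}.gate_proj.weight": torch.randn(n_ff, n_embd)
    for xid in range(n_experts)
}

datas = [experts[f"model.layers.0.mlp.experts.{xid}.gate_proj.weight"]
         for xid in range(n_experts)]
merged = torch.stack(datas, dim=0)  # -> (n_experts, n_ff, n_embd)
assert merged.shape == (n_experts, n_ff, n_embd)
```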
@@ -2376,6 +2703,10 @@ class SmolVLMModel(MmprojModel):
         self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
         self.gguf_writer.add_vision_use_gelu(True)
 
+        # Add the preprocessor longest edge size
+        preproc_image_size = self.preprocessor_config.get("size", {}).get("longest_edge", self.image_size)
+        self.gguf_writer.add_vision_preproc_image_size(preproc_image_size)
+
     def tensor_force_quant(self, name, new_name, bid, n_dims):
         if ".embeddings." in name:
             return gguf.GGMLQuantizationType.F32
@@ -2391,7 +2722,10 @@ class SmolVLMModel(MmprojModel):
         return [] # skip other tensors
 
 
-@ModelBase.register("Llama4ForConditionalGeneration")
+@ModelBase.register(
+    "Llama4ForConditionalGeneration",
+    "Llama4ForCausalLM",
+)
 class Llama4Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.LLAMA4
     undo_permute = False
@@ -2409,6 +2743,10 @@ class Llama4Model(LlamaModel):
         super().set_gguf_parameters()
         self.gguf_writer.add_interleave_moe_layer_step(self.hparams["interleave_moe_layer_step"])
         self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])
+        if "layer_types" in self.hparams:
+            if all(lt == "full_attention" for lt in self.hparams["layer_types"]):
+                # all layers are full attention (for MobileLLM), disable swa
+                self.gguf_writer.add_sliding_window(0)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
         if name.startswith("language_model."):
@@ -2686,12 +3024,20 @@ class BitnetModel(TextModel):
         yield (new_name, data_torch)
 
 
-@ModelBase.register("GrokForCausalLM")
+@ModelBase.register("GrokForCausalLM", "Grok1ForCausalLM")
 class GrokModel(TextModel):
     model_arch = gguf.MODEL_ARCH.GROK
 
     def set_vocab(self):
-        self._set_vocab_sentencepiece()
+        if (self.dir_model / 'tokenizer.model').is_file():
+            self._set_vocab_sentencepiece()
+            return
+
+        if not (self.dir_model / 'tokenizer.json').is_file() or not (self.dir_model / 'chat_template.jinja').is_file():
+            logger.error('Error: Missing vocab and chat template, download files from https://huggingface.co/alvarobartt/grok-2-tokenizer')
+            sys.exit(1)
+
+        self._set_vocab_gpt2()
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -2699,11 +3045,46 @@ class GrokModel(TextModel):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
 
-    _experts: list[dict[str, Tensor]] | None = None
+        self.gguf_writer.add_attn_logit_softcapping(self.hparams.get("attn_logit_softcapping", 30.0))
+        self.gguf_writer.add_router_logit_softcapping(self.hparams.get("router_logit_softcapping", 30.0))
+        if (final_logit_softcap := self.hparams.get("final_logit_softcapping")):
+            self.gguf_writer.add_final_logit_softcapping(final_logit_softcap)
+
+        if (rope_dim := self.hparams.get("head_dim")) is None:
+            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+
+        # Treat "original" as "yarn", seems to have been a mistake
+        if self.hparams.get("rope_type") in ("yarn", "original"):
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(self.hparams["scaling_factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["original_max_position_embeddings"])
+            self.gguf_writer.add_rope_scaling_yarn_ext_factor(self.hparams["extrapolation_factor"])
+            self.gguf_writer.add_rope_scaling_yarn_attn_factor(self.hparams["attn_factor"])
+            self.gguf_writer.add_rope_scaling_yarn_beta_fast(self.hparams["beta_fast"])
+            self.gguf_writer.add_rope_scaling_yarn_beta_slow(self.hparams["beta_slow"])
+
+        if temp_len := self.hparams.get("attn_temperature_len"):
+            self.gguf_writer.add_attn_temperature_length(temp_len)
+
+        self.gguf_writer.add_attn_output_scale(self.hparams.get("attn_output_multiplier", rope_dim**-0.5))
+        self.gguf_writer.add_embedding_scale(self.hparams["embedding_multiplier_scale"])
+        self.gguf_writer.add_logit_scale(self.hparams["output_multiplier_scale"])
+
+    _experts: list[dict[str, list[Tensor]]] | None = None
+    _cur_expert = ""
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        tensors: list[tuple[str, Tensor]] = []
+        is_expert = ".moe." in name or ".block_sparse_moe.experts." in name
+
+        if not is_expert:
+            tensors.append((self.map_tensor_name(name), data_torch))
+
         # process the experts separately
-        if name.find(".moe.") != -1:
+        if is_expert or self._cur_expert:
             n_experts = self.hparams["num_local_experts"]
 
             assert bid is not None
@@ -2711,32 +3092,41 @@ class GrokModel(TextModel):
             if self._experts is None:
                 self._experts = [{} for _ in range(self.block_count)]
 
-            self._experts[bid][name] = data_torch
-
-            if len(self._experts[bid]) >= n_experts * 3:
-                tensors: list[tuple[str, Tensor]] = []
+            # concatenate split tensors
+            if name in self._experts[bid]:
+                self._cur_expert = name
+                self._experts[bid][name].append(data_torch)
+                return []
+            elif is_expert:
+                self._cur_expert = name
+                self._experts[bid][name] = [data_torch]
+                return []
+            else:
+                self._cur_expert = ""
 
-                # merge the experts into a single 3d tensor
-                for wid in ["linear", "linear_1", "linear_v"]:
-                    datas: list[Tensor] = []
+            for bid in range(self.block_count):
+                if len(self._experts[bid]) >= n_experts * 3:
+                    # merge the experts into a single 3d tensor
+                    for wid in [("linear", "w1", 0), ("linear_1", "w2", 1), ("linear_v", "w3", 0)]:
+                        datas: list[Tensor] = []
 
-                    for xid in range(n_experts):
-                        ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
-                        datas.append(self._experts[bid][ename])
-                        del self._experts[bid][ename]
+                        for xid in range(n_experts):
+                            ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid[0]}.weight"
+                            if ename not in self._experts[bid]:
+                                ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid[1]}.weight"
+                            tensor_list = self._experts[bid][ename]
+                            datas.append(torch.cat(tensor_list, dim=wid[2]) if len(tensor_list) > 1 else tensor_list[0])
+                            del self._experts[bid][ename]
 
-                    data_torch = torch.stack(datas, dim=0)
+                        data_torch = torch.stack(datas, dim=0)
 
-                    merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight"
+                        merged_name = f"transformer.decoder_layer.{bid}.moe.{wid[0]}.weight"
 
-                    new_name = self.map_tensor_name(merged_name)
+                        new_name = self.map_tensor_name(merged_name)
 
-                    tensors.append((new_name, data_torch))
-                return tensors
-            else:
-                return []
+                        yield (new_name, data_torch)
 
-        return [(self.map_tensor_name(name), data_torch)]
+        yield from tensors
 
 
 @ModelBase.register("DbrxForCausalLM")
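Note on the hunk above: the reworked `GrokModel.modify_tensors` also handles checkpoints that ship each expert weight split into shards. Shards are first concatenated along the split axis (`wid[2]`), and only then are whole experts stacked into a 3D tensor. A sketch under toy shapes:

```python
# Sketch only, not part of the diff: concatenate shards of one expert weight,
# then stack all experts, mirroring the Grok merge logic above. Toy shapes.
import torch

shards = [torch.randn(8, 4), torch.randn(8, 4)]  # one expert, split on dim 1
full = torch.cat(shards, dim=1) if len(shards) > 1 else shards[0]  # (8, 8)
experts = [full, torch.randn(8, 8)]              # every expert now (8, 8)
stacked = torch.stack(experts, dim=0)            # (n_experts, 8, 8)
print(stacked.shape)
```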
@@ -2746,7 +3136,7 @@ class DbrxModel(TextModel):
     def set_gguf_parameters(self):
         ffn_config = self.hparams["ffn_config"]
         attn_config = self.hparams["attn_config"]
-        self.gguf_writer.add_block_count(self.hparams["n_layers"])
+        self.gguf_writer.add_block_count(self.block_count)
 
         self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
         self.gguf_writer.add_embedding_length(self.hparams["d_model"])
@@ -2948,7 +3338,7 @@ class QwenModel(TextModel):
 
     def set_gguf_parameters(self):
         self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
-        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
         self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
         self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
@@ -3605,7 +3995,43 @@ class Qwen2MoeModel(TextModel):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # process the experts separately
         name = name.replace("language_model.", "") # InternVL
-        if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector") or name.startswith("model.visual"):
+
+        # handle aggregated expert tensors
+        # GGUF stores dimensions reversed from PyTorch, so:
+        # PyTorch (A,B,C) -> GGUF writes [C,B,A] -> GGML reads ne={C,B,A}
+        # Input shapes from HF: (n_expert, n_ff_exp, n_embd) or (n_expert, n_embd, n_ff_exp)
+        # Expected GGML ne: {n_embd, n_ff_exp, n_expert} for gate/up, {n_ff_exp, n_embd, n_expert} for down
+        if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"):
+            mapped = f"{name}.weight" if not name.endswith(".weight") else name
+            # Input: (n_expert=128, n_ff_exp=768, n_embd=2048)
+            # Want GGML ne: {n_ff_exp, n_embd, n_expert} = {768, 2048, 128}
+            # Need PyTorch: (128, 2048, 768) [reversed of GGML]
+            # So: permute(0, 2, 1): (128, 768, 2048) -> (128, 2048, 768)
+            permuted = data_torch.permute(0, 2, 1).contiguous()
+            return [(self.map_tensor_name(mapped), permuted)]
+
+        if name.endswith("mlp.experts.gate_up_proj") or name.endswith("mlp.experts.gate_up_proj.weight"):
+            if data_torch.ndim < 3 or data_torch.shape[-1] % 2 != 0:
+                raise ValueError(f"Unexpected gate_up_proj shape for {name}: {tuple(data_torch.shape)}")
+            split_dim = data_torch.shape[-1] // 2
+            gate = data_torch[..., :split_dim].contiguous()
+            up = data_torch[..., split_dim:].contiguous()
+            # Input gate/up: (n_expert=128, n_embd=2048, n_ff_exp=768)
+            # Want GGML ne: {n_embd, n_ff_exp, n_expert} = {2048, 768, 128}
+            # Need PyTorch: (128, 768, 2048) [reversed of GGML]
+            # So: permute(0, 2, 1): (128, 2048, 768) -> (128, 768, 2048)
+            base_name = name.removesuffix(".weight")
+            base = base_name.rsplit('.', 1)[0]
+            mapped_gate = f"{base}.gate_proj.weight"
+            mapped_up = f"{base}.up_proj.weight"
+            perm_gate = gate.permute(0, 2, 1).contiguous()
+            perm_up = up.permute(0, 2, 1).contiguous()
+            return [
+                (self.map_tensor_name(mapped_gate), perm_gate),
+                (self.map_tensor_name(mapped_up), perm_up),
+            ]
+
+        if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector") or name.startswith("model.visual"):
             # skip visual tensors
             return []
         if name.find("experts") != -1:
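Note on the hunk above: the comments describe the axis bookkeeping for aggregated expert tensors. A sketch with the same (hypothetical) 128-expert shapes, showing both the `permute` and the fused gate/up split:

```python
# Sketch only, not part of the diff: GGUF serializes dimensions in reverse
# PyTorch order, so tensors are permuted before writing. Toy shapes.
import torch

n_expert, n_ff, n_embd = 128, 768, 2048
down_proj = torch.randn(n_expert, n_ff, n_embd)
permuted = down_proj.permute(0, 2, 1).contiguous()     # -> (128, 2048, 768)

gate_up = torch.randn(n_expert, n_embd, 2 * n_ff)      # fused gate+up weights
split = gate_up.shape[-1] // 2
gate, up = gate_up[..., :split], gate_up[..., split:]  # each (128, 2048, 768)
print(permuted.shape, gate.shape, up.shape)
```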
@@ -3656,11 +4082,29 @@ class Qwen2MoeModel(TextModel):
 class Qwen3Model(Qwen2Model):
     model_arch = gguf.MODEL_ARCH.QWEN3
 
+    # extra logic for rerank models
+    is_rerank: bool = False
+    is_tied_embeddings: bool = False
+    token_false_id: int | None = None
+    token_true_id: int | None = None
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
+
+        # track for intern-s1-mini
         hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
         self.origin_hf_arch = hparams.get('architectures', [None])[0]
 
+        # a bit hacky, but currently the only way to detect if this is a rerank model
+        # ref: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B
+        readme_path = self.dir_model / "README.md"
+        readme_text = ""
+        if readme_path.exists():
+            with readme_path.open("r", encoding="utf-8") as f:
+                readme_text = f.read()
+        if "# Qwen3-Reranker" in readme_text:
+            self._find_rerank_config()
+
     def set_vocab(self):
         # deal with intern-s1-mini
         if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
@@ -3669,6 +4113,57 @@ class Qwen3Model(Qwen2Model):
 
         super().set_vocab()
 
+    def _find_rerank_config(self):
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+
+        self.is_rerank = True
+        self.is_tied_embeddings = self.hparams.get("tie_word_embeddings", False)
+        self.token_false_id = tokenizer.convert_tokens_to_ids("no")
+        self.token_true_id = tokenizer.convert_tokens_to_ids("yes")
+        self.sep_token_id = tokenizer.convert_tokens_to_ids("|")
+
+        assert self.token_false_id is not None and self.token_true_id is not None
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if self.is_rerank:
+            self.gguf_writer.add_pooling_type(gguf.PoolingType.RANK)
+            self.gguf_writer.add_classifier_output_labels(["yes", "no"])
+            self.gguf_writer.add_chat_template([{
+                "name": "rerank",
+                "template": "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n"
+                            "<|im_start|>user\n<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n<Query>: {query}\n<Document>: {document}<|im_end|>\n"
+                            "<|im_start|>assistant\n<think>\n\n</think>\n\n"
+            }])
+
+    def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor:
+        # extract "yes" and "no" tokens from the output lm_head tensor
+        false_row = data_torch[self.token_false_id]
+        true_row = data_torch[self.token_true_id]
+        return torch.stack([true_row, false_row], dim=0)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if "model.vision_" in name:
+            # skip multimodal tensors
+            return []
+
+        if self.is_rerank:
+            is_tied_head = self.is_tied_embeddings and "embed_tokens" in name
+            is_real_head = not self.is_tied_embeddings and "lm_head" in name
+            if is_tied_head or is_real_head:
+                cls_out_head = (
+                    gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.CLS_OUT] + ".weight",
+                    self._get_cls_out_tensor(data_torch),
+                )
+                if is_tied_head:
+                    embed = (self.map_tensor_name(name), data_torch)
+                    return [cls_out_head, embed]
+                if is_real_head:
+                    return [cls_out_head]
+
+        return super().modify_tensors(data_torch, name, bid)
+
 
 @ModelBase.register("Qwen3MoeForCausalLM")
 class Qwen3MoeModel(Qwen2MoeModel):
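Note on the hunk above: the rerank support reduces the full `lm_head` to a two-row classifier head by picking out the "yes" and "no" token rows. A toy-sized sketch of the stacking done by `_get_cls_out_tensor` (the token ids here are made up; real ones come from the tokenizer):

```python
# Sketch only, not part of the diff: carve a 2-row classifier head out of an
# lm_head, as the Qwen3 reranker conversion does above. Toy sizes and ids.
import torch

vocab_size, n_embd = 32, 8
lm_head = torch.randn(vocab_size, n_embd)
token_true_id, token_false_id = 3, 7  # hypothetical ids for "yes" / "no"
cls_out = torch.stack([lm_head[token_true_id], lm_head[token_false_id]], dim=0)
assert cls_out.shape == (2, n_embd)   # rows correspond to ["yes", "no"]
```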
@@ -3688,12 +4183,193 @@ class Qwen3MoeModel(Qwen2MoeModel):
         super().set_vocab()
 
 
+@ModelBase.register("Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration")
+class Qwen3VLVisionModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        # Compute image_size if not present
+        if "image_size" not in self.hparams_vision:
+            # For Qwen3VL/Qwen3VLMoe, compute from num_position_embeddings
+            num_pos = self.hparams_vision.get("num_position_embeddings", 2304)
+            patch_size = self.hparams_vision.get("patch_size", 16)
+            # num_position_embeddings = (image_size / patch_size) ** 2
+            # So image_size = sqrt(num_position_embeddings) * patch_size
+            image_size = int(num_pos**0.5 * patch_size)
+            self.hparams_vision["image_size"] = image_size
+
+        # Rename config values for compatibility
+        self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads")
+        self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth")
+
+        self.is_deepstack_layers = [False] * int(self.hparams_vision["num_hidden_layers"] or 0)
+        for idx in self.hparams_vision.get("deepstack_visual_indexes", []):
+            self.is_deepstack_layers[idx] = True
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3VL)
+        self.gguf_writer.add_vision_use_gelu(True)
+
+        if self.hparams_vision is not None:
+            merge_size = self.hparams_vision.get("spatial_merge_size")
+            if merge_size is not None:
+                self.gguf_writer.add_vision_spatial_merge_size(int(merge_size))
+
+        # Use text config's rms_norm_eps for vision attention layernorm eps
+        rms_norm_eps = self.global_config.get("text_config", {}).get("rms_norm_eps", 1e-6)
+        self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps)
+
+        if self.is_deepstack_layers:
+            self.gguf_writer.add_vision_is_deepstack_layers(self.is_deepstack_layers)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        assert self.hparams_vision is not None
+        # Skip text model tensors - they go in the text model file
+        if name.startswith("model.language_model.") or name.startswith("lm_head."):
+            return []
+
+        if name.startswith("model.visual."):
+            name = name.replace("model.visual.", "visual.", 1)
+
+        if name.startswith("visual.deepstack_merger_list."):
+            prefix, rest = name.split(".", maxsplit=3)[2:]
+            # prefix is the layer index, convert to absolute clip layer index!
+            idx = self.hparams_vision.get("deepstack_visual_indexes", [])[int(prefix)]
+            target = rest
+
+            tensor_type: gguf.MODEL_TENSOR
+            if target.startswith("norm."):
+                tensor_type = gguf.MODEL_TENSOR.V_DS_NORM
+                suffix = target.split(".", 1)[1]
+            elif target.startswith("linear_fc1."):
+                tensor_type = gguf.MODEL_TENSOR.V_DS_FC1
+                suffix = target.split(".", 1)[1]
+            elif target.startswith("linear_fc2."):
+                tensor_type = gguf.MODEL_TENSOR.V_DS_FC2
+                suffix = target.split(".", 1)[1]
+            else:
+                raise ValueError(f"Unexpected deepstack tensor: {name}")
+
+            new_name = self.format_tensor_name(tensor_type, idx, suffix=f".{suffix}")
+            return [(new_name, data_torch)]
+
+        if name.startswith("visual.merger."):
+            suffix = name.split(".", 2)[2]
+            if suffix.startswith("linear_fc"):
+                fc_idx_str, tail = suffix.split(".", 1)
+                fc_num = int(fc_idx_str.replace("linear_fc", ""))
+                # Qwen3VL has linear_fc1 and linear_fc2
+                # Map to indices 0 and 2 (matching Qwen2VL which uses indices 0 and 2)
+                if fc_num == 1:
+                    fc_idx = 0
+                elif fc_num == 2:
+                    fc_idx = 2
+                else:
+                    raise ValueError(f"unexpected fc index {fc_num} in {name}")
+                new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, fc_idx, suffix=f".{tail}")
+            elif suffix.startswith("norm."):
+                new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_POST_NORM, suffix=f".{suffix.split('.', 1)[1]}")
+            else:
+                raise ValueError(f"Unexpected merger tensor: {name}")
+            return [(new_name, data_torch)]
+
+        if name == "visual.patch_embed.proj.weight":
+            # split Conv3D into Conv2Ds along temporal dimension
+            c1, c2, kt, _, _ = data_torch.shape
+            del c1, c2
+            if kt != 2:
+                raise ValueError("Current implementation only supports temporal_patch_size of 2")
+            return [
+                (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight", data_torch[:, :, 0, ...]),
+                (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]),
+            ]
+
+        if name == "visual.patch_embed.proj.bias":
+            # Include the bias - it's used by the C++ code
+            return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".bias", data_torch)]
+
+        if name.startswith("visual."):
+            return [(self.map_tensor_name(name), data_torch)]
+
+        # Fall back to parent class for other tensors
+        return super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("Qwen3VLForConditionalGeneration")
+class Qwen3VLTextModel(Qwen3Model):
+    model_arch = gguf.MODEL_ARCH.QWEN3VL
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
+        text_config = self.hparams.get("text_config", {})
+        # rope_scaling is deprecated in V5, use rope_parameters instead
+        rope_scaling = text_config.get("rope_scaling") or text_config.get("rope_parameters") or {}
+
+        if rope_scaling.get("mrope_section"):
+            # mrope_section contains [time, height, width] dimensions
+            mrope_section = rope_scaling["mrope_section"]
+            # Pad to 4 dimensions [time, height, width, extra]
+            while len(mrope_section) < 4:
+                mrope_section.append(0)
+            self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
+
+            logger.info(f"MRoPE sections: {mrope_section[:4]}")
+
+        vision_config = self.hparams.get("vision_config", {})
+        deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
+        self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Skip vision tensors - they go in the mmproj file
+        if name.startswith("model.visual."):
+            return []
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("Qwen3VLMoeForConditionalGeneration")
+class Qwen3VLMoeTextModel(Qwen3MoeModel):
+    model_arch = gguf.MODEL_ARCH.QWEN3VLMOE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
+        text_config = self.hparams.get("text_config", {})
+        # rope_scaling is deprecated in V5, use rope_parameters instead
+        rope_scaling = text_config.get("rope_scaling") or text_config.get("rope_parameters") or {}
+
+        if rope_scaling.get("mrope_section"):
+            # mrope_section contains [time, height, width] dimensions
+            mrope_section = rope_scaling["mrope_section"]
+            # Pad to 4 dimensions [time, height, width, extra]
+            while len(mrope_section) < 4:
+                mrope_section.append(0)
+            self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
+
+            logger.info(f"MRoPE sections: {mrope_section[:4]}")
+
+        vision_config = self.hparams.get("vision_config", {})
+        deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
+        self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Skip vision tensors - they go in the mmproj file
+        if name.startswith("model.visual."):
+            return []
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("GPT2LMHeadModel")
 class GPT2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.GPT2
 
     def set_gguf_parameters(self):
-        self.gguf_writer.add_block_count(self.hparams["n_layer"])
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_context_length(self.hparams["n_ctx"])
         self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
         self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
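Note on the hunk above: two small derivations worth seeing with concrete numbers are the `image_size` recovery from a square `num_position_embeddings` grid and the `mrope_section` padding. A sketch with toy inputs:

```python
# Sketch only, not part of the diff: the Qwen3VL derivations above with toy
# values (the defaults 2304 and 16 come from the hunk itself).

# image_size from num_position_embeddings = (image_size / patch_size) ** 2:
num_pos, patch_size = 2304, 16
image_size = int(num_pos**0.5 * patch_size)  # 48 * 16 = 768
assert image_size == 768

# mrope_section padded to [time, height, width, extra]:
mrope_section = [16, 24, 24]
while len(mrope_section) < 4:
    mrope_section.append(0)
print(mrope_section[:4])  # [16, 24, 24, 0]
```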
@@ -3725,8 +4401,6 @@ class Phi2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.PHI2
 
     def set_gguf_parameters(self):
-        block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
-
         rot_pct = self.find_hparam(["partial_rotary_factor"])
         n_embd = self.find_hparam(["hidden_size", "n_embd"])
         n_head = self.find_hparam(["num_attention_heads", "n_head"])
@@ -3735,7 +4409,7 @@ class Phi2Model(TextModel):
 
         self.gguf_writer.add_embedding_length(n_embd)
         self.gguf_writer.add_feed_forward_length(4 * n_embd)
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_head_count(n_head)
         self.gguf_writer.add_head_count_kv(n_head)
         self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"]))
@@ -3853,8 +4527,6 @@ class Phi3MiniModel(TextModel):
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
-        block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
-
         n_embd = self.find_hparam(["hidden_size", "n_embd"])
         n_head = self.find_hparam(["num_attention_heads", "n_head"])
         n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
@@ -3868,7 +4540,7 @@ class Phi3MiniModel(TextModel):
         self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
         self.gguf_writer.add_embedding_length(n_embd)
         self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"]))
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_head_count(n_head)
         self.gguf_writer.add_head_count_kv(n_head_kv)
         self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
@@ -3988,12 +4660,11 @@ class PlamoModel(TextModel):
 
     def set_gguf_parameters(self):
         hparams = self.hparams
-        block_count = hparams["num_hidden_layers"]
 
        self.gguf_writer.add_context_length(4096)  # not in config.json
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_head_count(hparams["num_attention_heads"])
         self.gguf_writer.add_head_count_kv(5)  # hparams["num_key_value_heads"]) is wrong
         self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
@@ -4116,7 +4787,6 @@ class Plamo2Model(TextModel):
 
     def set_gguf_parameters(self):
         hparams = self.hparams
-        block_count = hparams["num_hidden_layers"]
         self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
 
         # Which layers are Mamba layers
@@ -4124,27 +4794,32 @@ class Plamo2Model(TextModel):
         # This logic matches modeling_plamo.py's is_mamba function
         mamba_step = hparams.get("mamba_step", 2)
         mamba_enabled = hparams.get("mamba_enabled", True)
-        mamba_layers = []
+        num_key_value_heads = []
+        num_attention_heads = []
 
         if mamba_enabled:
-            for i in range(block_count):
-                if block_count <= (mamba_step // 2):
+            for i in range(self.block_count):
+                if self.block_count <= (mamba_step // 2):
                     # use attention in last layer
-                    is_mamba = (i != block_count - 1)
+                    is_mamba = (i != self.block_count - 1)
                 else:
                     is_mamba = (i % mamba_step) != (mamba_step // 2)
                 if is_mamba:
-                    mamba_layers.append(0)
+                    num_key_value_heads.append(0)
+                    num_attention_heads.append(0)
                 else:
-                    mamba_layers.append(hparams.get("num_key_value_heads", 4))
+                    num_key_value_heads.append(hparams.get("num_key_value_heads", 4))
+                    num_attention_heads.append(hparams.get("num_attention_heads", 32))
 
-        if mamba_layers:
-            self.gguf_writer.add_head_count_kv(mamba_layers)
+        if num_key_value_heads and num_attention_heads:
+            self.gguf_writer.add_head_count_kv(num_key_value_heads)
+            self.gguf_writer.add_head_count(num_attention_heads)
 
         self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 2048))
         self.gguf_writer.add_embedding_length(hparams.get("hidden_size", 4096))
-        self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 32))
+        self.gguf_writer.add_key_length(hparams.get("hidden_size_per_head", 128))
+        self.gguf_writer.add_value_length(hparams.get("hidden_size_per_head", 128))
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
         self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 10000))
 
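Note on the hunk above: the hybrid Mamba/attention layout is encoded as per-layer head-count lists, where 0 heads marks a Mamba layer. A sketch of the pattern for toy values of `block_count` and `mamba_step`:

```python
# Sketch only, not part of the diff: the per-layer head counts the PLaMo 2
# conversion builds above. Toy values; real ones come from config.json.
block_count, mamba_step = 8, 2
num_key_value_heads = []
for i in range(block_count):
    is_mamba = (i % mamba_step) != (mamba_step // 2)
    num_key_value_heads.append(0 if is_mamba else 4)
print(num_key_value_heads)  # [0, 4, 0, 4, 0, 4, 0, 4]
```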
|
@@ -4201,12 +4876,10 @@ class CodeShellModel(TextModel):
     model_arch = gguf.MODEL_ARCH.CODESHELL
 
     def set_gguf_parameters(self):
-        block_count = self.hparams["n_layer"]
-
         self.gguf_writer.add_context_length(self.hparams["n_positions"])
         self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
         self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_head_count(self.hparams["n_head"])
         self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"])
         self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
@@ -4215,27 +4888,6 @@ class CodeShellModel(TextModel):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
         self.gguf_writer.add_rope_scaling_factor(1.0)
 
-    _has_tok_embd = False
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        del bid  # unused
-
-        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
-        tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)
-
-        new_name = self.map_tensor_name(name)
-
-        # assuming token_embd.weight is seen before output.weight
-        if not self._has_tok_embd and new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
-            # even though the tensor file(s) does not contain the word embeddings they are still in the weight map
-            if self.tensor_names and "transformer.wte.weight" in self.tensor_names:
-                logger.debug(f"{tok_embd_name} not found before {output_name}, assuming they are tied")
-                self.tensor_names.remove("transformer.wte.weight")
-        elif new_name == tok_embd_name:
-            self._has_tok_embd = True
-
-        return [(new_name, data_torch)]
-
 
 @ModelBase.register("InternLM2ForCausalLM")
 class InternLM2Model(TextModel):
@@ -4369,7 +5021,7 @@ class InternLM2Model(TextModel):
 
     def set_gguf_parameters(self):
         self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
-        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
         self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
         self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
@@ -4990,11 +5642,10 @@ class GemmaModel(TextModel):
 
     def set_gguf_parameters(self):
         hparams = self.hparams
-        block_count = hparams["num_hidden_layers"]
 
         self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
         self.gguf_writer.add_head_count(hparams["num_attention_heads"])
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
@@ -5030,11 +5681,10 @@ class Gemma2Model(TextModel):
 
     def set_gguf_parameters(self):
         hparams = self.hparams
-        block_count = hparams["num_hidden_layers"]
 
         self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
         self.gguf_writer.add_head_count(hparams["num_attention_heads"])
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
@@ -5078,12 +5728,11 @@ class Gemma3Model(TextModel):
 
     def set_gguf_parameters(self):
         hparams = self.hparams
-        block_count = hparams["num_hidden_layers"]
 
         # some default values are not specified in the hparams
         self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072))
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
         self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8))
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6))
@@ -5126,6 +5775,80 @@ class Gemma3Model(TextModel):
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@ModelBase.register("Gemma3TextModel")
+class EmbeddingGemma(Gemma3Model):
+    model_arch = gguf.MODEL_ARCH.GEMMA_EMBEDDING
+    module_paths = []
+    dense_features_dims = {}
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if self.sentence_transformers_dense_modules:
+            # read modules.json to determine if model has Dense layers
+            modules_file = self.dir_model / "modules.json"
+            if modules_file.is_file():
+                with open(modules_file, encoding="utf-8") as modules_json_file:
+                    mods = json.load(modules_json_file)
+                    for mod in mods:
+                        if mod["type"] == "sentence_transformers.models.Dense":
+                            mod_path = mod["path"]
+                            # check if model.safetensors file for Dense layer exists
+                            model_tensors_file = self.dir_model / mod_path / "model.safetensors"
+                            if model_tensors_file.is_file():
+                                self.module_paths.append(mod_path)
+                                # read config.json of the Dense layer to get in/out features
+                                mod_conf_file = self.dir_model / mod_path / "config.json"
+                                if mod_conf_file.is_file():
+                                    with open(mod_conf_file, encoding="utf-8") as mod_conf_json_file:
+                                        mod_conf = json.load(mod_conf_json_file)
+                                        # hparams dense_2_feat_out and dense_3_feat_in are required when loading model's dense weights
+                                        prefix = self._get_dense_prefix(mod_path)
+                                        if mod_conf["in_features"] is not None and mod_conf["out_features"] is not None:
+                                            self.dense_features_dims[prefix] = (mod_conf["in_features"], mod_conf["out_features"])
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        from safetensors.torch import load_file
+        module_paths = list(self.module_paths)
+        for i, module_path in enumerate(module_paths):
+            tensors_file = self.dir_model / module_path / "model.safetensors"
+            local_tensors = load_file(tensors_file)
+            tensor_name = self._get_dense_prefix(module_path)
+            for name, local_tensor in local_tensors.items():
+                if not name.endswith(".weight"):
+                    continue
+                orig_name = name.replace("linear", tensor_name)
+                name = self.map_tensor_name(orig_name)
+                yield name, local_tensor.clone()
+
+    @staticmethod
+    def _get_dense_prefix(module_path) -> str:
+        """Get the tensor name prefix for the Dense layer from module path."""
+        tensor_name = "dense_2" if module_path == "2_Dense" else "dense_3"
+        return tensor_name
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        # Override the sliding window size as it gets adjusted by the Gemma3TextConfig
+        # constructor. We want to use the value from the original model's config.json.
+        # ref: https://github.com/huggingface/transformers/pull/40700
+        with open(self.dir_model / "config.json", "r", encoding="utf-8") as f:
+            config = json.load(f)
+        orig_sliding_window = config.get("sliding_window")
+        if orig_sliding_window is None:
+            raise ValueError("sliding_window not found in model config - this is required for the model")
+
+        logger.info(f"Using original sliding_window from config: {orig_sliding_window} "
+                    f"instead of {self.hparams['sliding_window']}")
+        self.gguf_writer.add_sliding_window(orig_sliding_window)
+        if self.sentence_transformers_dense_modules:
+            for dense, dims in self.dense_features_dims.items():
+                logger.info(f"Setting dense layer {dense} in/out features to {dims}")
+                self.gguf_writer.add_dense_features_dims(dense, dims[0], dims[1])
+
+        self._try_set_pooling_type()
+
+
 @ModelBase.register("Gemma3ForConditionalGeneration")
 class Gemma3VisionModel(MmprojModel):
     def set_gguf_parameters(self):
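Note on the hunk above: `EmbeddingGemma.generate_extra_tensors` pulls the sentence-transformers Dense weights out of sidecar safetensors files. A minimal sketch of that read, assuming the conventional "2_Dense/model.safetensors" layout and a hypothetical checkpoint directory:

```python
# Sketch only, not part of the diff: read a sentence-transformers Dense
# module the way EmbeddingGemma does above. Paths are hypothetical.
from pathlib import Path

from safetensors.torch import load_file

model_dir = Path("./embeddinggemma")  # hypothetical checkpoint directory
tensors_file = model_dir / "2_Dense" / "model.safetensors"
if tensors_file.is_file():
    local_tensors = load_file(str(tensors_file))
    for name, tensor in local_tensors.items():
        if name.endswith(".weight"):  # Dense modules store linear.weight
            print(name, tuple(tensor.shape))
```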
@@ -5285,7 +6008,6 @@ class Rwkv6Model(TextModel):
         self._set_vocab_rwkv_world()
 
     def set_gguf_parameters(self):
-        block_count = self.hparams["num_hidden_layers"]
         head_size = self.hparams["head_size"]
         hidden_size = self.hparams["hidden_size"]
         layer_norm_eps = self.hparams["layer_norm_epsilon"]
@@ -5297,7 +6019,7 @@ class Rwkv6Model(TextModel):
         # RWKV isn't context limited
         self.gguf_writer.add_context_length(1048576)
         self.gguf_writer.add_embedding_length(hidden_size)
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
         self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
         self.gguf_writer.add_wkv_head_size(head_size)
@@ -5361,7 +6083,6 @@ class RWKV6Qwen2Model(Rwkv6Model):
         self._set_vocab_gpt2()
 
     def set_gguf_parameters(self):
-        block_count = self.hparams["num_hidden_layers"]
         num_attention_heads = self.hparams["num_attention_heads"]
         num_key_value_heads = self.hparams["num_key_value_heads"]
         hidden_size = self.hparams["hidden_size"]
@@ -5374,7 +6095,7 @@ class RWKV6Qwen2Model(Rwkv6Model):
         # RWKV isn't context limited
         self.gguf_writer.add_context_length(1048576)
         self.gguf_writer.add_embedding_length(hidden_size)
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_wkv_head_size(head_size)
         self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
         self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
@@ -5415,7 +6136,6 @@ class Rwkv7Model(TextModel):
         return max(1, round(hidden_size ** exponent * multiplier / 32)) * 32
 
     def set_gguf_parameters(self):
-        block_count = self.hparams["num_hidden_layers"]
         try:
             head_size = self.hparams["head_size"]
             layer_norm_eps = self.hparams["layer_norm_epsilon"]
@@ -5440,7 +6160,7 @@ class Rwkv7Model(TextModel):
         # RWKV isn't context limited
         self.gguf_writer.add_context_length(1048576)
         self.gguf_writer.add_embedding_length(hidden_size)
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
         self.gguf_writer.add_wkv_head_size(head_size)
         self.gguf_writer.add_decay_lora_rank(lora_rank_decay)
@@ -5534,7 +6254,6 @@ class ARwkv7Model(Rwkv7Model):
         self._set_vocab_gpt2()
 
     def set_gguf_parameters(self):
-        block_count = self.hparams["num_hidden_layers"]
         hidden_size = self.hparams["hidden_size"]
         head_size = self.hparams["head_size"]
         rms_norm_eps = self.hparams["rms_norm_eps"]
@@ -5551,7 +6270,7 @@ class ARwkv7Model(Rwkv7Model):
        # RWKV isn't context limited
         self.gguf_writer.add_context_length(1048576)
         self.gguf_writer.add_embedding_length(hidden_size)
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
         self.gguf_writer.add_wkv_head_size(head_size)
         self.gguf_writer.add_decay_lora_rank(lora_rank_decay)
@@ -5749,20 +6468,12 @@ class Mamba2Model(TextModel):
 class JambaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.JAMBA
 
-    def get_vocab_base_pre(self, tokenizer) -> str:
-        del tokenizer # unused
-
-        return "gpt-2"
-
     def set_vocab(self):
         if (self.dir_model / "tokenizer.model").is_file():
-            # Using Jamba's tokenizer.json causes errors on model load
-            # (something about "byte not found in vocab"),
-            # but there's a working tokenizer.model
            self._set_vocab_sentencepiece()
         else:
-            # Some Jamba models only have a tokenizer.json, which works.
-            self._set_vocab_gpt2()
+            self._set_vocab_llama_hf()
+            self.gguf_writer.add_add_space_prefix(False)
 
     def set_gguf_parameters(self):
         d_model = self.find_hparam(["hidden_size", "mamba_d_model"])
@@ -5932,9 +6643,34 @@ class SeedOssModel(TextModel):
 
 
 @ModelBase.register("Olmo2ForCausalLM")
+@ModelBase.register("Olmo3ForCausalLM")
 class Olmo2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.OLMO2
 
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_attn_factors(rope_scaling["attention_factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+
+        if "sliding_window" in self.hparams:
+            self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
+
+            sliding_window_pattern = []
+            if "layer_types" in self.hparams:
+                sliding_window_pattern = [t == "sliding_attention" for t in self.hparams["layer_types"]]
+            else:
+                # Olmo2 does not use sliding window attention.
+                # Olmo3 defaults to using sliding window for all layers except every 4th.
+                for i in range(self.hparams["num_hidden_layers"]):
+                    sliding_window_pattern.append((i + 1) % 4 != 0)
+
+            self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
+
 
 @ModelBase.register("OlmoeForCausalLM")
 class OlmoeModel(TextModel):
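Note on the hunk above: when `layer_types` is absent, the fallback uses sliding attention on all layers except every 4th. The same pattern as a one-liner with a toy layer count:

```python
# Sketch only, not part of the diff: the default Olmo3 sliding-window pattern
# built above. Toy layer count.
num_hidden_layers = 8
sliding_window_pattern = [(i + 1) % 4 != 0 for i in range(num_hidden_layers)]
print(sliding_window_pattern)
# [True, True, True, False, True, True, True, False]
```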
@@ -6417,13 +7153,6 @@ class DeepseekV2Model(TextModel):
         self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
         self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
 
-        if hparams["scoring_func"] == "sigmoid":
-            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
-        elif hparams["scoring_func"] == "softmax":
-            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
-        else:
-            raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}")
-
         self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
 
         rope_scaling = self.hparams.get("rope_scaling") or {}
@@ -6518,6 +7247,94 @@ class DeepseekV2Model(TextModel):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@ModelBase.register("MiniMaxM2ForCausalLM")
+class MiniMaxM2Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.MINIMAXM2
+    _experts_cache: dict[int, dict[str, Tensor]] = {}
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.hparams["num_experts"] = self.hparams["num_local_experts"]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        self.gguf_writer.add_expert_feed_forward_length(self.find_hparam(["intermediate_size"]))
+        self.gguf_writer.add_rope_dimension_count(self.find_hparam(["rotary_dim"]))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+        # merge expert weights
+        if 'experts' in name:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            expert_cache = self._experts_cache.setdefault(bid, {})
+            expert_cache[name] = data_torch
+            expert_weights = ["w1", "w2", "w3"]
+
+            # not enough expert weights to merge
+            if len(expert_cache) < n_experts * len(expert_weights):
+                return []
+
+            tensors: list[tuple[str, Tensor]] = []
+            for w_name in expert_weights:
+                datas: list[Tensor] = []
+
+                for xid in range(n_experts):
+                    ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
+                    datas.append(expert_cache[ename])
+                    del expert_cache[ename]
+
+                data_torch = torch.stack(datas, dim=0)
+                merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
+                new_name = self.map_tensor_name(merged_name)
+                tensors.append((new_name, data_torch))
+
+            del self._experts_cache[bid]
+            return tensors
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("PanguEmbeddedForCausalLM")
+class PanguEmbeddedModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.PANGU_EMBED
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if "add_prefix_space" in tokenizer_config_json:
+                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        # PanguEmbedded's hparam loaded from config.json without head_dim
+        if (rope_dim := hparams.get("head_dim")) is None:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+        if hparams.get("head_dim") is None:
+            self.gguf_writer.add_key_length(rope_dim)
+            self.gguf_writer.add_value_length(rope_dim)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name == "lm_head.weight":
+            if self.hparams.get("tie_word_embeddings", False):
+                logger.info("Skipping tied output layer 'lm_head.weight'")
+                return []
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 @ModelBase.register("Dots1ForCausalLM")
 class Dots1Model(Qwen2MoeModel):
     model_arch = gguf.MODEL_ARCH.DOTS1
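Note: the new `MiniMaxM2Model.modify_tensors` follows the MoE-merge pattern used throughout this converter: per-expert weights are buffered in a cache until every expert's copy has streamed past, then stacked into one 3D tensor per projection. A stripped-down, runnable sketch of that pattern (illustrative; `merge_expert_weights` is a made-up helper, the tensor names follow the MiniMax layout shown above):

```python
import torch

def merge_expert_weights(cache: dict[str, torch.Tensor], bid: int,
                         n_experts: int, w_name: str) -> tuple[str, torch.Tensor]:
    # pop the n_experts individual weights for this projection, in expert order
    datas = [
        cache.pop(f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight")
        for xid in range(n_experts)
    ]
    # stack into a single [n_experts, ...] tensor, the shape the GGUF writer expects
    merged = torch.stack(datas, dim=0)
    return f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight", merged
```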
@@ -6533,11 +7350,6 @@ class Dots1Model(Qwen2MoeModel):
         self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
         self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])

-        if self.hparams["scoring_func"] == "noaux_tc":
-            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
-        else:
-            raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")
-
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
         if name.endswith("e_score_correction_bias"):
             name = name.replace("e_score_correction_bias", "e_score_correction.bias")
@@ -6573,6 +7385,7 @@ class PLMModel(TextModel):
 @ModelBase.register("T5ForConditionalGeneration")
 @ModelBase.register("MT5ForConditionalGeneration")
 @ModelBase.register("UMT5ForConditionalGeneration")
+@ModelBase.register("UMT5Model")
 class T5Model(TextModel):
     model_arch = gguf.MODEL_ARCH.T5

@@ -6681,7 +7494,9 @@ class T5Model(TextModel):
         self.gguf_writer.add_context_length(n_ctx)
         self.gguf_writer.add_embedding_length(self.hparams["d_model"])
         self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
-        self.gguf_writer.add_block_count(self.
+        self.gguf_writer.add_block_count(self.block_count)
+        if (dec_n_layer := self.hparams.get("num_decoder_layers")) is not None:
+            self.gguf_writer.add_decoder_block_count(dec_n_layer)
         self.gguf_writer.add_head_count(self.hparams["num_heads"])
         self.gguf_writer.add_key_length(self.hparams["d_kv"])
         self.gguf_writer.add_value_length(self.hparams["d_kv"])
@@ -6818,7 +7633,7 @@ class T5EncoderModel(TextModel):
         self.gguf_writer.add_context_length(n_ctx)
         self.gguf_writer.add_embedding_length(self.hparams["d_model"])
         self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
-        self.gguf_writer.add_block_count(self.
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_head_count(self.hparams["num_heads"])
         self.gguf_writer.add_key_length(self.hparams["d_kv"])
         self.gguf_writer.add_value_length(self.hparams["d_kv"])
@@ -6881,7 +7696,7 @@ class JaisModel(TextModel):
         self._set_vocab_gpt2()

     def set_gguf_parameters(self):
-        self.gguf_writer.add_block_count(self.
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_context_length(self.hparams["n_positions"])
         self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
         self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"])
@@ -6995,12 +7810,6 @@ class Glm4MoeModel(TextModel):
         special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # 151329
         special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"]) # 151338

-        # Patch broken chat template
-        if isinstance(special_vocab.chat_template, str) and "visible_text(m.content).endswith" in special_vocab.chat_template:
-            special_vocab.chat_template = special_vocab.chat_template.replace(
-                """{{ visible_text(m.content) }}\n{{- '/nothink' if (enable_thinking is defined and not enable_thinking and not visible_text(m.content).endswith("/nothink")) else '' -}}""",
-                """{% set content = visible_text(m.content) %}{{ content }}\n{{- '/nothink' if (enable_thinking is defined and not enable_thinking and not content.endswith("/nothink")) else '' -}}""")
-
         special_vocab.add_to_gguf(self.gguf_writer)

     def set_gguf_parameters(self):
@@ -7229,7 +8038,7 @@ class ChatGLMModel(TextModel):
         self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
         self.gguf_writer.add_embedding_length(n_embed)
         self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed)))
-        self.gguf_writer.add_block_count(self.
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_head_count(n_head)
         self.gguf_writer.add_head_count_kv(n_head_kv)
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon",1e-5))
@@ -7311,7 +8120,6 @@ class ExaoneModel(TextModel):
         num_kv_heads = hparams.get("num_key_value_heads", num_heads)
         layer_norm_eps = hparams["layer_norm_epsilon"]
         intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim
-        num_layers = hparams["num_layers"]
         # ignore for now as EXAONE-3.0-7.8B-Instruct attentino_dropout is 0.0
         # attention_dropout_rate = hparams["attention_dropout"]
         # ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0
@@ -7322,7 +8130,7 @@ class ExaoneModel(TextModel):
         self.gguf_writer.add_context_length(max_position_embeddings)
         self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps)
         self.gguf_writer.add_feed_forward_length(intermediate_size)
-        self.gguf_writer.add_block_count(
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_file_type(self.ftype)

         if (rope_theta := self.hparams.get("rope_theta")) is not None:
@@ -7545,6 +8353,21 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
             if i not in self._attn_layers
         ]

+        # There are some models in this family that are non-hybrid, but keep the
+        # same parent class by setting all layers to "attention." If this is the
+        # case, the model architecture needs to be updated to a standard
+        # "granite" or "granitemoe" model
+        if not self._ssm_layers:
+            has_experts = self.find_hparam(["num_experts_per_tok"], optional=True)
+            new_arch = (
+                gguf.MODEL_ARCH.GRANITE_MOE
+                if has_experts else
+                gguf.MODEL_ARCH.GRANITE
+            )
+            self.model_arch = new_arch
+            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[new_arch]
+            self.gguf_writer.add_architecture()
+
         # n_group and d_inner are used during reshape_tensors for mamba2
         # NOTE: Explicitly include hparam prefix prefix for d_model to
         # disambiguate with top-level head_dim
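Note: the added block rewrites the declared architecture when a checkpoint in this family contains no SSM layers at all. Reduced to its decision rule, it looks like this (sketch; `pick_granite_arch` is a made-up helper, the enum members are the real ones from `gguf.constants`):

```python
import gguf

def pick_granite_arch(ssm_layers: list[int], has_experts: bool) -> gguf.MODEL_ARCH:
    # only called for the degenerate, attention-only members of the family;
    # hybrid checkpoints keep their original architecture
    assert not ssm_layers
    return gguf.MODEL_ARCH.GRANITE_MOE if has_experts else gguf.MODEL_ARCH.GRANITE

print(pick_granite_arch([], has_experts=False))  # -> MODEL_ARCH.GRANITE
```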
@@ -7629,8 +8452,11 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
         self.gguf_writer.add_rope_dimension_count(rope_dim)
         self.gguf_writer.add_head_count_kv(head_count_kv_vec)

-        ## If Bamba, use rope, otherwise don't
-        use_rope =
+        ## If Bamba or non-hybrid, use rope, otherwise don't
+        use_rope = (
+            "BambaForCausalLM" in self.hparams["architectures"]
+            or not self._ssm_layers
+        )
         self.gguf_writer.add_rope_scaling_finetuned(use_rope)
         if not use_rope:
             self.gguf_writer.add_context_length(2**20)
@@ -7801,6 +8627,209 @@ class BailingMoeModel(TextModel):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@ModelBase.register("BailingMoeV2ForCausalLM")
+class BailingMoeV2Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.BAILINGMOE2
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if nextn_layers := self.hparams.get("num_nextn_predict_layers", 0):
+            self.block_count = self.hparams["num_hidden_layers"] + nextn_layers
+            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        if (rope_dim := hparams.get("head_dim")) is None:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+        else:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_shared_feed_forward_length(hparams.get("moe_shared_expert_intermediate_size", hparams["moe_intermediate_size"] * hparams["num_shared_experts"]))
+        self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+        self.gguf_writer.add_expert_count(hparams["num_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
+        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+
+        if (nextn_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
+            self.gguf_writer.add_nextn_predict_layers(nextn_layers)
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if "mlp.experts" in name:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            tensors: list[tuple[str, Tensor]] = []
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+
+            return tensors
+
+        if name.endswith(".expert_bias"):
+            name = name.replace(".expert_bias", ".expert_bias.bias")
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM")
+class GroveMoeModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.GROVEMOE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if (n_experts := self.hparams.get("num_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+            logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
+        # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L299
+        self.gguf_writer.add_expert_chunk_feed_forward_length(self.hparams.get("head_dim") or 128)
+        # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L298
+        self.gguf_writer.add_experts_per_group(2)
+        # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L376
+        self.gguf_writer.add_expert_group_scale(0.05)
+        # YaRN is not enabled by default
+        # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+    _chunk_experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.endswith(".expert_bias"):
+            # FIXME?: Unused https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L303
+            return []
+
+        # process the experts separately
+        if name.find("chunk_experts") != -1:
+            n_experts = self.hparams["num_experts"] // 2  # see add_experts_per_group
+            assert bid is not None
+
+            if self._chunk_experts is None:
+                self._chunk_experts = [{} for _ in range(self.block_count)]
+
+            self._chunk_experts[bid][name] = data_torch
+
+            if len(self._chunk_experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.chunk_experts.{xid}.{w_name}.weight"
+                        datas.append(self._chunk_experts[bid][ename])
+                        del self._chunk_experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.chunk_experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+        elif name.find("experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._chunk_experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            chunk_experts = [k for d in self._chunk_experts for k in d.keys()]
+            if len(chunk_experts) > 0:
+                raise ValueError(f"Unprocessed adjugate experts: {chunk_experts}")
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @ModelBase.register("ChameleonForConditionalGeneration")
 @ModelBase.register("ChameleonForCausalLM") # obsolete
 class ChameleonModel(TextModel):
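Note: in `GroveMoeModel` the chunk ("adjugate") experts are shared across expert groups: with `add_experts_per_group(2)` hardcoded, only `num_experts // 2` chunk experts exist per layer, which is why the merge loop above divides by two. A small worked example with hypothetical numbers:

```python
num_experts = 64          # routed experts per layer (example value, not from the diff)
experts_per_group = 2     # hardcoded in the converter above
num_chunk_experts = num_experts // experts_per_group
assert num_chunk_experts == 32
# per layer the converter therefore waits for 32 * 3 chunk-expert weights
# (down_proj / gate_proj / up_proj) before stacking them into 3D tensors
```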
@@ -8163,6 +9192,76 @@ class HunYuanMoEModel(TextModel):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@ModelBase.register("LLaDAMoEModel", "LLaDAMoEModelLM")
+class LLaDAMoEModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.LLADA_MOE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if (n_experts := self.hparams.get("num_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+
+        if (expert_intermediate_size := self.hparams.get("expert_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size)
+
+        # number of experts used per token (top-k)
+        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
+            self.gguf_writer.add_expert_used_count(n_experts_used)
+
+        self.gguf_writer.add_mask_token_id(156895)
+        self.gguf_writer.add_causal_attention(False)
+        self.gguf_writer.add_diffusion_shift_logits(False)
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    # Copied from: Qwen2MoeModel
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    # Copied from: Qwen2MoeModel
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @ModelBase.register("HunYuanDenseV1ForCausalLM")
 class HunYuanModel(TextModel):
     model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
@@ -8259,21 +9358,18 @@ class HunYuanModel(TextModel):
 class SmolLM3Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.SMOLLM3

-    def set_vocab(self):
-        super().set_vocab()
-        # remove unsupported array slicing in chat template
-        # ref: https://huggingface.co/ggml-org/SmolLM3-3B-GGUF/discussions/1
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
-        if tokenizer.chat_template is not None:
-            chat_template = tokenizer.chat_template.replace("[:]", "")
-            self.gguf_writer.add_chat_template(chat_template)
-

 @ModelBase.register("GptOssForCausalLM")
 class GptOssModel(TextModel):
     model_arch = gguf.MODEL_ARCH.GPT_OSS

+    # TODO: remove once MXFP4 is supported more generally
+    def dequant_model(self):
+        quant_config = self.hparams.get("quantization_config")
+        if quant_config is not None and quant_config.get("quant_method") == "mxfp4":
+            return
+        return super().dequant_model()
+
     def transform_nibble_layout(self, tensor):
         assert tensor.dtype == torch.uint8
         assert tensor.shape[-1] == 16
@@ -8443,6 +9539,75 @@ class LFM2Model(TextModel):
         return [(self.map_tensor_name(name), data_torch)]


+@ModelBase.register("Lfm2MoeForCausalLM")
+class LFM2MoeModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.LFM2MOE
+
+    def set_gguf_parameters(self):
+        # set num_key_value_heads only for attention layers
+        self.hparams["num_key_value_heads"] = [
+            self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0
+            for layer_type in self.hparams["layer_types"]
+        ]
+
+        super().set_gguf_parameters()
+
+        self.gguf_writer.add_expert_count(self.hparams["num_experts"])
+        self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
+        self.gguf_writer.add_leading_dense_block_count(self.hparams["num_dense_layers"])
+        self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+        self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])
+
+    # cache for experts weights for merging
+    _experts_cache: dict[int, dict[str, Tensor]] = {}
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # conv op requires 2d tensor
+        if 'conv.conv' in name:
+            data_torch = data_torch.squeeze(1)
+
+        if name.endswith(".expert_bias"):
+            name = name.replace(".expert_bias", ".expert_bias.bias")
+
+        # merge expert weights
+        if 'experts' in name:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            expert_cache = self._experts_cache.setdefault(bid, {})
+            expert_cache[name] = data_torch
+            expert_weights = ["w1", "w2", "w3"]
+
+            # not enough expert weights to merge
+            if len(expert_cache) < n_experts * len(expert_weights):
+                return []
+
+            tensors: list[tuple[str, Tensor]] = []
+            for w_name in expert_weights:
+                datas: list[Tensor] = []
+
+                for xid in range(n_experts):
+                    ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{w_name}.weight"
+                    datas.append(expert_cache[ename])
+                    del expert_cache[ename]
+
+                data_torch = torch.stack(datas, dim=0)
+                merged_name = f"layers.{bid}.feed_forward.experts.{w_name}.weight"
+                new_name = self.map_tensor_name(merged_name)
+                tensors.append((new_name, data_torch))
+
+            del self._experts_cache[bid]
+            return tensors
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+        assert not self._experts_cache
+
+
 @ModelBase.register("Lfm2VlForConditionalGeneration")
 class LFM2VLModel(MmprojModel):
     def __init__(self, *args, **kwargs):
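Note: `LFM2MoeModel.set_gguf_parameters` rewrites the scalar `num_key_value_heads` into a per-layer list so the GGUF metadata records zero KV heads for the short-conv layers. A tiny worked example with made-up hparams:

```python
# hypothetical hparams for a 4-layer LFM2-MoE-style model
layer_types = ["conv", "full_attention", "conv", "full_attention"]
num_key_value_heads = 8

# same comprehension as in the converter above
per_layer_kv_heads = [
    num_key_value_heads if layer_type == "full_attention" else 0
    for layer_type in layer_types
]
assert per_layer_kv_heads == [0, 8, 0, 8]
```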
@@ -8561,6 +9726,43 @@ class SmallThinkerModel(TextModel):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@ModelBase.register("ApertusForCausalLM")
+class ApertusModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.APERTUS
+    undo_permute = False
+
+    _alpha_n = {}
+    _alpha_p = {}
+    _beta = {}
+    _eps = {}
+
+    def modify_tensors(self, data_torch, name, bid):
+        # Handle xIELU activation parameters
+        n_layers = self.hparams["num_hidden_layers"]
+        if name.endswith(".act_fn.alpha_n"):
+            self._alpha_n[bid] = data_torch.to("cpu").float().item()
+            if (len(self._alpha_n) == n_layers):
+                self.gguf_writer.add_xielu_alpha_n([self._alpha_n[k] for k in sorted(self._alpha_n)])
+            return []
+        if name.endswith(".act_fn.alpha_p"):
+            self._alpha_p[bid] = data_torch.to("cpu").float().item()
+            if (len(self._alpha_p) == n_layers):
+                self.gguf_writer.add_xielu_alpha_p([self._alpha_p[k] for k in sorted(self._alpha_p)])
+            return []
+        if name.endswith(".act_fn.beta"):
+            self._beta[bid] = data_torch.to("cpu").float().item()
+            if (len(self._beta) == n_layers):
+                self.gguf_writer.add_xielu_beta([self._beta[k] for k in sorted(self._beta)])
+            return []
+        if name.endswith(".act_fn.eps"):
+            self._eps[bid] = data_torch.to("cpu").float().item()
+            if (len(self._eps) == n_layers):
+                self.gguf_writer.add_xielu_eps([self._eps[k] for k in sorted(self._eps)])
+            return []
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 class MistralModel(LlamaModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
     model_name = "Mistral"
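Note: `ApertusModel` buffers the four scalar xIELU activation parameters per layer and writes each list only once every layer has been seen, ordered by layer index. The buffering idiom, reduced to a single parameter (illustrative sketch; `collect` is a made-up helper name):

```python
import torch

alpha_n: dict[int, float] = {}   # layer index -> scalar value
n_layers = 3                     # hypothetical layer count

def collect(bid: int, tensor: torch.Tensor) -> list[float] | None:
    """Buffer one per-layer scalar; return the ordered list once complete."""
    alpha_n[bid] = tensor.to("cpu").float().item()
    if len(alpha_n) == n_layers:
        return [alpha_n[k] for k in sorted(alpha_n)]
    return None

collect(1, torch.tensor(0.5))                 # layers may arrive out of order
collect(0, torch.tensor(0.1))
print(collect(2, torch.tensor(0.9)))          # -> [0.1, 0.5, 0.9]
```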
@@ -8570,7 +9772,7 @@ class MistralModel(LlamaModel):

     @staticmethod
     def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mistral_format: bool):
-        assert TokenizerVersion is not None
+        assert TokenizerVersion is not None and Tekkenizer is not None and SentencePieceTokenizer is not None, _mistral_import_error_msg
         assert isinstance(vocab.tokenizer, (Tekkenizer, SentencePieceTokenizer)), (
             f"Expected Tekkenizer or SentencePieceTokenizer, got {type(vocab.tokenizer)}"
         )
@@ -8638,6 +9840,21 @@ class PixtralModel(LlavaVisionModel):
         return super().map_tensor_name(name, try_suffixes)


+@ModelBase.register("LightOnOCRForConditionalGeneration")
+class LightOnOCRVisionModel(LlavaVisionModel):
+    is_mistral_format = False
+    use_break_tok = False
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LIGHTONOCR)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        name = name.replace("model.vision_encoder.", "vision_tower.")
+        name = name.replace("model.vision_projection.", "multi_modal_projector.")
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("KimiVLForConditionalGeneration")
 class KimiVLModel(MmprojModel):
     def __init__(self, *args, **kwargs):
@@ -8674,6 +9891,144 @@ class KimiVLModel(MmprojModel):

         return [] # skip other tensors

+
+@ModelBase.register("CogVLMForCausalLM")
+class CogVLMVisionModel(MmprojModel):
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.COGVLM)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+
+        if not name.startswith("model.vision."):
+            return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@ModelBase.register("CogVLMForCausalLM")
+class CogVLMModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.COGVLM
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+
+        # block vision tensors
+        if name.startswith("model.vision."):
+            return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@ModelBase.register("JanusForConditionalGeneration")
+class JanusProModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.LLAMA # reuse Llama arch
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Skip vision, aligner, and generation tensors
+        skip_prefixes = (
+            'model.vision_model.',
+            'model.aligner.',
+            'model.vqmodel.',
+            'model.generation_embeddings.',
+            'model.generation_aligner.',
+            'model.generation_head.',
+        )
+        if name.startswith(skip_prefixes):
+            return []
+
+        if name.startswith('model.language_model.'):
+            name = name.replace('model.language_model.', 'model.')
+        elif name.startswith('language_model.'):
+            name = name.replace('language_model.', '')
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("JanusForConditionalGeneration")
+class JanusProVisionModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        if "intermediate_size" not in self.hparams_vision:
+            mlp_ratio = self.hparams_vision.get("mlp_ratio")
+            hidden_size = self.hparams_vision.get("hidden_size")
+            if mlp_ratio is not None and hidden_size is not None:
+                self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio))
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        assert self.hparams_vision is not None
+
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.JANUS_PRO)
+
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))
+
+        hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower()
+        if hidden_act == "gelu":
+            self.gguf_writer.add_vision_use_gelu(True)
+        elif hidden_act == "silu":
+            self.gguf_writer.add_vision_use_silu(True)
+
+    def _map_aligner_tensor(self, data_torch: Tensor, name: str) -> Iterable[tuple[str, Tensor]]:
+        """Map aligner tensors to projector format"""
+        suffix = ".bias" if name.endswith(".bias") else ".weight"
+
+        if name.startswith("model.aligner."):
+            local_name = name[len("model.aligner."):]
+        elif name.startswith("aligner."):
+            local_name = name[len("aligner."):]
+        else:
+            raise ValueError(f"Unsupported Janus aligner prefix: {name}")
+
+        if local_name.startswith("fc1."):
+            mm_index = 0
+        elif local_name.startswith("hidden_layers."):
+            parts = local_name.split(".", 2)
+            if len(parts) < 3:
+                raise ValueError(f"Unexpected Janus aligner tensor name: {name}")
+            mm_index = int(parts[1]) + 1
+        else:
+            raise ValueError(f"Unsupported Janus aligner tensor: {name}")
+
+        tensor_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_index, suffix=suffix)
+        return [(tensor_name, data_torch)]
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+
+        # Skip language model tensors as they will be handled by `JanusProModel`
+        if name.startswith(('model.language_model.', 'language_model.')):
+            return []
+
+        # Skip generation-related components
+        skip_generation_prefixes = (
+            'model.vqmodel.',
+            'vqmodel.',
+            'model.generation_embeddings.',
+            'generation_embeddings.',
+            'model.generation_aligner.',
+            'generation_aligner.',
+            'model.generation_head.',
+            'generation_head.',
+        )
+        if name.startswith(skip_generation_prefixes):
+            return []
+
+        # Handle aligner tensors
+        if name.startswith(('model.aligner.', 'aligner.')):
+            return list(self._map_aligner_tensor(data_torch, name))
+
+        # Handle vision tensors
+        if name.startswith(('model.vision_model.', 'vision_model.')):
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return []
+
+
 ###### CONVERSION LOGIC ######


@@ -8728,7 +10083,17 @@ class LazyTorchTensor(gguf.LazyBase):
     def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
         dtype = cls._dtype_str_map[st_slice.get_dtype()]
        shape: tuple[int, ...] = tuple(st_slice.get_shape())
-        lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
+        lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[...] if len(s.get_shape()) == 0 else s[:])
+        return cast(torch.Tensor, lazy)
+
+    @classmethod
+    def from_local_tensor(cls, t: gguf.utility.LocalTensor) -> Tensor:
+        def load_tensor(tensor: gguf.utility.LocalTensor) -> Tensor:
+            dtype = cls._dtype_str_map[tensor.dtype]
+            return torch.from_numpy(tensor.mmap_bytes()).view(dtype).reshape(tensor.shape)
+        dtype = cls._dtype_str_map[t.dtype]
+        shape = t.shape
+        lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(t,), func=lambda r: load_tensor(r))
         return cast(torch.Tensor, lazy)

     @classmethod
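Note: the changed lambda in `from_safetensors_slice` exists because a plain `[:]` slice is invalid on a 0-dimensional (scalar) tensor, while `[...]` indexing works at every rank. PyTorch tensors exhibit the same behaviour, which makes the distinction easy to demonstrate (sketch; the real objects here are safetensors slices, not torch tensors):

```python
import torch

scalar = torch.tensor(3.14)   # 0-dim tensor, shape == ()
vector = torch.arange(4)      # 1-dim tensor

assert scalar[...].item() == scalar.item()   # Ellipsis indexing is rank-agnostic
assert torch.equal(vector[:], vector)        # a normal slice works from rank 1 up

try:
    scalar[:]                                # slicing a 0-dim tensor raises
except IndexError as e:
    print(f"as expected: {e}")
```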
@@ -8836,6 +10201,13 @@ def parse_args() -> argparse.Namespace:
         )
     )

+    parser.add_argument(
+        "--sentence-transformers-dense-modules", action="store_true",
+        help=("Whether to include sentence-transformers dense modules."
+              "It can be used for sentence-transformers models, like google/embeddinggemma-300m"
+              "Default these modules are not included.")
+    )
+
     args = parser.parse_args()
     if not args.print_supported_models and args.model is None:
         parser.error("the following arguments are required: model")
@@ -8898,9 +10270,13 @@ def main() -> None:
     if args.remote:
         hf_repo_id = args.model
         from huggingface_hub import snapshot_download
+        allowed_patterns = ["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"]
+        if args.sentence_transformers_dense_modules:
+            # include sentence-transformers dense modules safetensors files
+            allowed_patterns.append("*.safetensors")
         local_dir = snapshot_download(
             repo_id=hf_repo_id,
-            allow_patterns=
+            allow_patterns=allowed_patterns)
         dir_model = Path(local_dir)
         logger.info(f"Downloaded config and tokenizer to {local_dir}")
     else:
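Note: with the new flag, a `--remote` conversion also downloads the safetensors files that contain the sentence-transformers Dense modules. A hedged sketch of the resulting download call (the repo id is the example named in the flag's help text; this is not the converter's exact code path):

```python
from huggingface_hub import snapshot_download

# patterns mirrored from the diff above; "*.safetensors" is appended only when
# --sentence-transformers-dense-modules is passed on the command line
allowed_patterns = ["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model", "*.safetensors"]

local_dir = snapshot_download(
    repo_id="google/embeddinggemma-300m",   # example model from the help text
    allow_patterns=allowed_patterns,
)
print(local_dir)
```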
@@ -8936,11 +10312,9 @@ def main() -> None:

     logger.info(f"Loading model: {dir_model.name}")

-    if args.mmproj:
-        if "mmproj" not in fname_out.name:
-            fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-")
-
     is_mistral_format = args.mistral_format
+    if is_mistral_format and not _mistral_common_installed:
+        raise ImportError(_mistral_import_error_msg)
     disable_mistral_community_chat_template = args.disable_mistral_community_chat_template

     with torch.inference_mode():
@@ -8968,7 +10342,8 @@ def main() -> None:
             split_max_tensors=args.split_max_tensors,
             split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
             small_first_shard=args.no_tensor_first_split,
-            remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template
+            remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template,
+            sentence_transformers_dense_modules=args.sentence_transformers_dense_modules
         )

         if args.vocab_only: