@novastera-oss/llamarn 0.2.7 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +8 -3
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +56 -22
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +1 -2
- package/cpp/llama.cpp/README.md +4 -5
- package/cpp/llama.cpp/build-xcframework.sh +1 -1
- package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
- package/cpp/llama.cpp/common/arg.cpp +24 -0
- package/cpp/llama.cpp/common/chat.cpp +37 -20
- package/cpp/llama.cpp/common/chat.h +2 -0
- package/cpp/llama.cpp/common/common.cpp +3 -0
- package/cpp/llama.cpp/common/common.h +5 -0
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/convert_hf_to_gguf.py +860 -23
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +9 -0
- package/cpp/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +206 -10
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -1
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +36 -18
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +68 -5
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +16 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +37 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +111 -103
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1405 -240
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +212 -34
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +35 -11
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +187 -54
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +71 -29
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +8 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +14 -12
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +5 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +15 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +12 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +269 -110
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +2 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
- package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +11 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -5
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +125 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +51 -9
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +394 -80
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +616 -239
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +741 -571
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +697 -1098
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +104 -62
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +39 -38
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +767 -292
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +58 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +23 -3
- package/cpp/llama.cpp/ggml/src/ggml.c +449 -72
- package/cpp/llama.cpp/ggml/src/gguf.cpp +13 -2
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +285 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +27 -0
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +137 -21
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +109 -7
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +8 -43
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +265 -3
- package/cpp/llama.cpp/src/llama-arch.h +36 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +596 -359
- package/cpp/llama.cpp/src/llama-batch.h +105 -70
- package/cpp/llama.cpp/src/llama-chat.cpp +26 -6
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +101 -107
- package/cpp/llama.cpp/src/llama-context.h +13 -13
- package/cpp/llama.cpp/src/llama-graph.cpp +286 -404
- package/cpp/llama.cpp/src/llama-graph.h +78 -79
- package/cpp/llama.cpp/src/llama-hparams.cpp +11 -1
- package/cpp/llama.cpp/src/llama-hparams.h +11 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +74 -66
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +23 -26
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +312 -157
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +79 -46
- package/cpp/llama.cpp/src/llama-kv-cells.h +97 -21
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +73 -69
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +19 -22
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +88 -77
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
- package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
- package/cpp/llama.cpp/src/llama-memory.h +21 -22
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +5301 -2922
- package/cpp/llama.cpp/src/llama-model.h +40 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +88 -5
- package/cpp/llama.cpp/src/llama-vocab.cpp +37 -3
- package/cpp/llama.cpp/src/llama-vocab.h +42 -0
- package/cpp/rn-utils.h +3 -0
- package/ios/include/chat.h +2 -0
- package/ios/include/common.h +5 -0
- package/ios/include/llama.h +8 -43
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4863
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3742
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3891 -3744
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4863
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3742
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5095 -4900
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5066 -4871
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3919 -3773
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
- package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
|
@@ -300,6 +300,7 @@ class ModelBase:
|
|
|
300
300
|
gguf.MODEL_TENSOR.POS_EMBD,
|
|
301
301
|
gguf.MODEL_TENSOR.TOKEN_TYPES,
|
|
302
302
|
gguf.MODEL_TENSOR.SSM_CONV1D,
|
|
303
|
+
gguf.MODEL_TENSOR.SHORTCONV_CONV,
|
|
303
304
|
gguf.MODEL_TENSOR.TIME_MIX_FIRST,
|
|
304
305
|
gguf.MODEL_TENSOR.TIME_MIX_W1,
|
|
305
306
|
gguf.MODEL_TENSOR.TIME_MIX_W2,
|
|
@@ -310,6 +311,8 @@ class ModelBase:
|
|
|
310
311
|
gguf.MODEL_TENSOR.POSNET_NORM2,
|
|
311
312
|
gguf.MODEL_TENSOR.V_ENC_EMBD_POS,
|
|
312
313
|
gguf.MODEL_TENSOR.A_ENC_EMBD_POS,
|
|
314
|
+
gguf.MODEL_TENSOR.ALTUP_CORRECT_COEF,
|
|
315
|
+
gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF,
|
|
313
316
|
)
|
|
314
317
|
)
|
|
315
318
|
or not new_name.endswith(".weight")
|
|
@@ -320,7 +323,11 @@ class ModelBase:
|
|
|
320
323
|
self.match_model_tensor_name(new_name, key, bid)
|
|
321
324
|
for key in (
|
|
322
325
|
gguf.MODEL_TENSOR.TOKEN_EMBD,
|
|
326
|
+
gguf.MODEL_TENSOR.PER_LAYER_TOKEN_EMBD,
|
|
323
327
|
gguf.MODEL_TENSOR.OUTPUT,
|
|
328
|
+
gguf.MODEL_TENSOR.ALTUP_ROUTER,
|
|
329
|
+
gguf.MODEL_TENSOR.LAUREL_L,
|
|
330
|
+
gguf.MODEL_TENSOR.LAUREL_R,
|
|
324
331
|
)
|
|
325
332
|
):
|
|
326
333
|
if self.ftype in (
|
|
@@ -809,6 +816,30 @@ class TextModel(ModelBase):
|
|
|
809
816
|
if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
|
|
810
817
|
# ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
|
|
811
818
|
res = "minerva-7b"
|
|
819
|
+
if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664":
|
|
820
|
+
# ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct
|
|
821
|
+
res = "hunyuan"
|
|
822
|
+
if chkhsh == "b0a6b1c0bd5998ebd9df08611efde34a4ff03faed45ae09c43e6b31ebd4b94cf":
|
|
823
|
+
# ref: https://huggingface.co/skt/A.X-4.0
|
|
824
|
+
res = "a.x-4.0"
|
|
825
|
+
if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6":
|
|
826
|
+
# ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base
|
|
827
|
+
res = "falcon-h1"
|
|
828
|
+
if chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86":
|
|
829
|
+
# ref: https://huggingface.co/tiiuae/Falcon-H1-1B-Base
|
|
830
|
+
res = "falcon-h1"
|
|
831
|
+
if chkhsh == "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896":
|
|
832
|
+
# ref: https://huggingface.co/tiiuae/Falcon-H1-7B-Base
|
|
833
|
+
res = "falcon-h1"
|
|
834
|
+
if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b":
|
|
835
|
+
# ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base
|
|
836
|
+
res = "falcon-h1"
|
|
837
|
+
if chkhsh == "f6791d196f87ce6b56a7d234be618e0d58f8cda3549416635b2bebcd22cd95c4":
|
|
838
|
+
# ref: https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct
|
|
839
|
+
res = "midm-2.0"
|
|
840
|
+
if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
|
|
841
|
+
# ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
|
|
842
|
+
res = "lfm2"
|
|
812
843
|
|
|
813
844
|
if res is None:
|
|
814
845
|
logger.warning("\n")
|
|
@@ -921,13 +952,20 @@ class TextModel(ModelBase):
|
|
|
921
952
|
tokenizer = SentencePieceProcessor()
|
|
922
953
|
tokenizer.LoadFromFile(str(tokenizer_path))
|
|
923
954
|
|
|
924
|
-
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
|
|
955
|
+
vocab_size = self.find_hparam([
|
|
956
|
+
"vocab_size_per_layer_input", # gemma3n
|
|
957
|
+
"vocab_size",
|
|
958
|
+
], optional=True) or tokenizer.vocab_size()
|
|
925
959
|
|
|
926
960
|
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
|
|
927
961
|
scores: list[float] = [-10000.0] * vocab_size
|
|
928
962
|
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
|
|
929
963
|
|
|
930
964
|
for token_id in range(tokenizer.vocab_size()):
|
|
965
|
+
if token_id >= vocab_size:
|
|
966
|
+
logger.warning(f'ignore tokens from {token_id}: id is out of range, max={vocab_size - 1}')
|
|
967
|
+
break
|
|
968
|
+
|
|
931
969
|
piece = tokenizer.IdToPiece(token_id)
|
|
932
970
|
text = piece.encode("utf-8")
|
|
933
971
|
score = tokenizer.GetScore(token_id)
|
|
@@ -2145,7 +2183,6 @@ class Llama4Model(LlamaModel):
|
|
|
2145
2183
|
|
|
2146
2184
|
def set_vocab(self):
|
|
2147
2185
|
self._set_vocab_gpt2()
|
|
2148
|
-
self.gguf_writer.add_add_bos_token(True)
|
|
2149
2186
|
|
|
2150
2187
|
def set_gguf_parameters(self):
|
|
2151
2188
|
super().set_gguf_parameters()
|
|
@@ -2194,7 +2231,7 @@ class Llama4VisionModel(MmprojModel):
|
|
|
2194
2231
|
name += ".weight"
|
|
2195
2232
|
if "multi_modal_projector.linear_1" in name:
|
|
2196
2233
|
# despite the name with number postfix, this is a single fully connected layer
|
|
2197
|
-
return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC], data_torch)]
|
|
2234
|
+
return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC] + '.weight', data_torch)]
|
|
2198
2235
|
return [(self.map_tensor_name(name), data_torch)]
|
|
2199
2236
|
return []
|
|
2200
2237
|
|
|
@@ -2731,6 +2768,52 @@ class Qwen2Model(TextModel):
|
|
|
2731
2768
|
yield from super().modify_tensors(data_torch, name, bid)
|
|
2732
2769
|
|
|
2733
2770
|
|
|
2771
|
+
@ModelBase.register("Ernie4_5_ForCausalLM")
|
|
2772
|
+
class Ernie4_5Model(TextModel):
|
|
2773
|
+
model_arch = gguf.MODEL_ARCH.ERNIE4_5
|
|
2774
|
+
|
|
2775
|
+
def set_vocab(self):
|
|
2776
|
+
self._set_vocab_sentencepiece()
|
|
2777
|
+
|
|
2778
|
+
def set_gguf_parameters(self):
|
|
2779
|
+
super().set_gguf_parameters()
|
|
2780
|
+
|
|
2781
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
|
2782
|
+
num_heads = self.hparams["num_attention_heads"]
|
|
2783
|
+
num_kv_heads = self.hparams["num_key_value_heads"]
|
|
2784
|
+
head_dim = self.hparams["head_dim"]
|
|
2785
|
+
|
|
2786
|
+
if "ernie." in name:
|
|
2787
|
+
name = name.replace("ernie.", "model.")
|
|
2788
|
+
# split the qkv weights
|
|
2789
|
+
# qkv_proj shape: [(num_heads + 2 * num_kv_heads) * head_dim, hidden_size]
|
|
2790
|
+
if "qkv_proj" in name:
|
|
2791
|
+
name_q = name.replace("qkv_proj.weight", "q_proj.weight")
|
|
2792
|
+
name_k = name.replace("qkv_proj.weight", "k_proj.weight")
|
|
2793
|
+
name_v = name.replace("qkv_proj.weight", "v_proj.weight")
|
|
2794
|
+
total_q_dim = num_heads * head_dim
|
|
2795
|
+
total_k_dim = num_kv_heads * head_dim
|
|
2796
|
+
total_v_dim = num_kv_heads * head_dim
|
|
2797
|
+
q_proj_weight, k_proj_weight, v_proj_weight = data_torch.split([total_q_dim, total_k_dim, total_v_dim], dim=0)
|
|
2798
|
+
return [
|
|
2799
|
+
(self.map_tensor_name(name_q), q_proj_weight),
|
|
2800
|
+
(self.map_tensor_name(name_k), k_proj_weight),
|
|
2801
|
+
(self.map_tensor_name(name_v), v_proj_weight)
|
|
2802
|
+
]
|
|
2803
|
+
# split the up_gate_proj into gate and up
|
|
2804
|
+
# up_gate_proj shape: [2 * intermediate_size, hidden_size]
|
|
2805
|
+
if "up_gate_proj" in name:
|
|
2806
|
+
name_up = name.replace("up_gate_proj.weight", "up_proj.weight")
|
|
2807
|
+
name_gate = name.replace("up_gate_proj.weight", "gate_proj.weight")
|
|
2808
|
+
dim_half = data_torch.shape[0] // 2
|
|
2809
|
+
gate_proj_weight, up_proj_weight = data_torch.split(dim_half, dim=0)
|
|
2810
|
+
return [
|
|
2811
|
+
(self.map_tensor_name(name_gate), gate_proj_weight),
|
|
2812
|
+
(self.map_tensor_name(name_up), up_proj_weight)
|
|
2813
|
+
]
|
|
2814
|
+
return [(self.map_tensor_name(name), data_torch)]
|
|
2815
|
+
|
|
2816
|
+
|
|
2734
2817
|
@ModelBase.register(
|
|
2735
2818
|
"Qwen2VLModel",
|
|
2736
2819
|
"Qwen2VLForConditionalGeneration",
|
|
@@ -3918,9 +4001,6 @@ class BertModel(TextModel):
|
|
|
3918
4001
|
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
|
3919
4002
|
special_vocab.add_to_gguf(self.gguf_writer)
|
|
3920
4003
|
|
|
3921
|
-
self.gguf_writer.add_add_bos_token(True)
|
|
3922
|
-
self.gguf_writer.add_add_eos_token(True)
|
|
3923
|
-
|
|
3924
4004
|
|
|
3925
4005
|
@ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification")
|
|
3926
4006
|
class DistilBertModel(BertModel):
|
|
@@ -3962,8 +4042,6 @@ class RobertaModel(BertModel):
|
|
|
3962
4042
|
bpe_tok_path = self.dir_model / "tokenizer.json"
|
|
3963
4043
|
if bpe_tok_path.exists():
|
|
3964
4044
|
self._set_vocab_gpt2()
|
|
3965
|
-
self.gguf_writer.add_add_bos_token(True)
|
|
3966
|
-
self.gguf_writer.add_add_eos_token(True)
|
|
3967
4045
|
|
|
3968
4046
|
# we need this to validate the size of the token_type embeddings
|
|
3969
4047
|
# though currently we are passing all zeros to the token_type embeddings
|
|
@@ -4223,6 +4301,7 @@ class Gemma2Model(TextModel):
|
|
|
4223
4301
|
@ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration")
|
|
4224
4302
|
class Gemma3Model(TextModel):
|
|
4225
4303
|
model_arch = gguf.MODEL_ARCH.GEMMA3
|
|
4304
|
+
norm_shift = 1.0 # Gemma3RMSNorm adds 1.0 to the norm value
|
|
4226
4305
|
|
|
4227
4306
|
def set_vocab(self):
|
|
4228
4307
|
self._set_vocab_sentencepiece()
|
|
@@ -4244,9 +4323,8 @@ class Gemma3Model(TextModel):
|
|
|
4244
4323
|
self.gguf_writer.add_value_length(hparams.get("head_dim", 256))
|
|
4245
4324
|
self.gguf_writer.add_file_type(self.ftype)
|
|
4246
4325
|
self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers
|
|
4247
|
-
#
|
|
4326
|
+
# attn_logit_softcapping is removed in Gemma3
|
|
4248
4327
|
assert hparams.get("attn_logit_softcapping") is None
|
|
4249
|
-
assert hparams.get("final_logit_softcapping") is None
|
|
4250
4328
|
self.gguf_writer.add_sliding_window(hparams["sliding_window"])
|
|
4251
4329
|
self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
|
|
4252
4330
|
if hparams.get("rope_scaling") is not None:
|
|
@@ -4258,7 +4336,7 @@ class Gemma3Model(TextModel):
|
|
|
4258
4336
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
|
4259
4337
|
del bid # unused
|
|
4260
4338
|
|
|
4261
|
-
if
|
|
4339
|
+
if "language_model." in name:
|
|
4262
4340
|
name = name.replace("language_model.", "")
|
|
4263
4341
|
|
|
4264
4342
|
elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
|
|
@@ -4273,8 +4351,9 @@ class Gemma3Model(TextModel):
|
|
|
4273
4351
|
|
|
4274
4352
|
# ref code in Gemma3RMSNorm
|
|
4275
4353
|
# output = output * (1.0 + self.weight.float())
|
|
4354
|
+
# note: this is not the case on gemma3n
|
|
4276
4355
|
if name.endswith("norm.weight"):
|
|
4277
|
-
data_torch = data_torch +
|
|
4356
|
+
data_torch = data_torch + self.norm_shift
|
|
4278
4357
|
|
|
4279
4358
|
return [(self.map_tensor_name(name), data_torch)]
|
|
4280
4359
|
|
|
@@ -4331,6 +4410,101 @@ class Gemma3VisionModel(MmprojModel):
|
|
|
4331
4410
|
return [] # skip other tensors
|
|
4332
4411
|
|
|
4333
4412
|
|
|
4413
|
+
@ModelBase.register("Gemma3nForConditionalGeneration")
|
|
4414
|
+
class Gemma3NModel(Gemma3Model):
|
|
4415
|
+
model_arch = gguf.MODEL_ARCH.GEMMA3N
|
|
4416
|
+
norm_shift = 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code
|
|
4417
|
+
|
|
4418
|
+
_altup_proj: list[Tensor] = []
|
|
4419
|
+
_altup_unembd: list[Tensor] = []
|
|
4420
|
+
|
|
4421
|
+
def __init__(self, *args, **kwargs):
|
|
4422
|
+
super().__init__(*args, **kwargs)
|
|
4423
|
+
assert self.hparams["altup_num_inputs"] == 4, "Current conversion only supports 4 altup inputs"
|
|
4424
|
+
self._altup_proj = [
|
|
4425
|
+
torch.Tensor(), # to be replaced
|
|
4426
|
+
torch.Tensor(), # to be replaced
|
|
4427
|
+
torch.Tensor(), # to be replaced
|
|
4428
|
+
]
|
|
4429
|
+
self._altup_unembd = [
|
|
4430
|
+
torch.Tensor(), # to be replaced
|
|
4431
|
+
torch.Tensor(), # to be replaced
|
|
4432
|
+
torch.Tensor(), # to be replaced
|
|
4433
|
+
]
|
|
4434
|
+
|
|
4435
|
+
def set_vocab(self):
|
|
4436
|
+
super().set_vocab()
|
|
4437
|
+
|
|
4438
|
+
def set_gguf_parameters(self):
|
|
4439
|
+
super().set_gguf_parameters()
|
|
4440
|
+
self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"])
|
|
4441
|
+
self.gguf_writer.add_altup_num_inputs(self.hparams["altup_num_inputs"])
|
|
4442
|
+
self.gguf_writer.add_embedding_length_per_layer_input(self.hparams["hidden_size_per_layer_input"])
|
|
4443
|
+
self.gguf_writer.add_shared_kv_layers(self.hparams["num_kv_shared_layers"])
|
|
4444
|
+
|
|
4445
|
+
activation_sparsity_scale = []
|
|
4446
|
+
for s in self.hparams["activation_sparsity_pattern"]:
|
|
4447
|
+
normal_dist = torch.distributions.normal.Normal(0, 1)
|
|
4448
|
+
std_multiplier = normal_dist.icdf(torch.tensor(s, dtype=torch.float32))
|
|
4449
|
+
activation_sparsity_scale.append(std_multiplier.item())
|
|
4450
|
+
self.gguf_writer.add_activation_sparsity_scale(activation_sparsity_scale)
|
|
4451
|
+
|
|
4452
|
+
sliding_window_pattern = []
|
|
4453
|
+
for t in self.hparams["layer_types"]:
|
|
4454
|
+
sliding_window_pattern.append(t == "sliding_attention")
|
|
4455
|
+
self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
|
|
4456
|
+
|
|
4457
|
+
def _stack_matrices(self, matrices: list[Tensor]) -> Tensor | None:
|
|
4458
|
+
has_all = all(m.numel() > 0 for m in matrices)
|
|
4459
|
+
if not has_all:
|
|
4460
|
+
return None
|
|
4461
|
+
else:
|
|
4462
|
+
return torch.stack(matrices, dim=0)
|
|
4463
|
+
|
|
4464
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
|
4465
|
+
if name.endswith("_scale"):
|
|
4466
|
+
name = name + ".weight"
|
|
4467
|
+
|
|
4468
|
+
# TODO: implement self.prediction_coefs.weight.clamp_(...)
|
|
4469
|
+
|
|
4470
|
+
if "language_model." not in name:
|
|
4471
|
+
return [] # skip non-language model tensors
|
|
4472
|
+
|
|
4473
|
+
if "altup_unembed_projections" in name:
|
|
4474
|
+
data_torch = data_torch.to(device="cpu")
|
|
4475
|
+
if ".0." in name:
|
|
4476
|
+
self._altup_unembd[0] = data_torch
|
|
4477
|
+
elif ".1." in name:
|
|
4478
|
+
self._altup_unembd[1] = data_torch
|
|
4479
|
+
elif ".2." in name:
|
|
4480
|
+
self._altup_unembd[2] = data_torch
|
|
4481
|
+
else:
|
|
4482
|
+
raise ValueError(f"Unknown name: {name}")
|
|
4483
|
+
out = self._stack_matrices(self._altup_unembd)
|
|
4484
|
+
if out is not None:
|
|
4485
|
+
return [(self.map_tensor_name("model.altup_unembed_projections.weight"), out)]
|
|
4486
|
+
else:
|
|
4487
|
+
return []
|
|
4488
|
+
|
|
4489
|
+
if "altup_projections" in name:
|
|
4490
|
+
data_torch = data_torch.to(device="cpu")
|
|
4491
|
+
if ".0." in name:
|
|
4492
|
+
self._altup_proj[0] = data_torch
|
|
4493
|
+
elif ".1." in name:
|
|
4494
|
+
self._altup_proj[1] = data_torch
|
|
4495
|
+
elif ".2." in name:
|
|
4496
|
+
self._altup_proj[2] = data_torch
|
|
4497
|
+
else:
|
|
4498
|
+
raise ValueError(f"Unknown name: {name}")
|
|
4499
|
+
out = self._stack_matrices(self._altup_proj)
|
|
4500
|
+
if out is not None:
|
|
4501
|
+
return [(self.map_tensor_name("model.altup_projections.weight"), out)]
|
|
4502
|
+
else:
|
|
4503
|
+
return []
|
|
4504
|
+
|
|
4505
|
+
return super().modify_tensors(data_torch, name, bid)
|
|
4506
|
+
|
|
4507
|
+
|
|
4334
4508
|
@ModelBase.register("Starcoder2ForCausalLM")
|
|
4335
4509
|
class StarCoder2Model(TextModel):
|
|
4336
4510
|
model_arch = gguf.MODEL_ARCH.STARCODER2
|
|
@@ -4629,6 +4803,14 @@ class ARwkv7Model(Rwkv7Model):
|
|
|
4629
4803
|
class MambaModel(TextModel):
|
|
4630
4804
|
model_arch = gguf.MODEL_ARCH.MAMBA
|
|
4631
4805
|
|
|
4806
|
+
def __init__(self, dir_model: Path, *args, **kwargs):
|
|
4807
|
+
# Avoid using AutoConfig for hparams
|
|
4808
|
+
hparams = kwargs.pop("hparams", None)
|
|
4809
|
+
if hparams is None:
|
|
4810
|
+
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
|
4811
|
+
hparams = json.load(f)
|
|
4812
|
+
super().__init__(dir_model, *args, hparams=hparams, **kwargs)
|
|
4813
|
+
|
|
4632
4814
|
def set_vocab(self):
|
|
4633
4815
|
vocab_size = self.hparams["vocab_size"]
|
|
4634
4816
|
# Round vocab size to next multiple of 8
|
|
@@ -4703,6 +4885,216 @@ class MambaModel(TextModel):
|
|
|
4703
4885
|
return [(new_name, data_torch)]
|
|
4704
4886
|
|
|
4705
4887
|
|
|
4888
|
+
@ModelBase.register("Mamba2ForCausalLM")
|
|
4889
|
+
class Mamba2Model(TextModel):
|
|
4890
|
+
model_arch = gguf.MODEL_ARCH.MAMBA2
|
|
4891
|
+
|
|
4892
|
+
def __init__(self, dir_model: Path, *args, **kwargs):
|
|
4893
|
+
# Avoid using AutoConfig for hparams
|
|
4894
|
+
# It wrongly assumes all Mamba2 models are Mamba-Codestral-7B-v0.1
|
|
4895
|
+
hparams = kwargs.pop("hparams", None)
|
|
4896
|
+
if hparams is None:
|
|
4897
|
+
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
|
4898
|
+
hparams = json.load(f)
|
|
4899
|
+
super().__init__(dir_model, *args, hparams=hparams, **kwargs)
|
|
4900
|
+
self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
|
|
4901
|
+
self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model
|
|
4902
|
+
self.n_group = self.find_hparam(["n_groups"], optional=True) or 1
|
|
4903
|
+
|
|
4904
|
+
def set_vocab(self):
|
|
4905
|
+
vocab_size = self.hparams["vocab_size"]
|
|
4906
|
+
# Round vocab size to next multiple of 16
|
|
4907
|
+
pad_vocab = self.hparams.get("pad_vocab_size_multiple", 16)
|
|
4908
|
+
# pad using ceiling division
|
|
4909
|
+
# ref: https://stackoverflow.com/a/17511341/22827863
|
|
4910
|
+
vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
|
|
4911
|
+
self.hparams["vocab_size"] = vocab_size
|
|
4912
|
+
|
|
4913
|
+
if (self.dir_model / "tokenizer.model").is_file():
|
|
4914
|
+
self._set_vocab_sentencepiece()
|
|
4915
|
+
elif (self.dir_model / "tokenizer.model.v3").is_file():
|
|
4916
|
+
# mamba-codestral
|
|
4917
|
+
raise NotImplementedError(f"Please rename {self.dir_model / 'tokenizer.model.v3'} to {self.dir_model / 'tokenizer.model'}")
|
|
4918
|
+
elif (self.dir_model / "tokenizer.json").is_file():
|
|
4919
|
+
self._set_vocab_gpt2()
|
|
4920
|
+
else:
|
|
4921
|
+
# Use the GPT-NeoX tokenizer when no tokenizer files are present
|
|
4922
|
+
self._set_vocab_builtin("gpt-neox", vocab_size)
|
|
4923
|
+
|
|
4924
|
+
def set_gguf_parameters(self):
|
|
4925
|
+
d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
|
|
4926
|
+
d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128
|
|
4927
|
+
head_dim = self.find_hparam(["mamba_d_head", "head_dim"], optional=True) or 64
|
|
4928
|
+
|
|
4929
|
+
rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
|
|
4930
|
+
|
|
4931
|
+
# Fail early for models which don't have a block expansion factor of 2
|
|
4932
|
+
# TODO: does this really matter?
|
|
4933
|
+
# skip the assertion for FalconH1 Model
|
|
4934
|
+
if self.model_arch != gguf.MODEL_ARCH.FALCON_H1:
|
|
4935
|
+
assert self.d_inner == 2 * self.d_model
|
|
4936
|
+
assert self.d_inner % head_dim == 0
|
|
4937
|
+
|
|
4938
|
+
self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
|
|
4939
|
+
self.gguf_writer.add_embedding_length(self.d_model)
|
|
4940
|
+
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
|
|
4941
|
+
self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
|
|
4942
|
+
self.gguf_writer.add_block_count(self.block_count)
|
|
4943
|
+
self.gguf_writer.add_ssm_conv_kernel(d_conv)
|
|
4944
|
+
self.gguf_writer.add_ssm_inner_size(self.d_inner)
|
|
4945
|
+
self.gguf_writer.add_ssm_state_size(d_state)
|
|
4946
|
+
self.gguf_writer.add_ssm_time_step_rank(self.d_inner // head_dim)
|
|
4947
|
+
self.gguf_writer.add_ssm_group_count(self.n_group)
|
|
4948
|
+
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
|
|
4949
|
+
self.gguf_writer.add_file_type(self.ftype)
|
|
4950
|
+
|
|
4951
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
|
4952
|
+
|
|
4953
|
+
if name.startswith("model.backbone") or name.startswith("model.lm_head"):
|
|
4954
|
+
# map Mamba-Codestral-7B-v0.1 tensor names to the names used by Mamba-2
|
|
4955
|
+
name = name.removeprefix("model.")
|
|
4956
|
+
|
|
4957
|
+
if name.endswith(".dt_bias"):
|
|
4958
|
+
name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
|
|
4959
|
+
|
|
4960
|
+
new_name = self.map_tensor_name(name)
|
|
4961
|
+
|
|
4962
|
+
if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
|
|
4963
|
+
data_torch = data_torch.squeeze()
|
|
4964
|
+
elif any(self.match_model_tensor_name(new_name, t, bid, suffix="") for t in [
|
|
4965
|
+
gguf.MODEL_TENSOR.SSM_A,
|
|
4966
|
+
gguf.MODEL_TENSOR.SSM_D,
|
|
4967
|
+
]):
|
|
4968
|
+
# unsqueeze A to use similar shape semantics as Mamba-1
|
|
4969
|
+
# (D is also unsqueezed, but for more straightforward broadcast internally)
|
|
4970
|
+
data_torch = data_torch.reshape((*data_torch.shape, 1))
|
|
4971
|
+
elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid):
|
|
4972
|
+
data_torch = data_torch.reshape((self.n_group, self.d_inner // self.n_group))
|
|
4973
|
+
|
|
4974
|
+
if name.endswith(".A_log"):
|
|
4975
|
+
logger.debug("A_log --> A ==> " + new_name)
|
|
4976
|
+
data_torch = -torch.exp(data_torch)
|
|
4977
|
+
|
|
4978
|
+
yield (new_name, data_torch)
|
|
4979
|
+
|
|
4980
|
+
|
|
4981
|
+
@ModelBase.register("JambaForCausalLM")
|
|
4982
|
+
class JambaModel(TextModel):
|
|
4983
|
+
model_arch = gguf.MODEL_ARCH.JAMBA
|
|
4984
|
+
|
|
4985
|
+
def get_vocab_base_pre(self, tokenizer) -> str:
|
|
4986
|
+
del tokenizer # unused
|
|
4987
|
+
|
|
4988
|
+
return "gpt-2"
|
|
4989
|
+
|
|
4990
|
+
def set_vocab(self):
|
|
4991
|
+
if (self.dir_model / "tokenizer.model").is_file():
|
|
4992
|
+
# Using Jamba's tokenizer.json causes errors on model load
|
|
4993
|
+
# (something about "byte not found in vocab"),
|
|
4994
|
+
# but there's a working tokenizer.model
|
|
4995
|
+
self._set_vocab_sentencepiece()
|
|
4996
|
+
else:
|
|
4997
|
+
# Some Jamba models only have a tokenizer.json, which works.
|
|
4998
|
+
self._set_vocab_gpt2()
|
|
4999
|
+
|
|
5000
|
+
def set_gguf_parameters(self):
|
|
5001
|
+
d_model = self.find_hparam(["hidden_size", "mamba_d_model"])
|
|
5002
|
+
d_conv = self.find_hparam(["mamba_d_conv"], optional=True) or 4
|
|
5003
|
+
d_inner = self.hparams["mamba_expand"] * d_model
|
|
5004
|
+
d_state = self.find_hparam(["mamba_d_state"], optional=True) or 16
|
|
5005
|
+
# ceiling division
|
|
5006
|
+
# ref: https://stackoverflow.com/a/17511341/22827863
|
|
5007
|
+
# ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
|
|
5008
|
+
dt_rank = self.find_hparam(["mamba_dt_rank"], optional=True) or -(d_model // -16)
|
|
5009
|
+
rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-6
|
|
5010
|
+
n_kv_head = self.hparams["num_key_value_heads"]
|
|
5011
|
+
attn_offset = self.hparams["attn_layer_offset"]
|
|
5012
|
+
attn_period = self.hparams["attn_layer_period"]
|
|
5013
|
+
n_kv_vec = [0 for _ in range(attn_offset)] + [
|
|
5014
|
+
n_kv_head if (i - attn_offset) % attn_period == 0 else 0 for i in range(attn_offset, self.block_count)
|
|
5015
|
+
]
|
|
5016
|
+
|
|
5017
|
+
self.gguf_writer.add_block_count(self.block_count)
|
|
5018
|
+
self.gguf_writer.add_context_length(self.find_hparam(["max_position_embeddings", "n_ctx"]))
|
|
5019
|
+
self.gguf_writer.add_embedding_length(d_model)
|
|
5020
|
+
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
|
5021
|
+
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
|
|
5022
|
+
self.gguf_writer.add_head_count_kv(n_kv_vec)
|
|
5023
|
+
self.gguf_writer.add_ssm_conv_kernel(d_conv)
|
|
5024
|
+
self.gguf_writer.add_ssm_inner_size(d_inner)
|
|
5025
|
+
self.gguf_writer.add_ssm_state_size(d_state)
|
|
5026
|
+
self.gguf_writer.add_ssm_time_step_rank(dt_rank)
|
|
5027
|
+
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
|
|
5028
|
+
self.gguf_writer.add_expert_count(self.hparams["num_experts"])
|
|
5029
|
+
self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
|
|
5030
|
+
self.gguf_writer.add_file_type(self.ftype)
|
|
5031
|
+
|
|
5032
|
+
_experts: list[dict[str, Tensor]] | None = None
|
|
5033
|
+
|
|
5034
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
|
5035
|
+
|
|
5036
|
+
# Mini-Jamba
|
|
5037
|
+
name = name.replace(".moe.", ".feed_forward.")
|
|
5038
|
+
if bid is not None:
|
|
5039
|
+
moe_offset = self.hparams["expert_layer_offset"]
|
|
5040
|
+
moe_period = self.hparams["expert_layer_period"]
|
|
5041
|
+
|
|
5042
|
+
if not (bid >= moe_offset and (bid - moe_offset) % moe_period == 0):
|
|
5043
|
+
name = name.replace(".experts.0.", ".")
|
|
5044
|
+
|
|
5045
|
+
# process the experts separately
|
|
5046
|
+
if ".feed_forward.experts." in name:
|
|
5047
|
+
n_experts = self.hparams["num_experts"]
|
|
5048
|
+
|
|
5049
|
+
assert bid is not None
|
|
5050
|
+
|
|
5051
|
+
if self._experts is None:
|
|
5052
|
+
self._experts = [{} for _ in range(self.block_count)]
|
|
5053
|
+
|
|
5054
|
+
self._experts[bid][name] = data_torch
|
|
5055
|
+
|
|
5056
|
+
if len(self._experts[bid]) >= n_experts * 3:
|
|
5057
|
+
|
|
5058
|
+
# merge the experts into a single 3d tensor
|
|
5059
|
+
for wid in ["down_proj", "gate_proj", "up_proj"]:
|
|
5060
|
+
datas: list[Tensor] = []
|
|
5061
|
+
|
|
5062
|
+
for xid in range(n_experts):
|
|
5063
|
+
ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{wid}.weight"
|
|
5064
|
+
datas.append(self._experts[bid][ename])
|
|
5065
|
+
del self._experts[bid][ename]
|
|
5066
|
+
|
|
5067
|
+
data_torch = torch.stack(datas, dim=0)
|
|
5068
|
+
|
|
5069
|
+
# using the same merged name as qwen2moe
|
|
5070
|
+
merged_name = f"model.layers.{bid}.mlp.experts.{wid}.weight"
|
|
5071
|
+
|
|
5072
|
+
new_name = self.map_tensor_name(merged_name)
|
|
5073
|
+
|
|
5074
|
+
yield new_name, data_torch
|
|
5075
|
+
return
|
|
5076
|
+
|
|
5077
|
+
new_name = self.map_tensor_name(name)
|
|
5078
|
+
|
|
5079
|
+
if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
|
|
5080
|
+
data_torch = data_torch.squeeze()
|
|
5081
|
+
|
|
5082
|
+
if name.endswith(".A_log"):
|
|
5083
|
+
logger.debug("A_log --> A ==> " + new_name)
|
|
5084
|
+
data_torch = -torch.exp(data_torch)
|
|
5085
|
+
|
|
5086
|
+
yield (new_name, data_torch)
|
|
5087
|
+
|
|
5088
|
+
def prepare_tensors(self):
|
|
5089
|
+
super().prepare_tensors()
|
|
5090
|
+
|
|
5091
|
+
if self._experts is not None:
|
|
5092
|
+
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
|
5093
|
+
experts = [k for d in self._experts for k in d.keys()]
|
|
5094
|
+
if len(experts) > 0:
|
|
5095
|
+
raise ValueError(f"Unprocessed experts: {experts}")
|
|
5096
|
+
|
|
5097
|
+
|
|
4706
5098
|
@ModelBase.register("CohereForCausalLM")
|
|
4707
5099
|
class CommandR2Model(TextModel):
|
|
4708
5100
|
model_arch = gguf.MODEL_ARCH.COMMAND_R
|
|
@@ -4848,8 +5240,6 @@ class JinaBertV2Model(BertModel):
|
|
|
4848
5240
|
self.gguf_writer.add_token_type_count(2)
|
|
4849
5241
|
else:
|
|
4850
5242
|
raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
|
|
4851
|
-
self.gguf_writer.add_add_bos_token(True)
|
|
4852
|
-
self.gguf_writer.add_add_eos_token(True)
|
|
4853
5243
|
|
|
4854
5244
|
|
|
4855
5245
|
@ModelBase.register("OpenELMForCausalLM")
|
|
@@ -5451,9 +5841,6 @@ class T5Model(TextModel):
|
|
|
5451
5841
|
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
|
5452
5842
|
special_vocab.add_to_gguf(self.gguf_writer)
|
|
5453
5843
|
|
|
5454
|
-
self.gguf_writer.add_add_bos_token(False)
|
|
5455
|
-
self.gguf_writer.add_add_eos_token(True)
|
|
5456
|
-
|
|
5457
5844
|
def set_gguf_parameters(self):
|
|
5458
5845
|
if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
|
|
5459
5846
|
logger.warning("Couldn't find context length in config.json, assuming default value of 512")
|
|
@@ -5591,9 +5978,6 @@ class T5EncoderModel(TextModel):
|
|
|
5591
5978
|
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
|
5592
5979
|
special_vocab.add_to_gguf(self.gguf_writer)
|
|
5593
5980
|
|
|
5594
|
-
self.gguf_writer.add_add_bos_token(False)
|
|
5595
|
-
self.gguf_writer.add_add_eos_token(True)
|
|
5596
|
-
|
|
5597
5981
|
def set_gguf_parameters(self):
|
|
5598
5982
|
if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
|
|
5599
5983
|
logger.warning("Couldn't find context length in config.json, assuming default value of 512")
|
|
@@ -6072,18 +6456,148 @@ class GraniteMoeModel(GraniteModel):
|
|
|
6072
6456
|
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
|
|
6073
6457
|
]
|
|
6074
6458
|
|
|
6459
|
+
has_experts = bool(self.hparams.get('num_local_experts'))
|
|
6460
|
+
|
|
6075
6461
|
if name.endswith("shared_mlp.input_linear.weight"):
|
|
6076
6462
|
ffn_dim = self.hparams["shared_intermediate_size"]
|
|
6077
6463
|
assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size"
|
|
6078
6464
|
gate, up = data_torch.split(ffn_dim, dim=-2)
|
|
6465
|
+
if has_experts:
|
|
6466
|
+
return [
|
|
6467
|
+
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
|
|
6468
|
+
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
|
|
6469
|
+
]
|
|
6470
|
+
return [
|
|
6471
|
+
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), gate),
|
|
6472
|
+
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), up),
|
|
6473
|
+
]
|
|
6474
|
+
|
|
6475
|
+
if not has_experts and name.endswith("shared_mlp.output_linear.weight"):
|
|
6079
6476
|
return [
|
|
6080
|
-
(self.format_tensor_name(gguf.MODEL_TENSOR.
|
|
6081
|
-
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
|
|
6477
|
+
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), data_torch)
|
|
6082
6478
|
]
|
|
6083
6479
|
|
|
6084
6480
|
return super().modify_tensors(data_torch, name, bid)
|
|
6085
6481
|
|
|
6086
6482
|
|
|
6483
|
+
@ModelBase.register("GraniteMoeHybridForCausalLM", "BambaForCausalLM")
|
|
6484
|
+
class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
|
|
6485
|
+
"""GraniteHybrid is a hybrid SSM + Attention model that uses Mamba2 SSM
|
|
6486
|
+
layers and optionally uses MoE w/ a shared expert"""
|
|
6487
|
+
model_arch = gguf.MODEL_ARCH.GRANITE_HYBRID
|
|
6488
|
+
undo_permute = True
|
|
6489
|
+
|
|
6490
|
+
def __init__(self, *args, **kwargs):
|
|
6491
|
+
|
|
6492
|
+
# Hybrid mamba models use a prefix for the mamba-specific params.
|
|
6493
|
+
# TODO: Extend this if the prefix(es) need to be configurable
|
|
6494
|
+
self.hparam_prefixes = ["mamba"]
|
|
6495
|
+
|
|
6496
|
+
super().__init__(*args, **kwargs)
|
|
6497
|
+
|
|
6498
|
+
# Lists of which layers use ssm vs attention
|
|
6499
|
+
self._attn_layers = self.get_attn_layers()
|
|
6500
|
+
self._ssm_layers = [
|
|
6501
|
+
i for i in range(self.block_count)
|
|
6502
|
+
if i not in self._attn_layers
|
|
6503
|
+
]
|
|
6504
|
+
|
|
6505
|
+
# n_group and d_inner are used during reshape_tensors for mamba2
|
|
6506
|
+
self.d_model = self.find_hparam(["hidden_size", "d_model"])
|
|
6507
|
+
self.n_group = self.find_hparam(["n_groups"])
|
|
6508
|
+
self.d_inner = self.find_hparam(["expand"]) * self.d_model
|
|
6509
|
+
|
|
6510
|
+
def get_attn_layers(self):
|
|
6511
|
+
# Explicit list of layer type names
|
|
6512
|
+
if layer_types := self.hparams.get("layer_types"):
|
|
6513
|
+
return [
|
|
6514
|
+
i for i, typ in enumerate(layer_types)
|
|
6515
|
+
if typ == "attention"
|
|
6516
|
+
]
|
|
6517
|
+
|
|
6518
|
+
# Layer types indicated by index or period
|
|
6519
|
+
attn_layers = self.hparams.get("attn_layer_indices", [])
|
|
6520
|
+
if not attn_layers:
|
|
6521
|
+
attn_period = self.hparams.get("attn_layer_period")
|
|
6522
|
+
assert attn_period, "Didn't find attn_layer_indices or attn_layer_period"
|
|
6523
|
+
attn_offset = self.hparams.get("attn_layer_offset")
|
|
6524
|
+
assert attn_offset is not None, "No attention layer offset set with attn_layer_period"
|
|
6525
|
+
attn_layers = [
|
|
6526
|
+
i for i in range(self.block_count)
|
|
6527
|
+
if i % attn_period == attn_offset
|
|
6528
|
+
]
|
|
6529
|
+
return attn_layers
|
|
6530
|
+
|
|
6531
|
+
def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any:
|
|
6532
|
+
prefixed = []
|
|
6533
|
+
for pfx in self.hparam_prefixes:
|
|
6534
|
+
prefixed.extend(
|
|
6535
|
+
"_".join([pfx, k])
|
|
6536
|
+
for k in keys
|
|
6537
|
+
)
|
|
6538
|
+
keys = list(keys) + prefixed
|
|
6539
|
+
return Mamba2Model.find_hparam(self, keys, *args, **kwargs)
|
|
6540
|
+
|
|
6541
|
+
def modify_tensors(
|
|
6542
|
+
self, data_torch: Tensor, name: str, bid: int | None
|
|
6543
|
+
) -> Iterable[tuple[str, Tensor]]:
|
|
6544
|
+
if (
|
|
6545
|
+
name.endswith("block_sparse_moe.input_linear.weight")
|
|
6546
|
+
or "shared_mlp" in name
|
|
6547
|
+
):
|
|
6548
|
+
return GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
|
|
6549
|
+
|
|
6550
|
+
# Determine whether this is a mamba layer or an attention layer
|
|
6551
|
+
if bid in self._ssm_layers:
|
|
6552
|
+
return Mamba2Model.modify_tensors(self, data_torch, name, bid)
|
|
6553
|
+
elif bid in self._attn_layers:
|
|
6554
|
+
return GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
|
|
6555
|
+
return [(self.map_tensor_name(name), data_torch)]
|
|
6556
|
+
|
|
6557
|
+
def set_gguf_parameters(self):
|
|
6558
|
+
"""This method merges params from both parents and some that are
|
|
6559
|
+
specific to this model. The result is some duplication of how the params
|
|
6560
|
+
get set. The following warnings are expected during conversion:
|
|
6561
|
+
|
|
6562
|
+
WARNING:Duplicated key name 'granitehybrid.attention.head_count_kv'
|
|
6563
|
+
WARNING:Duplicated key name 'granitehybrid.context_length'
|
|
6564
|
+
"""
|
|
6565
|
+
GraniteMoeModel.set_gguf_parameters(self)
|
|
6566
|
+
|
|
6567
|
+
## Mamba mixer params ##
|
|
6568
|
+
self.gguf_writer.add_ssm_conv_kernel(self.find_hparam(["conv_kernel", "d_conv"]))
|
|
6569
|
+
self.gguf_writer.add_ssm_state_size(self.find_hparam(["state_size", "d_state"]))
|
|
6570
|
+
self.gguf_writer.add_ssm_group_count(self.n_group)
|
|
6571
|
+
self.gguf_writer.add_ssm_inner_size(self.d_inner)
|
|
6572
|
+
# NOTE: The mamba_dt_rank is _not_ the right field for how this is used
|
|
6573
|
+
# in llama.cpp
|
|
6574
|
+
self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["n_heads"]))
|
|
6575
|
+
|
|
6576
|
+
## Attention params ##
|
|
6577
|
+
head_count_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
|
|
6578
|
+
head_count_kv_vec = [
|
|
6579
|
+
head_count_kv if i in self._attn_layers else 0 for i in range(self.block_count)
|
|
6580
|
+
]
|
|
6581
|
+
if rope_dim := self.hparams.get("attn_rotary_emb"):
|
|
6582
|
+
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
|
6583
|
+
self.gguf_writer.add_head_count_kv(head_count_kv_vec)
|
|
6584
|
+
|
|
6585
|
+
## If Bamba, use rope, otherwise don't
|
|
6586
|
+
use_rope = "BambaForCausalLM" in self.hparams["architectures"]
|
|
6587
|
+
self.gguf_writer.add_rope_scaling_finetuned(use_rope)
|
|
6588
|
+
if not use_rope:
|
|
6589
|
+
self.gguf_writer.add_context_length(2**20)
|
|
6590
|
+
|
|
6591
|
+
## Validation ##
|
|
6592
|
+
d_head = self.find_hparam(["d_head"], optional=True) or 64
|
|
6593
|
+
assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported"
|
|
6594
|
+
assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}"
|
|
6595
|
+
|
|
6596
|
+
def set_vocab(self):
|
|
6597
|
+
self.hparams["pad_vocab_size_multiple"] = 8
|
|
6598
|
+
Mamba2Model.set_vocab(self)
|
|
6599
|
+
|
|
6600
|
+
|
|
6087
6601
|
@ModelBase.register("BailingMoeForCausalLM")
|
|
6088
6602
|
class BailingMoeModel(TextModel):
|
|
6089
6603
|
model_arch = gguf.MODEL_ARCH.BAILINGMOE
|
|
@@ -6292,6 +6806,321 @@ class UltravoxWhisperEncoderModel(WhisperEncoderModel):
|
|
|
6292
6806
|
super().set_gguf_parameters()
|
|
6293
6807
|
self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
|
|
6294
6808
|
|
|
6809
|
+
|
|
6810
|
+
@ModelBase.register("FalconH1ForCausalLM")
|
|
6811
|
+
class FalconH1Model(Mamba2Model):
|
|
6812
|
+
model_arch = gguf.MODEL_ARCH.FALCON_H1
|
|
6813
|
+
|
|
6814
|
+
def __init__(self, *args, **kwargs):
|
|
6815
|
+
# Set the hparam prefixes for Falcon Mamba2
|
|
6816
|
+
self.hparam_prefixes = ["mamba"]
|
|
6817
|
+
|
|
6818
|
+
# Initialize the base Mamba2Model
|
|
6819
|
+
super().__init__(*args, **kwargs)
|
|
6820
|
+
|
|
6821
|
+
# Use Llama conversion for attention
|
|
6822
|
+
self._transformer_model_class = LlamaModel
|
|
6823
|
+
|
|
6824
|
+
# n_group and d_inner are used during reshape_tensors for mamba2
|
|
6825
|
+
self.n_group = self.find_hparam(["n_groups"])
|
|
6826
|
+
self.d_inner = self.find_hparam(["mamba_d_ssm"])
|
|
6827
|
+
self.d_head = self.find_hparam(["d_head"])
|
|
6828
|
+
|
|
6829
|
+
# Initialize any Falcon Mamba2 specific attributes
|
|
6830
|
+
self.has_attention = True # Falcon Mamba2 has attention components
|
|
6831
|
+
|
|
6832
|
+
# Load Falcon-H1 multipliers from hyperparameters
|
|
6833
|
+
self.attention_in_multiplier = self.find_hparam(["attention_in_multiplier"], optional=True)
|
|
6834
|
+
self.attention_out_multiplier = self.find_hparam(["attention_out_multiplier"], optional=True)
|
|
6835
|
+
self.ssm_in_multiplier = self.find_hparam(["ssm_in_multiplier"], optional=True)
|
|
6836
|
+
self.ssm_out_multiplier = self.find_hparam(["ssm_out_multiplier"], optional=True)
|
|
6837
|
+
self.mlp_multipliers = self.find_hparam(["mlp_multipliers"], optional=True)
|
|
6838
|
+
self.ssm_multipliers = self.find_hparam(["ssm_multipliers"], optional=True)
|
|
6839
|
+
self.intermediate_size = self.find_hparam(["intermediate_size"])
|
|
6840
|
+
self.key_multiplier = self.find_hparam(["key_multiplier"], optional=True)
|
|
6841
|
+
|
|
6842
|
+
def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any:
|
|
6843
|
+
prefixed = []
|
|
6844
|
+
for pfx in self.hparam_prefixes:
|
|
6845
|
+
prefixed.extend(
|
|
6846
|
+
"_".join([pfx, k])
|
|
6847
|
+
for k in keys
|
|
6848
|
+
)
|
|
6849
|
+
keys = list(keys) + prefixed
|
|
6850
|
+
return super().find_hparam(keys, *args, **kwargs)
|
|
6851
|
+
|
|
6852
|
+
def set_vocab(self):
|
|
6853
|
+
self._set_vocab_gpt2()
|
|
6854
|
+
|
|
6855
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
|
6856
|
+
tensors = list(super().modify_tensors(data_torch, name, bid))
|
|
6857
|
+
tensor = tensors[0][1]
|
|
6858
|
+
|
|
6859
|
+
if "down_proj" in name:
|
|
6860
|
+
tensor = tensor * self.mlp_multipliers[1]
|
|
6861
|
+
elif "gate_proj" in name:
|
|
6862
|
+
tensor = tensor * self.mlp_multipliers[0]
|
|
6863
|
+
elif "k_proj" in name:
|
|
6864
|
+
tensor = tensor * self.key_multiplier * self.attention_in_multiplier
|
|
6865
|
+
elif "q_proj" in name:
|
|
6866
|
+
tensor = tensor * self.attention_in_multiplier
|
|
6867
|
+
elif "v_proj" in name:
|
|
6868
|
+
tensor = tensor * self.attention_in_multiplier
|
|
6869
|
+
elif "o_proj" in name:
|
|
6870
|
+
tensor = tensor * self.attention_out_multiplier
|
|
6871
|
+
elif "out_proj" in name:
|
|
6872
|
+
tensor = tensor * self.ssm_out_multiplier
|
|
6873
|
+
elif "in_proj" in name:
|
|
6874
|
+
tensor = tensor * self.ssm_in_multiplier
|
|
6875
|
+
zxbcdt_multipliers = self.hparams["ssm_multipliers"]
|
|
6876
|
+
intermediate_size = self.hparams["mamba_d_ssm"]
|
|
6877
|
+
groups_time_state_size = self.hparams["mamba_n_groups"] * self.hparams["mamba_d_state"]
|
|
6878
|
+
tensor[:intermediate_size, :] *= zxbcdt_multipliers[0]
|
|
6879
|
+
tensor[intermediate_size:2 * intermediate_size, :] *= zxbcdt_multipliers[1]
|
|
6880
|
+
tensor[2 * intermediate_size:2 * intermediate_size + groups_time_state_size, :] *= zxbcdt_multipliers[2]
|
|
6881
|
+
tensor[2 * intermediate_size + groups_time_state_size:2 * intermediate_size + 2 * groups_time_state_size, :] *= zxbcdt_multipliers[3]
|
|
6882
|
+
tensor[2 * intermediate_size + 2 * groups_time_state_size:, :] *= zxbcdt_multipliers[4]
|
|
6883
|
+
elif "lm_head" in name:
|
|
6884
|
+
tensor = tensor * self.hparams["lm_head_multiplier"]
|
|
6885
|
+
elif "embed_tokens" in name:
|
|
6886
|
+
tensor = tensor * self.hparams["embedding_multiplier"]
|
|
6887
|
+
elif "mamba.norm" in name:
|
|
6888
|
+
tensor = tensor.reshape(self.n_group, self.d_inner // self.n_group)
|
|
6889
|
+
|
|
6890
|
+
tensors = [(tensors[0][0], tensor)]
|
|
6891
|
+
return tensors
|
|
6892
|
+
|
|
6893
|
+
def set_gguf_parameters(self):
    """Emit GGUF metadata, layering model-specific values over the parent's.

    The parent implementation runs first; the values written afterwards
    replace or extend what it produced (vocabulary size, block count,
    context length, feed-forward length, attention head geometry and the
    RoPE frequency base).

    Raises:
        AssertionError: if ``hidden_act`` is set to anything other than
            ``"silu"``, or if ``d_inner`` is not a multiple of ``d_head``.
    """
    super().set_gguf_parameters()

    writer = self.gguf_writer
    hp = self.hparams

    ## General Params ##
    writer.add_vocab_size(hp["vocab_size"])
    # Override some Mamba2 defaults
    writer.add_block_count(self.block_count)
    # A missing "max_position_embeddings" is recorded as a context length of 0.
    writer.add_context_length(hp.get("max_position_embeddings", 0))
    writer.add_feed_forward_length(hp["intermediate_size"])

    ## Attention params ##
    writer.add_head_count(hp["num_attention_heads"])  # Override value 0 from Mamba2
    writer.add_head_count_kv(hp["num_key_value_heads"])
    # Key and value vectors share the same per-head width.
    head_dim = hp["head_dim"]
    writer.add_key_length(head_dim)
    writer.add_value_length(head_dim)

    ## Validation ##
    assert hp.get("hidden_act") in [None, "silu"], "Only SILU activation supported"
    assert self.d_inner % self.d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {self.d_head}"

    # Add any other Falcon Mamba2 specific configuration
    writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
|
|
6915
|
+
|
|
6916
|
+
|
|
6917
|
+
@ModelBase.register("HunYuanMoEV1ForCausalLM")
class HunYuanMoEModel(TextModel):
    """Converter for ``HunYuanMoEV1ForCausalLM`` checkpoints (GGUF arch ``HUNYUAN_MOE``).

    Responsibilities visible in this class:

    * build the tokenizer vocabulary and BPE merges from the tokenizer's
      ``mergeable_ranks`` table,
    * write the MoE and RoPE hyper-parameters,
    * collect the per-expert weights of each block and stack them into one
      3-D tensor per projection.
    """

    model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE

    # Per-block staging dicts (tensor name -> tensor) for expert weights that
    # have been seen but not yet merged. Created lazily on the first expert tensor.
    _experts: list[dict[str, Tensor]] | None = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # For handling tied embeddings
        self._tok_embd = None

    def set_vocab(self):
        """Write tokenizer model, token list, token types, merges and special tokens.

        Raises:
            AssertionError: if the tokenizer's ``vocab_size`` differs from the
                ``vocab_size`` hyper-parameter.
        """
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)

        # 1. Get the pre-tokenizer identifier hash
        tokpre = self.get_vocab_base_pre(tokenizer)

        # 2. Reverse-engineer the merges list from mergeable_ranks
        merges = []
        vocab = {}
        mergeable_ranks = tokenizer.mergeable_ranks
        for token, rank in mergeable_ranks.items():
            vocab[QwenModel.token_bytes_to_string(token)] = rank
            if len(token) == 1:
                # Single-byte tokens cannot be the result of a merge.
                continue
            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
            if len(merged) == 2:  # todo this is an assert in Qwen, why?
                merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))

        # 3. Generate the tokens and toktypes lists
        vocab_size = self.hparams["vocab_size"]
        assert tokenizer.vocab_size == vocab_size
        special_tokens = tokenizer.special_tokens
        # Built once so the per-id membership test below is O(1) instead of a
        # linear scan over all special tokens for every vocabulary entry.
        special_ids = set(special_tokens.values())
        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
        tokens: list[str] = []
        toktypes: list[int] = []
        for i in range(vocab_size):
            if i not in reverse_vocab:
                # Ids with no token get a placeholder so indices stay aligned.
                tokens.append(f"[PAD{i}]")
                toktypes.append(gguf.TokenType.UNUSED)
            else:
                tokens.append(reverse_vocab[i])
                if i in special_ids:
                    toktypes.append(gguf.TokenType.CONTROL)
                else:
                    toktypes.append(gguf.TokenType.NORMAL)

        # 4. Write all vocab-related fields to the GGUF writer
        self.gguf_writer.add_tokenizer_model("gpt2")
        self.gguf_writer.add_tokenizer_pre(tokpre)
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)
        self.gguf_writer.add_token_merges(merges)

        # 5. Add special tokens and chat templates
        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
        special_vocab.add_to_gguf(self.gguf_writer)
        # FIX for BOS token: Overwrite incorrect id read from config.json
        self.gguf_writer.add_bos_token_id(127959)  # <|bos|>

    def set_gguf_parameters(self):
        """Write MoE and RoPE hyper-parameters on top of the parent's output.

        ``moe_intermediate_size``, ``moe_topk`` and ``num_shared_expert`` are
        indexed as sequences and must hold one uniform value; only that value
        is written.

        Raises:
            AssertionError: if one of those sequences is not uniform, or if the
                dynamic-RoPE assumptions (alpha, base, head dim, configured
                context length) do not hold.
        """
        super().set_gguf_parameters()
        hparams = self.hparams

        self.gguf_writer.add_expert_count(hparams["num_experts"])
        self.gguf_writer.add_expert_shared_feed_forward_length(hparams["intermediate_size"])

        moe_intermediate_size = hparams["moe_intermediate_size"]
        assert all(n == moe_intermediate_size[0] for n in moe_intermediate_size)
        self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size[0])

        moe_topk = hparams["moe_topk"]
        assert all(topk == moe_topk[0] for topk in moe_topk)
        self.gguf_writer.add_expert_used_count(moe_topk[0])

        moe_shared_expert = hparams["num_shared_expert"]
        assert all(n == moe_shared_expert[0] for n in moe_shared_expert)
        self.gguf_writer.add_expert_shared_count(moe_shared_expert[0])

        # Rope
        # `or {}` also covers an explicit `"rope_scaling": null` in the config,
        # for which dict.get() would return None rather than the default.
        rope_scaling = hparams.get("rope_scaling") or {}
        if rope_scaling.get("type") == "dynamic":
            # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
            # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
            alpha = rope_scaling.get("alpha", 1000)
            base = hparams.get("rope_theta", 10000.0)
            dim = (hparams["hidden_size"] // hparams["num_attention_heads"])  # 128
            scaled_base = base * (alpha ** (dim / (dim - 2)))  # 10000 * (1000 ** (128 / 126)) = 11158839.9251
            # The scaling is folded into the frequency base, so no runtime
            # scaling type or factor is recorded.
            self.gguf_writer.add_rope_freq_base(scaled_base)
            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
            self.gguf_writer.add_rope_scaling_factor(1)
            # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
            self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024)  # 256k context length
            self.gguf_writer.add_context_length(256 * 1024)  # 256k context length

            # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
            assert alpha == 1000 and base == 10000.0 and dim == 128 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024], \
                "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Map one checkpoint tensor to zero or more GGUF tensors.

        Args:
            data_torch: the tensor as loaded from the checkpoint.
            name: the checkpoint tensor name.
            bid: block index; must not be ``None`` for expert tensors.

        Returns:
            * ``[]`` for ``lm_head.weight`` when ``tie_word_embeddings`` is set,
              and for expert tensors while a block's set is still incomplete;
            * the three stacked expert tensors once a block holds
              ``num_experts * 3`` expert weights;
            * otherwise a single ``(mapped_name, tensor)`` pair.
        """
        if name == "model.embed_tokens.weight":
            # NOTE(review): this copy is stored but never read within this class.
            self._tok_embd = data_torch.clone()

        if name == "lm_head.weight":
            if self.hparams.get("tie_word_embeddings", False):
                logger.info("Skipping tied output layer 'lm_head.weight'")
                return []

        if name.find("mlp.experts") != -1:
            n_experts = self.hparams["num_experts"]
            assert bid is not None

            if self._experts is None:
                self._experts = [{} for _ in range(self.block_count)]

            self._experts[bid][name] = data_torch

            if len(self._experts[bid]) >= n_experts * 3:
                # merge the experts into a single 3d tensor
                tensors: list[tuple[str, Tensor]] = []
                for w_name in ["down_proj", "gate_proj", "up_proj"]:
                    datas: list[Tensor] = []

                    for xid in range(n_experts):
                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
                        # pop() hands back the staged tensor and removes it, so
                        # prepare_tensors() can detect leftovers afterwards.
                        datas.append(self._experts[bid].pop(ename))

                    data_torch = torch.stack(datas, dim=0)
                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
                    new_name = self.map_tensor_name(merged_name)
                    tensors.append((new_name, data_torch))

                return tensors
            else:
                return []

        return [(self.map_tensor_name(name), data_torch)]

    def prepare_tensors(self):
        """Run the parent's tensor preparation, then verify no expert was left unmerged.

        Raises:
            ValueError: if any staged expert tensor remains after preparation.
        """
        super().prepare_tensors()
        if self._experts is not None:
            experts = [k for d in self._experts for k in d.keys()]
            if len(experts) > 0:
                raise ValueError(f"Unprocessed experts: {experts}")
|
|
7064
|
+
|
|
7065
|
+
|
|
7066
|
+
@ModelBase.register("SmolLM3ForCausalLM")
class SmolLM3Model(LlamaModel):
    """Converter for ``SmolLM3ForCausalLM`` checkpoints (GGUF arch ``SMOLLM3``)."""

    model_arch = gguf.MODEL_ARCH.SMOLLM3

    def set_vocab(self):
        """Write the parent's vocabulary, then a chat template with ``[:]`` removed.

        When the tokenizer has no chat template, nothing is written beyond
        what the parent produced.
        """
        super().set_vocab()

        from transformers import AutoTokenizer
        template = AutoTokenizer.from_pretrained(self.dir_model).chat_template
        if template is None:
            return

        # remove unsupported array slicing in chat template
        # ref: https://huggingface.co/ggml-org/SmolLM3-3B-GGUF/discussions/1
        self.gguf_writer.add_chat_template(template.replace("[:]", ""))
|
|
7079
|
+
|
|
7080
|
+
|
|
7081
|
+
@ModelBase.register("Lfm2ForCausalLM")
@ModelBase.register("LFM2ForCausalLM")
class LFM2Model(TextModel):
    """Converter for ``Lfm2ForCausalLM`` / ``LFM2ForCausalLM`` checkpoints (GGUF arch ``LFM2``)."""

    model_arch = gguf.MODEL_ARCH.LFM2

    def _add_feed_forward_length(self):
        """Derive the feed-forward width from the ``block_*`` hparams and write it.

        With ``block_auto_adjust_ff_dim`` set, ``block_ff_dim`` is scaled by
        2/3, optionally by ``block_ffn_dim_multiplier``, and then rounded up to
        a multiple of ``block_multiple_of``. Otherwise it is written as-is.
        """
        ff_dim = self.hparams["block_ff_dim"]
        auto_adjust_ff_dim = self.hparams["block_auto_adjust_ff_dim"]
        ffn_dim_multiplier = self.hparams["block_ffn_dim_multiplier"]
        multiple_of = self.hparams["block_multiple_of"]

        if auto_adjust_ff_dim:
            ff_dim = int(2 * ff_dim / 3)
            # custom dim factor multiplier
            if ffn_dim_multiplier is not None:
                ff_dim = int(ffn_dim_multiplier * ff_dim)
            # Round up to the next multiple of `multiple_of`.
            ff_dim = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of)

        self.gguf_writer.add_feed_forward_length(ff_dim)

    def set_gguf_parameters(self):
        """Write LFM2 metadata on top of the parent's output.

        ``num_key_value_heads`` is expanded in ``self.hparams`` to one entry
        per layer before the parent runs: the configured count for
        ``"full_attention"`` layers and 0 for every other layer type.
        """
        # set num_key_value_heads only for attention layers
        n_kv_heads = self.hparams["num_key_value_heads"]
        # Expand only a scalar value: re-expanding an existing per-layer list
        # (e.g. on a second call) would nest lists inside the list.
        if not isinstance(n_kv_heads, list):
            self.hparams["num_key_value_heads"] = [
                n_kv_heads if layer_type == "full_attention" else 0
                for layer_type in self.hparams["layer_types"]
            ]

        super().set_gguf_parameters()
        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
        self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["norm_eps"])
        self._add_feed_forward_length()

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Return the tensor under its mapped GGUF name.

        Tensors whose name contains ``conv.conv`` have dimension 1 squeezed
        out first; all others pass through unchanged. ``bid`` is unused here.
        """
        # conv op requires 2d tensor
        if 'conv.conv' in name:
            data_torch = data_torch.squeeze(1)

        return [(self.map_tensor_name(name), data_torch)]
|
|
7122
|
+
|
|
7123
|
+
|
|
6295
7124
|
###### CONVERSION LOGIC ######
|
|
6296
7125
|
|
|
6297
7126
|
|
|
@@ -6471,12 +7300,20 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
|
|
|
6471
7300
|
# maybe we should fallback to text model's arch in that case, since not many models have both
|
|
6472
7301
|
text_config = hparams.get("text_config", {})
|
|
6473
7302
|
vision_config = hparams.get("vision_config", {})
|
|
6474
|
-
arch =
|
|
7303
|
+
arch = None
|
|
7304
|
+
if (arches := hparams.get("architectures")) is not None and len(arches) > 0:
|
|
7305
|
+
arch = arches[0]
|
|
7306
|
+
elif "ssm_cfg" in hparams:
|
|
7307
|
+
# For non-hf Mamba and Mamba2 models
|
|
7308
|
+
arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM"
|
|
7309
|
+
|
|
6475
7310
|
# if "architectures" is found in the sub-config, use that instead
|
|
6476
7311
|
if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
|
|
6477
7312
|
arch = text_config["architectures"][0]
|
|
6478
7313
|
elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None:
|
|
6479
7314
|
arch = vision_config["architectures"][0]
|
|
7315
|
+
if arch is None:
|
|
7316
|
+
raise ValueError("Failed to detect model architecture")
|
|
6480
7317
|
return arch
|
|
6481
7318
|
|
|
6482
7319
|
|