@novastera-oss/llamarn 0.2.9 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/build.gradle +2 -1
- package/android/proguard-rules.pro +12 -0
- package/android/src/main/cpp/include/llama.h +15 -47
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +0 -1
- package/cpp/llama.cpp/CMakePresets.json +11 -0
- package/cpp/llama.cpp/CODEOWNERS +1 -0
- package/cpp/llama.cpp/README.md +8 -8
- package/cpp/llama.cpp/build-xcframework.sh +1 -1
- package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
- package/cpp/llama.cpp/common/arg.cpp +62 -1
- package/cpp/llama.cpp/common/chat.cpp +37 -20
- package/cpp/llama.cpp/common/chat.h +2 -0
- package/cpp/llama.cpp/common/common.cpp +22 -6
- package/cpp/llama.cpp/common/common.h +22 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +1250 -43
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +21 -13
- package/cpp/llama.cpp/ggml/CMakeLists.txt +13 -3
- package/cpp/llama.cpp/ggml/cmake/ggml-config.cmake.in +85 -47
- package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +173 -10
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-alloc.c +0 -15
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +7 -8
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +44 -38
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +126 -8
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +130 -22
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +138 -18
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +11 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +28 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +109 -12
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +88 -10
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1206 -163
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +36 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +142 -9
- package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +31 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +86 -17
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +225 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +41 -301
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +85 -64
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +47 -60
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +29 -42
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +46 -59
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +36 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +38 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +23 -36
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +3 -13
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +255 -99
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +111 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1152 -695
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +92 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
- package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +275 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cuh +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +104 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +27 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +80 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +48 -12
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +572 -106
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +599 -105
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +18 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +800 -42
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +4 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +693 -1034
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +14 -26
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +191 -55
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +8 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +15 -18
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +2 -6
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +991 -307
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +265 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +59 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +18 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +84 -9
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +907 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +35 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +56 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +386 -67
- package/cpp/llama.cpp/ggml/src/gguf.cpp +8 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +307 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +8 -2
- package/cpp/llama.cpp/gguf-py/gguf/metadata.py +4 -0
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_dump.py +24 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +122 -47
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +12 -3
- package/cpp/llama.cpp/include/llama.h +15 -47
- package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +34 -0
- package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +43 -0
- package/cpp/llama.cpp/requirements/requirements-all.txt +1 -0
- package/cpp/llama.cpp/requirements/requirements-server-bench.txt +5 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +316 -3
- package/cpp/llama.cpp/src/llama-arch.h +23 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +103 -71
- package/cpp/llama.cpp/src/llama-batch.h +31 -18
- package/cpp/llama.cpp/src/llama-chat.cpp +58 -1
- package/cpp/llama.cpp/src/llama-chat.h +3 -0
- package/cpp/llama.cpp/src/llama-context.cpp +180 -106
- package/cpp/llama.cpp/src/llama-context.h +26 -16
- package/cpp/llama.cpp/src/llama-cparams.h +3 -2
- package/cpp/llama.cpp/src/llama-graph.cpp +310 -211
- package/cpp/llama.cpp/src/llama-graph.h +184 -122
- package/cpp/llama.cpp/src/llama-hparams.cpp +47 -1
- package/cpp/llama.cpp/src/llama-hparams.h +13 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +38 -22
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +7 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +849 -304
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +143 -47
- package/cpp/llama.cpp/src/llama-kv-cells.h +62 -10
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +10 -4
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +3 -1
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +36 -11
- package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
- package/cpp/llama.cpp/src/llama-memory.h +3 -0
- package/cpp/llama.cpp/src/llama-model.cpp +3545 -719
- package/cpp/llama.cpp/src/llama-model.h +21 -4
- package/cpp/llama.cpp/src/llama-quant.cpp +2 -2
- package/cpp/llama.cpp/src/llama-vocab.cpp +376 -10
- package/cpp/llama.cpp/src/llama-vocab.h +43 -0
- package/cpp/llama.cpp/src/unicode.cpp +207 -0
- package/cpp/llama.cpp/src/unicode.h +2 -0
- package/ios/include/chat.h +2 -0
- package/ios/include/common.h +22 -4
- package/ios/include/llama.h +15 -47
- package/ios/libs/llama.xcframework/Info.plist +13 -13
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -4890
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -4861
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3764
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -4861
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4016 -3766
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -4890
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -4861
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3764
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5303 -4926
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5274 -4897
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4044 -3794
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +4 -4
- package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
- package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71

package/cpp/llama.cpp/convert_hf_to_gguf.py

@@ -300,6 +300,7 @@ class ModelBase:
             gguf.MODEL_TENSOR.POS_EMBD,
             gguf.MODEL_TENSOR.TOKEN_TYPES,
             gguf.MODEL_TENSOR.SSM_CONV1D,
+            gguf.MODEL_TENSOR.SHORTCONV_CONV,
             gguf.MODEL_TENSOR.TIME_MIX_FIRST,
             gguf.MODEL_TENSOR.TIME_MIX_W1,
             gguf.MODEL_TENSOR.TIME_MIX_W2,
@@ -668,6 +669,36 @@ class TextModel(ModelBase):
         # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
         # or pull the latest version of the model from Huggingface
         # don't edit the hashes manually!
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
+        if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
+        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
+            res = "glm4"
+        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
+            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
+            res = "minerva-7b"
+        if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664":
+            # ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct
+            res = "hunyuan"
+        if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6":
+            # ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base
+            res = "falcon-h1"
+        if chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86":
+            # ref: https://huggingface.co/tiiuae/Falcon-H1-1B-Base
+            res = "falcon-h1"
+        if chkhsh == "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896":
+            # ref: https://huggingface.co/tiiuae/Falcon-H1-7B-Base
+            res = "falcon-h1"
+        if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b":
+            # ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base
+            res = "falcon-h1"
+        if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890":
+            # ref: https://huggingface.co/moonshotai/Kimi-K2-Base
+            res = "kimi-k2"
         if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
             res = "llama-bpe"
@@ -803,18 +834,18 @@ class TextModel(ModelBase):
         if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec":
             # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
             res = "seed-coder"
-        if chkhsh == "
-            # ref: https://huggingface.co/
-            res = "
-        if chkhsh == "
-            # ref: https://huggingface.co/
-            res = "
-        if chkhsh == "
-            # ref: https://huggingface.co/
-            res = "
-        if chkhsh == "
-            # ref: https://huggingface.co/
-            res = "
+        if chkhsh == "b0a6b1c0bd5998ebd9df08611efde34a4ff03faed45ae09c43e6b31ebd4b94cf":
+            # ref: https://huggingface.co/skt/A.X-4.0
+            res = "a.x-4.0"
+        if chkhsh == "f6791d196f87ce6b56a7d234be618e0d58f8cda3549416635b2bebcd22cd95c4":
+            # ref: https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct
+            res = "midm-2.0"
+        if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
+            # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
+            res = "lfm2"
+        if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
+            # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
+            res = "exaone4"

         if res is None:
             logger.warning("\n")
@@ -1057,7 +1088,14 @@ class TextModel(ModelBase):
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
-        special_vocab.chat_template
+        if special_vocab.chat_template is None:
+            template_path = Path(__file__).parent / "models" / "templates" / "llama-cpp-rwkv-world.jinja"
+            if template_path.is_file():
+                with open(template_path, "r", encoding="utf-8") as f:
+                    template = f.read()
+            else:
+                template = "rwkv-world"
+            special_vocab.chat_template = template
         # hack: Add '\n\n' as the EOT token to make it chat normally
         special_vocab._set_special_token("eot", 261)
         # hack: Override these as they have already been set (incorrectly)
@@ -2743,6 +2781,210 @@ class Qwen2Model(TextModel):
         yield from super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("DreamModel")
+class DreamModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.DREAM
+
+    def get_vocab_base(self) -> tuple[list[str], list[int], str]:
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+
+        vocab_dict = tokenizer.get_vocab()
+        vocab_size = self.hparams.get("vocab_size", len(vocab_dict))
+        assert max(vocab_dict.values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()}
+        added_vocab = tokenizer.get_added_vocab()
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            elif reverse_vocab[i] in added_vocab:
+                tokens.append(reverse_vocab[i])
+                # Check if it's a special token - treat special tokens as CONTROL tokens
+                if hasattr(tokenizer, 'added_tokens_decoder') and i in tokenizer.added_tokens_decoder:
+                    if tokenizer.added_tokens_decoder[i].special:
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    # Fallback: treat all added vocab as control tokens for special tokens like <|im_start|>
+                    toktypes.append(gguf.TokenType.CONTROL)
+            else:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.NORMAL)
+
+        return tokens, toktypes, tokpre
+
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self._try_set_pooling_type()
+
+        # Dream models use non-causal attention for diffusion
+        self.gguf_writer.add_causal_attention(False)
+        # Handle RoPE scaling similar to Qwen2
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+
+        # Add Dream-specific parameters
+        mask_token_id = self.hparams.get("mask_token_id")
+        if mask_token_id is not None:
+            self.gguf_writer.add_mask_token_id(mask_token_id)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Dream model tensors should be mapped directly since it's the base model
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("Ernie4_5_ForCausalLM")
+class Ernie4_5Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.ERNIE4_5
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        num_heads = self.hparams["num_attention_heads"]
+        num_kv_heads = self.hparams["num_key_value_heads"]
+        if (head_dim := self.hparams.get("head_dim")) is None:
+            head_dim = self.hparams["hidden_size"] // num_heads
+
+        if "ernie." in name:
+            name = name.replace("ernie.", "model.")
+        # split the qkv weights
+        # qkv_proj shape: [(num_heads + 2 * num_kv_heads) * head_dim, hidden_size]
+        if "qkv_proj" in name:
+            name_q = name.replace("qkv_proj.weight", "q_proj.weight")
+            name_k = name.replace("qkv_proj.weight", "k_proj.weight")
+            name_v = name.replace("qkv_proj.weight", "v_proj.weight")
+            total_q_dim = num_heads * head_dim
+            total_k_dim = num_kv_heads * head_dim
+            total_v_dim = num_kv_heads * head_dim
+            q_proj_weight, k_proj_weight, v_proj_weight = data_torch.split([total_q_dim, total_k_dim, total_v_dim], dim=0)
+            return [
+                (self.map_tensor_name(name_q), q_proj_weight),
+                (self.map_tensor_name(name_k), k_proj_weight),
+                (self.map_tensor_name(name_v), v_proj_weight)
+            ]
+        # split the up_gate_proj into gate and up
+        # up_gate_proj shape: [2 * intermediate_size, hidden_size]
+        if "up_gate_proj" in name:
+            name_up = name.replace("up_gate_proj.weight", "up_proj.weight")
+            name_gate = name.replace("up_gate_proj.weight", "gate_proj.weight")
+            dim_half = data_torch.shape[0] // 2
+            gate_proj_weight, up_proj_weight = data_torch.split(dim_half, dim=0)
+            return [
+                (self.map_tensor_name(name_gate), gate_proj_weight),
+                (self.map_tensor_name(name_up), up_proj_weight)
+            ]
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@ModelBase.register("Ernie4_5_MoeForCausalLM")
+class Ernie4_5MoeModel(Ernie4_5Model):
+    model_arch = gguf.MODEL_ARCH.ERNIE4_5_MOE
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._experts = [{} for _ in range(self.block_count)]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"])
+        self.gguf_writer.add_expert_used_count(self.hparams["moe_k"])
+        self.gguf_writer.add_interleave_moe_layer_step(self.hparams["moe_layer_interval"])
+        self.gguf_writer.add_leading_dense_block_count(self.hparams["moe_layer_start_index"])
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+        if (shared_expert_count := self.hparams.get('moe_num_shared_experts')) is not None:
+            self.gguf_writer.add_expert_shared_count(shared_expert_count)
+            if shared_expert_count > 0 and (shared_expert_intermediate_size := self.hparams.get('intermediate_size')) is not None and (num_key_value_heads := self.hparams.get('num_key_value_heads')) is not None:
+                self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size // num_key_value_heads)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Modify correction bias name as in DeepseekV2
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+        # skip Multi-Token Prediction (MTP) layers (again, same as DeepseekV2)
+        match = re.match(r"model.mtp_block.(\d+)", name)
+        if match:
+            return []
+
+        # skip all other MTP tensors for now
+        match = re.match(r"model.mtp_emb_norm.(\d+)", name)
+        if match:
+            return []
+
+        match = re.match(r"model.mtp_hidden_norm.(\d+)", name)
+        if match:
+            return []
+
+        match = re.match(r"model.mtp_linear_proj.(\d+)", name)
+        if match:
+            return []
+
+        # process the experts separately
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["moe_num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["gate_proj", "up_proj", "down_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename_to_retrieve])
+                        del self._experts[bid][ename_to_retrieve]
+
+                    data_torch = torch.stack(datas, dim=0)
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+                    new_name = self.map_tensor_name(merged_name)
+                    tensors.append((new_name, data_torch))
+
+                return tensors
+            else:
+                return []
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @ModelBase.register(
     "Qwen2VLModel",
     "Qwen2VLForConditionalGeneration",
@@ -3430,6 +3672,175 @@ class PlamoModel(TextModel):
         return [(new_name, data_torch)]


+@ModelBase.register("Plamo2ForCausalLM", "PLaMo2ForCausalLM")
+class Plamo2Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.PLAMO2
+
+    def set_vocab(self):
+        # PLaMo 2 uses a custom tokenizer with a .jsonl file
+        # We need to handle this specially
+        tokenizer_jsonl_path = self.dir_model / "tokenizer.jsonl"
+        tokenizer_config_path = self.dir_model / "tokenizer_config.json"
+
+        if not tokenizer_jsonl_path.is_file():
+            raise FileNotFoundError(f"PLaMo 2 tokenizer file not found: {tokenizer_jsonl_path}")
+
+        # Load tokenizer config
+        with open(tokenizer_config_path, 'r', encoding='utf-8') as f:
+            tokenizer_config = json.load(f)
+
+        # Load tokens from JSONL file (actually a list format)
+        tokens = []
+        scores = []
+        toktypes = []
+
+        with open(tokenizer_jsonl_path, 'r', encoding='utf-8') as f:
+            for line_num, line in enumerate(f):
+                if line.strip():
+                    token_data = json.loads(line)
+                    # Format: [token, score, type, ?, ?, ?, ?]
+                    token = token_data[0].encode("utf-8")
+                    score = float(token_data[1])
+                    token_type_str = token_data[2] if len(token_data) > 2 else "NORMAL"
+
+                    tokens.append(token)
+                    scores.append(score)
+
+                    # Map token type strings to GGUF token types
+                    if token_type_str == "UNKNOWN":
+                        toktypes.append(gguf.TokenType.UNKNOWN)
+                    elif token_type_str == "CONTROL":
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    elif token_type_str == "BYTE":
+                        toktypes.append(gguf.TokenType.BYTE)
+                    else:
+                        # Check for PLaMo-2 special tokens
+                        token_str = token_data[0]
+                        if token_str.startswith("<|plamo:") and token_str.endswith("|>"):
+                            toktypes.append(gguf.TokenType.CONTROL)
+                        else:
+                            toktypes.append(gguf.TokenType.NORMAL)
+
+        vocab_size = self.hparams["vocab_size"]
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(gguf.TokenType.UNUSED)
+
+        # Use "plamo2" tokenizer type for PLaMo-2's custom Aho-Corasick tokenizer
+        self.gguf_writer.add_tokenizer_model("plamo2")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        # Add special tokens from config
+        if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] is not None:
+            token_id = tokens.index(tokenizer_config["bos_token"].encode("utf-8"))
+            self.gguf_writer.add_bos_token_id(token_id)
+        if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] is not None:
+            token_id = tokens.index(tokenizer_config["eos_token"].encode("utf-8"))
+            self.gguf_writer.add_eos_token_id(token_id)
+        if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] is not None:
+            token_id = tokens.index(tokenizer_config["pad_token"].encode("utf-8"))
+            self.gguf_writer.add_pad_token_id(token_id)
+        if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] is not None:
+            token_id = tokens.index(tokenizer_config["sep_token"].encode("utf-8"))
+            self.gguf_writer.add_sep_token_id(token_id)
+        if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] is not None:
+            token_id = tokens.index(tokenizer_config["unk_token"].encode("utf-8"))
+            self.gguf_writer.add_unk_token_id(token_id)
+
+        # Add <|plamo:op|> as EOT to ensure appropriate end of generation
+        self.gguf_writer.add_eot_token_id(4)
+
+        self.gguf_writer.add_add_space_prefix(False)
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+
+        # Which layers are Mamba layers
+        # PLaMo 2 uses mamba_step to indicate the pattern (e.g., 2 means every other layer)
+        # This logic matches modeling_plamo.py's is_mamba function
+        mamba_step = hparams.get("mamba_step", 2)
+        mamba_enabled = hparams.get("mamba_enabled", True)
+        mamba_layers = []
+
+        if mamba_enabled:
+            for i in range(block_count):
+                if block_count <= (mamba_step // 2):
+                    # use attention in last layer
+                    is_mamba = (i != block_count - 1)
+                else:
+                    is_mamba = (i % mamba_step) != (mamba_step // 2)
+                if is_mamba:
+                    mamba_layers.append(0)
+                else:
+                    mamba_layers.append(hparams.get("num_key_value_heads", 4))
+
+        if mamba_layers:
+            self.gguf_writer.add_head_count_kv(mamba_layers)
+
+        self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 2048))
+        self.gguf_writer.add_embedding_length(hparams.get("hidden_size", 4096))
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 32))
+        self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
+        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 10000))
+
+        # Mamba parameters
+        self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
+        self.gguf_writer.add_ssm_conv_kernel(hparams.get("mamba_d_conv", 4))
+        self.gguf_writer.add_ssm_time_step_rank(hparams.get("mamba_num_heads", 64))
+        intermediate_size = hparams.get("mamba_num_heads", 64) * hparams.get("hidden_size_per_head", 128)
+        self.gguf_writer.add_ssm_inner_size(intermediate_size)
+        self.gguf_writer.add_ssm_group_count(0)
+
+        # MLP feed forward parameters (for attention layers)
+        self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 13312))
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+
+        if name.endswith(".A_log"):
+            data_torch = -torch.exp(data_torch)
+        elif name.endswith(".dt_bias"):
+            name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
+        elif name.endswith(".dt_norm_weight"):
+            name = name.rpartition(".dt_norm_weight")[0] + ".dt_norm.weight"
+        elif name.endswith(".B_norm_weight"):
+            name = name.rpartition(".B_norm_weight")[0] + ".B_norm.weight"
+        elif name.endswith(".C_norm_weight"):
+            name = name.rpartition(".C_norm_weight")[0] + ".C_norm.weight"
+        elif name.endswith(".k_weight"):
+            name = name.rpartition(".k_weight")[0] + ".k.weight"
+        elif name.endswith(".q_weight"):
+            name = name.rpartition(".q_weight")[0] + ".q.weight"
+        elif name.endswith(".conv1d.weight"):
+            data_torch = torch.squeeze(data_torch) # remove (, 1, )
+            assert data_torch.ndim == 2
+        elif name.endswith(".pre_mixer_norm.weight"):
+            data_torch += 1.0
+        elif name.endswith(".post_mixer_norm.weight"):
+            data_torch += 1.0 / 5
+        elif name.endswith(".pre_mlp_norm.weight"):
+            data_torch += 1.0
+        elif name.endswith(".post_mlp_norm.weight"):
+            data_torch += 1.0 / (5**1.5)
+        elif name.endswith(".norm.weight"):
+            data_torch += 1.0
+
+        new_name = self.map_tensor_name(name)
+
+        return [(new_name, data_torch)]
+
+
 @ModelBase.register("CodeShellForCausalLM")
 class CodeShellModel(TextModel):
     model_arch = gguf.MODEL_ARCH.CODESHELL
@@ -4362,9 +4773,6 @@ class Gemma3NModel(Gemma3Model):
     ]

     def set_vocab(self):
-        with open(self.dir_model / "chat_template.jinja") as f:
-            # quick hack to make sure chat template is added
-            self.gguf_writer.add_chat_template(f.read())
         super().set_vocab()

     def set_gguf_parameters(self):
@@ -4735,6 +5143,14 @@ class ARwkv7Model(Rwkv7Model):
 class MambaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.MAMBA

+    def __init__(self, dir_model: Path, *args, **kwargs):
+        # Avoid using AutoConfig for hparams
+        hparams = kwargs.pop("hparams", None)
+        if hparams is None:
+            with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+                hparams = json.load(f)
+        super().__init__(dir_model, *args, hparams=hparams, **kwargs)
+
     def set_vocab(self):
         vocab_size = self.hparams["vocab_size"]
         # Round vocab size to next multiple of 8
@@ -4809,36 +5225,246 @@ class MambaModel(TextModel):
         return [(new_name, data_torch)]


-@ModelBase.register("
-class
-    model_arch = gguf.MODEL_ARCH.
+@ModelBase.register("Mamba2ForCausalLM")
+class Mamba2Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.MAMBA2

-    def __init__(self, *args, **kwargs):
-
+    def __init__(self, dir_model: Path, *args, **kwargs):
+        # Avoid using AutoConfig for hparams
+        # It wrongly assumes all Mamba2 models are Mamba-Codestral-7B-v0.1
+        hparams = kwargs.pop("hparams", None)
+        if hparams is None:
+            with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+                hparams = json.load(f)
+        super().__init__(dir_model, *args, hparams=hparams, **kwargs)
+        self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
+        self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model
+        self.n_group = self.find_hparam(["n_groups"], optional=True) or 1

-
-
-        #
-
+    def set_vocab(self):
+        vocab_size = self.hparams["vocab_size"]
+        # Round vocab size to next multiple of 16
+        pad_vocab = self.hparams.get("pad_vocab_size_multiple", 16)
+        # pad using ceiling division
+        # ref: https://stackoverflow.com/a/17511341/22827863
+        vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
+        self.hparams["vocab_size"] = vocab_size
+
+        if (self.dir_model / "tokenizer.model").is_file():
+            self._set_vocab_sentencepiece()
+        elif (self.dir_model / "tokenizer.model.v3").is_file():
+            # mamba-codestral
+            raise NotImplementedError(f"Please rename {self.dir_model / 'tokenizer.model.v3'} to {self.dir_model / 'tokenizer.model'}")
+        elif (self.dir_model / "tokenizer.json").is_file():
+            self._set_vocab_gpt2()
+        else:
+            # Use the GPT-NeoX tokenizer when no tokenizer files are present
+            self._set_vocab_builtin("gpt-neox", vocab_size)

     def set_gguf_parameters(self):
-
-        self.
-        self.
+        d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
+        d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128
+        head_dim = self.find_hparam(["mamba_d_head", "head_dim"], optional=True) or 64

+        rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5

-
-
-
+        # Fail early for models which don't have a block expansion factor of 2
+        # TODO: does this really matter?
+        # skip the assertion for FalconH1 Model
+        if self.model_arch != gguf.MODEL_ARCH.FALCON_H1:
+            assert self.d_inner == 2 * self.d_model
+            assert self.d_inner % head_dim == 0
+
+        self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
+        self.gguf_writer.add_embedding_length(self.d_model)
+        self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
+        self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_ssm_conv_kernel(d_conv)
+        self.gguf_writer.add_ssm_inner_size(self.d_inner)
+        self.gguf_writer.add_ssm_state_size(d_state)
+        self.gguf_writer.add_ssm_time_step_rank(self.d_inner // head_dim)
+        self.gguf_writer.add_ssm_group_count(self.n_group)
+        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+        self.gguf_writer.add_file_type(self.ftype)

-    def
-        super().set_gguf_parameters()
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:

-
-
-
+        if name.startswith("model.backbone") or name.startswith("model.lm_head"):
+            # map Mamba-Codestral-7B-v0.1 tensor names to the names used by Mamba-2
+            name = name.removeprefix("model.")

-
+        if name.endswith(".dt_bias"):
+            name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
+
+        new_name = self.map_tensor_name(name)
+
+        if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
+            data_torch = data_torch.squeeze()
+        elif any(self.match_model_tensor_name(new_name, t, bid, suffix="") for t in [
+            gguf.MODEL_TENSOR.SSM_A,
+            gguf.MODEL_TENSOR.SSM_D,
+        ]):
+            # unsqueeze A to use similar shape semantics as Mamba-1
+            # (D is also unsqueezed, but for more straightforward broadcast internally)
+            data_torch = data_torch.reshape((*data_torch.shape, 1))
+        elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid):
+            data_torch = data_torch.reshape((self.n_group, self.d_inner // self.n_group))
+
+        if name.endswith(".A_log"):
+            logger.debug("A_log --> A ==> " + new_name)
+            data_torch = -torch.exp(data_torch)
+
+        yield (new_name, data_torch)
+
+
+@ModelBase.register("JambaForCausalLM")
+class JambaModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.JAMBA
+
+    def get_vocab_base_pre(self, tokenizer) -> str:
+        del tokenizer # unused
+
+        return "gpt-2"
+
+    def set_vocab(self):
+        if (self.dir_model / "tokenizer.model").is_file():
+            # Using Jamba's tokenizer.json causes errors on model load
+            # (something about "byte not found in vocab"),
+            # but there's a working tokenizer.model
+            self._set_vocab_sentencepiece()
+        else:
+            # Some Jamba models only have a tokenizer.json, which works.
+            self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        d_model = self.find_hparam(["hidden_size", "mamba_d_model"])
+        d_conv = self.find_hparam(["mamba_d_conv"], optional=True) or 4
+        d_inner = self.hparams["mamba_expand"] * d_model
+        d_state = self.find_hparam(["mamba_d_state"], optional=True) or 16
+        # ceiling division
+        # ref: https://stackoverflow.com/a/17511341/22827863
+        # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
+        dt_rank = self.find_hparam(["mamba_dt_rank"], optional=True) or -(d_model // -16)
+        rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-6
+        n_kv_head = self.hparams["num_key_value_heads"]
+        attn_offset = self.hparams["attn_layer_offset"]
+        attn_period = self.hparams["attn_layer_period"]
+        n_kv_vec = [0 for _ in range(attn_offset)] + [
+            n_kv_head if (i - attn_offset) % attn_period == 0 else 0 for i in range(attn_offset, self.block_count)
+        ]
+
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_context_length(self.find_hparam(["max_position_embeddings", "n_ctx"]))
+        self.gguf_writer.add_embedding_length(d_model)
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(n_kv_vec)
+        self.gguf_writer.add_ssm_conv_kernel(d_conv)
+        self.gguf_writer.add_ssm_inner_size(d_inner)
+        self.gguf_writer.add_ssm_state_size(d_state)
+        self.gguf_writer.add_ssm_time_step_rank(dt_rank)
+        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+        self.gguf_writer.add_expert_count(self.hparams["num_experts"])
+        self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+
+        # Mini-Jamba
+        name = name.replace(".moe.", ".feed_forward.")
+        if bid is not None:
+            moe_offset = self.hparams["expert_layer_offset"]
+            moe_period = self.hparams["expert_layer_period"]
+
+            if not (bid >= moe_offset and (bid - moe_offset) % moe_period == 0):
+                name = name.replace(".experts.0.", ".")
+
+        # process the experts separately
+        if ".feed_forward.experts." in name:
+            n_experts = self.hparams["num_experts"]
+
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+
+                # merge the experts into a single 3d tensor
+                for wid in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{wid}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    # using the same merged name as qwen2moe
+                    merged_name = f"model.layers.{bid}.mlp.experts.{wid}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    yield new_name, data_torch
+                return
+
+        new_name = self.map_tensor_name(name)
+
+        if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
+            data_torch = data_torch.squeeze()
+
+        if name.endswith(".A_log"):
+            logger.debug("A_log --> A ==> " + new_name)
+            data_torch = -torch.exp(data_torch)
+
+        yield (new_name, data_torch)
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
|
|
5435
|
+
raise ValueError(f"Unprocessed experts: {experts}")
|
|
5436
|
+
|
|
5437
|
+
|
|
5438
|
+
@ModelBase.register("CohereForCausalLM")
|
|
5439
|
+
class CommandR2Model(TextModel):
|
|
5440
|
+
model_arch = gguf.MODEL_ARCH.COMMAND_R
|
|
5441
|
+
|
|
5442
|
+
def __init__(self, *args, **kwargs):
|
|
5443
|
+
super().__init__(*args, **kwargs)
|
|
5444
|
+
|
|
5445
|
+
# max_position_embeddings = 8192 in config.json but model was actually
|
|
5446
|
+
# trained on 128k context length
|
|
5447
|
+
# aya-23 models don't have model_max_length specified
|
|
5448
|
+
self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"])
|
|
5449
|
+
|
|
5450
|
+
def set_gguf_parameters(self):
|
|
5451
|
+
super().set_gguf_parameters()
|
|
5452
|
+
self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
|
|
5453
|
+
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
|
5454
|
+
|
|
5455
|
+
|
|
5456
|
+
@ModelBase.register("Cohere2ForCausalLM")
|
|
5457
|
+
class Cohere2Model(TextModel):
|
|
5458
|
+
model_arch = gguf.MODEL_ARCH.COHERE2
|
|
5459
|
+
|
|
5460
|
+
def set_gguf_parameters(self):
|
|
5461
|
+
super().set_gguf_parameters()
|
|
5462
|
+
|
|
5463
|
+
self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
|
|
5464
|
+
self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
|
|
5465
|
+
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
|
|
5466
|
+
|
|
5467
|
+
rotary_pct = self.hparams["rotary_pct"]
|
|
4842
5468
|
hidden_size = self.hparams["hidden_size"]
|
|
4843
5469
|
num_attention_heads = self.hparams["num_attention_heads"]
|
|
4844
5470
|
self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads)))
|
|
@@ -5277,7 +5903,58 @@ class DeepseekV2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2

     def set_vocab(self):
-
+        try:
+            self._set_vocab_gpt2()
+            return
+        except Exception:
+            pass
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        if tokpre == "kimi-k2":
+            # Build merges list using the approach similar to HunYuanMoE
+            merges = []
+            vocab = {}
+            mergeable_ranks = tokenizer.model._mergeable_ranks
+            for token, rank in mergeable_ranks.items():
+                vocab[QwenModel.token_bytes_to_string(token)] = rank
+                if len(token) == 1:
+                    continue
+                merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+                if len(merged) == 2:
+                    merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
+
+            # Build token list
+            vocab_size = self.hparams["vocab_size"]
+            special_tokens = tokenizer.special_tokens
+            reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
+            tokens: list[str] = []
+            toktypes: list[int] = []
+
+            for i in range(vocab_size):
+                if i not in reverse_vocab:
+                    tokens.append(f"[PAD{i}]")
+                    toktypes.append(gguf.TokenType.UNUSED)
+                else:
+                    token = reverse_vocab[i]
+                    tokens.append(token)
+                    if i in special_tokens.values():
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.NORMAL)
+
+            self.gguf_writer.add_tokenizer_model("gpt2")
+            self.gguf_writer.add_tokenizer_pre(tokpre)
+            self.gguf_writer.add_token_list(tokens)
+            self.gguf_writer.add_token_types(toktypes)
+            self.gguf_writer.add_token_merges(merges)
+
+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+            special_vocab.add_to_gguf(self.gguf_writer)
+        else:
+            raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!")

     def set_gguf_parameters(self):

@@ -5809,7 +6486,7 @@ class JaisModel(TextModel):
         self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)


-@ModelBase.register("Glm4ForCausalLM")
+@ModelBase.register("Glm4ForCausalLM", "Glm4vForConditionalGeneration")
 class Glm4Model(TextModel):
     model_arch = gguf.MODEL_ARCH.GLM4

@@ -5831,7 +6508,8 @@ class Glm4Model(TextModel):

     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        rope_dim
+        if (rope_dim := self.hparams.get("head_dim")) is None:
+            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
         rope_scaling = self.hparams.get("rope_scaling") or {}
         if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
@@ -5839,6 +6517,13 @@ class Glm4Model(TextModel):
             self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
             self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])

+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("model.visual."): # ignore visual part of Glm4v
+            return []
+        elif name.startswith("model.language_model."):
+            name = name.replace("language_model.", "") # for Glm4v
+        return super().modify_tensors(data_torch, name, bid)
+

 @ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
 class ChatGLMModel(TextModel):
@@ -6106,6 +6791,75 @@ class ExaoneModel(TextModel):
                 yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))


+@ModelBase.register("Exaone4ForCausalLM")
+class Exaone4Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.EXAONE4
+
+    def set_vocab(self):
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if hparams.get("sliding_window") is not None:
+            self.gguf_writer.add_sliding_window(hparams["sliding_window"])
+            if "layer_types" in hparams:
+                self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in hparams["layer_types"]])
+            elif "sliding_window_pattern" in hparams:
+                sliding_window_pattern = []
+                if isinstance(hparams["sliding_window_pattern"], str): # e.g. LLLG
+                    for i in range(hparams["num_hidden_layers"]):
+                        sliding_window_pattern.append(hparams["sliding_window_pattern"][i % len(hparams["sliding_window_pattern"])] == "L")
+                if isinstance(hparams["sliding_window_pattern"], int): # e.g. 4
+                    for i in range(hparams["num_hidden_layers"]):
+                        sliding_window_pattern.append((i + 1) % hparams["sliding_window_pattern"] != 0)
+                if len(sliding_window_pattern) == hparams["num_hidden_layers"]:
+                    self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
+
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10_000.0)
+                if (dim := self.hparams.get("head_dim")) is None:
+                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 16.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+
+
 @ModelBase.register("GraniteForCausalLM")
 class GraniteModel(LlamaModel):
     """Conversion for IBM's GraniteForCausalLM"""
@@ -6170,18 +6924,148 @@ class GraniteMoeModel(GraniteModel):
                 (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
             ]

+        has_experts = bool(self.hparams.get('num_local_experts'))
+
         if name.endswith("shared_mlp.input_linear.weight"):
             ffn_dim = self.hparams["shared_intermediate_size"]
             assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size"
             gate, up = data_torch.split(ffn_dim, dim=-2)
+            if has_experts:
+                return [
+                    (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
+                    (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
+                ]
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), gate),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), up),
+            ]
+
+        if not has_experts and name.endswith("shared_mlp.output_linear.weight"):
             return [
-                (self.format_tensor_name(gguf.MODEL_TENSOR.
-                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), data_torch)
             ]

         return super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("GraniteMoeHybridForCausalLM", "BambaForCausalLM")
+class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
+    """GraniteHybrid is a hybrid SSM + Attention model that uses Mamba2 SSM
+    layers and optionally uses MoE w/ a shared expert"""
+    model_arch = gguf.MODEL_ARCH.GRANITE_HYBRID
+    undo_permute = True
+
+    def __init__(self, *args, **kwargs):
+
+        # Hybrid mamba models use a prefix for the mamba-specific params.
+        # TODO: Extend this if the prefix(es) need to be configurable
+        self.hparam_prefixes = ["mamba"]
+
+        super().__init__(*args, **kwargs)
+
+        # Lists of which layers use ssm vs attention
+        self._attn_layers = self.get_attn_layers()
+        self._ssm_layers = [
+            i for i in range(self.block_count)
+            if i not in self._attn_layers
+        ]
+
+        # n_group and d_inner are used during reshape_tensors for mamba2
+        self.d_model = self.find_hparam(["hidden_size", "d_model"])
+        self.n_group = self.find_hparam(["n_groups"])
+        self.d_inner = self.find_hparam(["expand"]) * self.d_model
+
+    def get_attn_layers(self):
+        # Explicit list of layer type names
+        if layer_types := self.hparams.get("layer_types"):
+            return [
+                i for i, typ in enumerate(layer_types)
+                if typ == "attention"
+            ]
+
+        # Layer types indicated by index or period
+        attn_layers = self.hparams.get("attn_layer_indices", [])
+        if not attn_layers:
+            attn_period = self.hparams.get("attn_layer_period")
+            assert attn_period, "Didn't find attn_layer_indices or attn_layer_period"
+            attn_offset = self.hparams.get("attn_layer_offset")
+            assert attn_offset is not None, "No attention layer offset set with attn_layer_period"
+            attn_layers = [
+                i for i in range(self.block_count)
+                if i % attn_period == attn_offset
+            ]
+        return attn_layers
+
+    def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any:
+        prefixed = []
+        for pfx in self.hparam_prefixes:
+            prefixed.extend(
+                "_".join([pfx, k])
+                for k in keys
+            )
+        keys = list(keys) + prefixed
+        return Mamba2Model.find_hparam(self, keys, *args, **kwargs)
+
+    def modify_tensors(
+        self, data_torch: Tensor, name: str, bid: int | None
+    ) -> Iterable[tuple[str, Tensor]]:
+        if (
+            name.endswith("block_sparse_moe.input_linear.weight")
+            or "shared_mlp" in name
+        ):
+            return GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
+
+        # Determine whether this is a mamba layer or an attention layer
+        if bid in self._ssm_layers:
+            return Mamba2Model.modify_tensors(self, data_torch, name, bid)
+        elif bid in self._attn_layers:
+            return GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def set_gguf_parameters(self):
+        """This method merges params from both parents and some that are
+        specific to this model. The result is some duplication of how the params
+        get set. The following warnings are expected during conversion:
+
+        WARNING:Duplicated key name 'granitehybrid.attention.head_count_kv'
+        WARNING:Duplicated key name 'granitehybrid.context_length'
+        """
+        GraniteMoeModel.set_gguf_parameters(self)
+
+        ## Mamba mixer params ##
+        self.gguf_writer.add_ssm_conv_kernel(self.find_hparam(["conv_kernel", "d_conv"]))
+        self.gguf_writer.add_ssm_state_size(self.find_hparam(["state_size", "d_state"]))
+        self.gguf_writer.add_ssm_group_count(self.n_group)
+        self.gguf_writer.add_ssm_inner_size(self.d_inner)
+        # NOTE: The mamba_dt_rank is _not_ the right field for how this is used
+        # in llama.cpp
+        self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["n_heads"]))
+
+        ## Attention params ##
+        head_count_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
+        head_count_kv_vec = [
+            head_count_kv if i in self._attn_layers else 0 for i in range(self.block_count)
+        ]
+        if rope_dim := self.hparams.get("attn_rotary_emb"):
+            self.gguf_writer.add_rope_dimension_count(rope_dim)
+        self.gguf_writer.add_head_count_kv(head_count_kv_vec)
+
+        ## If Bamba, use rope, otherwise don't
+        use_rope = "BambaForCausalLM" in self.hparams["architectures"]
+        self.gguf_writer.add_rope_scaling_finetuned(use_rope)
+        if not use_rope:
+            self.gguf_writer.add_context_length(2**20)
+
+        ## Validation ##
+        d_head = self.find_hparam(["d_head"], optional=True) or 64
+        assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported"
+        assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}"
+
+    def set_vocab(self):
+        self.hparams["pad_vocab_size_multiple"] = 8
+        Mamba2Model.set_vocab(self)
+
+
 @ModelBase.register("BailingMoeForCausalLM")
 class BailingMoeModel(TextModel):
     model_arch = gguf.MODEL_ARCH.BAILINGMOE
@@ -6390,6 +7274,321 @@ class UltravoxWhisperEncoderModel(WhisperEncoderModel):
         super().set_gguf_parameters()
         self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])

+
+@ModelBase.register("FalconH1ForCausalLM")
+class FalconH1Model(Mamba2Model):
+    model_arch = gguf.MODEL_ARCH.FALCON_H1
+
+    def __init__(self, *args, **kwargs):
+        # Set the hparam prefixes for Falcon Mamba2
+        self.hparam_prefixes = ["mamba"]
+
+        # Initialize the base Mamba2Model
+        super().__init__(*args, **kwargs)
+
+        # Use Llama conversion for attention
+        self._transformer_model_class = LlamaModel
+
+        # n_group and d_inner are used during reshape_tensors for mamba2
+        self.n_group = self.find_hparam(["n_groups"])
+        self.d_inner = self.find_hparam(["mamba_d_ssm"])
+        self.d_head = self.find_hparam(["d_head"])
+
+        # Initialize any Falcon Mamba2 specific attributes
+        self.has_attention = True # Falcon Mamba2 has attention components
+
+        # Load Falcon-H1 multipliers from hyperparameters
+        self.attention_in_multiplier = self.find_hparam(["attention_in_multiplier"], optional=True)
+        self.attention_out_multiplier = self.find_hparam(["attention_out_multiplier"], optional=True)
+        self.ssm_in_multiplier = self.find_hparam(["ssm_in_multiplier"], optional=True)
+        self.ssm_out_multiplier = self.find_hparam(["ssm_out_multiplier"], optional=True)
+        self.mlp_multipliers = self.find_hparam(["mlp_multipliers"], optional=True)
+        self.ssm_multipliers = self.find_hparam(["ssm_multipliers"], optional=True)
+        self.intermediate_size = self.find_hparam(["intermediate_size"])
+        self.key_multiplier = self.find_hparam(["key_multiplier"], optional=True)
+
+    def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any:
+        prefixed = []
+        for pfx in self.hparam_prefixes:
+            prefixed.extend(
+                "_".join([pfx, k])
+                for k in keys
+            )
+        keys = list(keys) + prefixed
+        return super().find_hparam(keys, *args, **kwargs)
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        tensors = list(super().modify_tensors(data_torch, name, bid))
+        tensor = tensors[0][1]
+
+        if "down_proj" in name:
+            tensor = tensor * self.mlp_multipliers[1]
+        elif "gate_proj" in name:
+            tensor = tensor * self.mlp_multipliers[0]
+        elif "k_proj" in name:
+            tensor = tensor * self.key_multiplier * self.attention_in_multiplier
+        elif "q_proj" in name:
+            tensor = tensor * self.attention_in_multiplier
+        elif "v_proj" in name:
+            tensor = tensor * self.attention_in_multiplier
+        elif "o_proj" in name:
+            tensor = tensor * self.attention_out_multiplier
+        elif "out_proj" in name:
+            tensor = tensor * self.ssm_out_multiplier
+        elif "in_proj" in name:
+            tensor = tensor * self.ssm_in_multiplier
+            zxbcdt_multipliers = self.hparams["ssm_multipliers"]
+            intermediate_size = self.hparams["mamba_d_ssm"]
+            groups_time_state_size = self.hparams["mamba_n_groups"] * self.hparams["mamba_d_state"]
+            tensor[:intermediate_size, :] *= zxbcdt_multipliers[0]
+            tensor[intermediate_size:2 * intermediate_size, :] *= zxbcdt_multipliers[1]
+            tensor[2 * intermediate_size:2 * intermediate_size + groups_time_state_size, :] *= zxbcdt_multipliers[2]
+            tensor[2 * intermediate_size + groups_time_state_size:2 * intermediate_size + 2 * groups_time_state_size, :] *= zxbcdt_multipliers[3]
+            tensor[2 * intermediate_size + 2 * groups_time_state_size:, :] *= zxbcdt_multipliers[4]
+        elif "lm_head" in name:
+            tensor = tensor * self.hparams["lm_head_multiplier"]
+        elif "embed_tokens" in name:
+            tensor = tensor * self.hparams["embedding_multiplier"]
+        elif "mamba.norm" in name:
+            tensor = tensor.reshape(self.n_group, self.d_inner // self.n_group)
+
+        tensors = [(tensors[0][0], tensor)]
+        return tensors
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        ## General Params ##
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+        # Override some Mamba2 defaults
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_context_length(self.hparams.get("max_position_embeddings", 0))
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+
+        ## Attention params ##
+        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) # Override value 0 from Mamba2
+        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
+        self.gguf_writer.add_key_length(self.hparams["head_dim"])
+        self.gguf_writer.add_value_length(self.hparams["head_dim"])
+
+        ## Validation ##
+        assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported"
+        assert self.d_inner % self.d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {self.d_head}"
+
+        # Add any other Falcon Mamba2 specific configuration
+        self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
+
+
+@ModelBase.register("HunYuanMoEV1ForCausalLM")
+class HunYuanMoEModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # For handling tied embeddings
+        self._tok_embd = None
+
+    def set_vocab(self):
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+
+        # 1. Get the pre-tokenizer identifier hash
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        # 2. Reverse-engineer the merges list from mergeable_ranks
+        merges = []
+        vocab = {}
+        mergeable_ranks = tokenizer.mergeable_ranks
+        for token, rank in mergeable_ranks.items():
+            vocab[QwenModel.token_bytes_to_string(token)] = rank
+            if len(token) == 1:
+                continue
+            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+            if len(merged) == 2: # todo this is an assert in Qwen, why?
+                merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
+
+        # 3. Generate the tokens and toktypes lists
+        vocab_size = self.hparams["vocab_size"]
+        assert tokenizer.vocab_size == vocab_size
+        special_tokens = tokenizer.special_tokens
+        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
+        tokens: list[str] = []
+        toktypes: list[int] = []
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                token = reverse_vocab[i]
+                tokens.append(token)
+                if i in special_tokens.values():
+                    toktypes.append(gguf.TokenType.CONTROL)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+
+        # 4. Write all vocab-related fields to the GGUF writer
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_token_merges(merges)
+
+        # 5. Add special tokens and chat templates
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+        special_vocab.add_to_gguf(self.gguf_writer)
+        # FIX for BOS token: Overwrite incorrect id read from config.json
+        self.gguf_writer.add_bos_token_id(127959) # <|bos|>
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+
+        self.gguf_writer.add_expert_count(hparams["num_experts"])
+        self.gguf_writer.add_expert_shared_feed_forward_length(hparams["intermediate_size"])
+
+        moe_intermediate_size = hparams["moe_intermediate_size"]
+        assert all(n == moe_intermediate_size[0] for n in moe_intermediate_size)
+        self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size[0])
+
+        moe_topk = hparams["moe_topk"]
+        assert all(topk == moe_topk[0] for topk in moe_topk)
+        self.gguf_writer.add_expert_used_count(moe_topk[0])
+
+        moe_shared_expert = hparams["num_shared_expert"]
+        assert all(n == moe_shared_expert[0] for n in moe_shared_expert)
+        self.gguf_writer.add_expert_shared_count(moe_shared_expert[0])
+
+        # Rope
+        rope_scaling = hparams.get("rope_scaling", {})
+        if rope_scaling.get("type") == "dynamic":
+            # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
+            # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
+            alpha = rope_scaling.get("alpha", 1000)
+            base = hparams.get("rope_theta", 10000.0)
+            dim = (hparams["hidden_size"] // hparams["num_attention_heads"]) # 128
+            scaled_base = base * (alpha ** (dim / (dim - 2))) # 10000 * (1000 ** (128 / 126)) = 11158839.9251
+            self.gguf_writer.add_rope_freq_base(scaled_base)
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+            self.gguf_writer.add_rope_scaling_factor(1)
+            # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
+            self.gguf_writer.add_context_length(256 * 1024) # 256k context length
+
+            # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
+            assert alpha == 1000 and base == 10000.0 and dim == 128 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
+                "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name == "model.embed_tokens.weight":
+            self._tok_embd = data_torch.clone()
+
+        if name == "lm_head.weight":
+            if self.hparams.get("tie_word_embeddings", False):
+                logger.info("Skipping tied output layer 'lm_head.weight'")
+                return []
+
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                # merge the experts into a single 3d tensor
+                tensors: list[tuple[str, Tensor]] = []
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+                    new_name = self.map_tensor_name(merged_name)
+                    tensors.append((new_name, data_torch))
+
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+        if self._experts is not None:
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@ModelBase.register("SmolLM3ForCausalLM")
+class SmolLM3Model(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.SMOLLM3
+
+    def set_vocab(self):
+        super().set_vocab()
+        # remove unsupported array slicing in chat template
+        # ref: https://huggingface.co/ggml-org/SmolLM3-3B-GGUF/discussions/1
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        if tokenizer.chat_template is not None:
+            chat_template = tokenizer.chat_template.replace("[:]", "")
+            self.gguf_writer.add_chat_template(chat_template)
+
+
+@ModelBase.register("Lfm2ForCausalLM")
+@ModelBase.register("LFM2ForCausalLM")
+class LFM2Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.LFM2
+
+    def _add_feed_forward_length(self):
+        ff_dim = self.hparams["block_ff_dim"]
+
+        auto_adjust_ff_dim = self.hparams["block_auto_adjust_ff_dim"]
+        ff_dim = self.hparams["block_ff_dim"]
+        ffn_dim_multiplier = self.hparams["block_ffn_dim_multiplier"]
+        multiple_of = self.hparams["block_multiple_of"]
+
+        if auto_adjust_ff_dim:
+            ff_dim = int(2 * ff_dim / 3)
+            # custom dim factor multiplier
+            if ffn_dim_multiplier is not None:
+                ff_dim = int(ffn_dim_multiplier * ff_dim)
+            ff_dim = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of)
+
+        self.gguf_writer.add_feed_forward_length(ff_dim)
+
+    def set_gguf_parameters(self):
+        # set num_key_value_heads only for attention layers
+        self.hparams["num_key_value_heads"] = [
+            self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0
+            for layer_type in self.hparams["layer_types"]
+        ]
+
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+        self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["norm_eps"])
+        self._add_feed_forward_length()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # conv op requires 2d tensor
+        if 'conv.conv' in name:
+            data_torch = data_torch.squeeze(1)
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 ###### CONVERSION LOGIC ######

@@ -6569,12 +7768,20 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str:
     # maybe we should fallback to text model's arch in that case, since not many models have both
     text_config = hparams.get("text_config", {})
     vision_config = hparams.get("vision_config", {})
-    arch =
+    arch = None
+    if (arches := hparams.get("architectures")) is not None and len(arches) > 0:
+        arch = arches[0]
+    elif "ssm_cfg" in hparams:
+        # For non-hf Mamba and Mamba2 models
+        arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM"
+
     # if "architectures" is found in the sub-config, use that instead
     if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
         arch = text_config["architectures"][0]
     elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None:
         arch = vision_config["architectures"][0]
+    if arch is None:
+        raise ValueError("Failed to detect model architecture")
     return arch
