@novastera-oss/llamarn 0.2.9 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +0 -1
- package/cpp/llama.cpp/README.md +4 -5
- package/cpp/llama.cpp/build-xcframework.sh +1 -1
- package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
- package/cpp/llama.cpp/common/arg.cpp +17 -0
- package/cpp/llama.cpp/common/chat.cpp +37 -20
- package/cpp/llama.cpp/common/chat.h +2 -0
- package/cpp/llama.cpp/common/common.h +4 -0
- package/cpp/llama.cpp/convert_hf_to_gguf.py +745 -6
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +9 -0
- package/cpp/llama.cpp/ggml/CMakeLists.txt +7 -2
- package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/cpp/llama.cpp/ggml/include/ggml.h +173 -10
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +36 -18
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +68 -5
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +16 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +28 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1203 -163
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +142 -9
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +8 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +14 -12
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +5 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +15 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +8 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +185 -79
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +2 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
- package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +11 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -5
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +64 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +35 -9
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +167 -39
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +254 -57
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +505 -40
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +693 -1034
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +60 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +15 -18
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +711 -292
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +58 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +23 -3
- package/cpp/llama.cpp/ggml/src/ggml.c +382 -61
- package/cpp/llama.cpp/ggml/src/gguf.cpp +8 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +209 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +8 -2
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +73 -21
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +12 -3
- package/cpp/llama.cpp/include/llama.h +0 -40
- package/cpp/llama.cpp/src/llama-arch.cpp +210 -3
- package/cpp/llama.cpp/src/llama-arch.h +18 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +27 -1
- package/cpp/llama.cpp/src/llama-batch.h +8 -1
- package/cpp/llama.cpp/src/llama-chat.cpp +15 -0
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-graph.cpp +119 -184
- package/cpp/llama.cpp/src/llama-graph.h +47 -60
- package/cpp/llama.cpp/src/llama-hparams.cpp +7 -1
- package/cpp/llama.cpp/src/llama-hparams.h +3 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +28 -18
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +4 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +214 -65
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +62 -24
- package/cpp/llama.cpp/src/llama-kv-cells.h +62 -10
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +9 -4
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +3 -1
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +20 -10
- package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
- package/cpp/llama.cpp/src/llama-memory.h +3 -0
- package/cpp/llama.cpp/src/llama-model.cpp +2530 -685
- package/cpp/llama.cpp/src/llama-model.h +18 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +1 -0
- package/cpp/llama.cpp/src/llama-vocab.cpp +13 -2
- package/cpp/llama.cpp/src/llama-vocab.h +41 -0
- package/ios/include/chat.h +2 -0
- package/ios/include/common.h +4 -0
- package/ios/include/llama.h +0 -40
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5055 -4886
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3764
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3891 -3766
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4890
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3764
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5091 -4922
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5066 -4897
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3919 -3794
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
- package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
package/cpp/llama.cpp/convert_hf_to_gguf.py

@@ -300,6 +300,7 @@ class ModelBase:
                    gguf.MODEL_TENSOR.POS_EMBD,
                    gguf.MODEL_TENSOR.TOKEN_TYPES,
                    gguf.MODEL_TENSOR.SSM_CONV1D,
+                   gguf.MODEL_TENSOR.SHORTCONV_CONV,
                    gguf.MODEL_TENSOR.TIME_MIX_FIRST,
                    gguf.MODEL_TENSOR.TIME_MIX_W1,
                    gguf.MODEL_TENSOR.TIME_MIX_W2,
@@ -815,6 +816,30 @@ class TextModel(ModelBase):
         if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
             # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
             res = "minerva-7b"
+        if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664":
+            # ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct
+            res = "hunyuan"
+        if chkhsh == "b0a6b1c0bd5998ebd9df08611efde34a4ff03faed45ae09c43e6b31ebd4b94cf":
+            # ref: https://huggingface.co/skt/A.X-4.0
+            res = "a.x-4.0"
+        if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6":
+            # ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base
+            res = "falcon-h1"
+        if chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86":
+            # ref: https://huggingface.co/tiiuae/Falcon-H1-1B-Base
+            res = "falcon-h1"
+        if chkhsh == "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896":
+            # ref: https://huggingface.co/tiiuae/Falcon-H1-7B-Base
+            res = "falcon-h1"
+        if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b":
+            # ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base
+            res = "falcon-h1"
+        if chkhsh == "f6791d196f87ce6b56a7d234be618e0d58f8cda3549416635b2bebcd22cd95c4":
+            # ref: https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct
+            res = "midm-2.0"
+        if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
+            # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
+            res = "lfm2"
 
         if res is None:
             logger.warning("\n")
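
A note on the chkhsh values registered above: in convert_hf_to_gguf.py the pre-tokenizer is identified by hashing the token ids the tokenizer produces for a fixed probe string, and these entries are generated by convert_hf_to_gguf_update.py (also touched in this release). A minimal sketch of that fingerprinting idea, assuming a SHA-256 over the stringified ids — the probe text and helper name here are illustrative, not copied from the source:

import hashlib

def tokenizer_fingerprint(tokenizer, probe_text: str) -> str:
    # Any change to the pre-tokenizer regex or BPE merges changes the ids,
    # and therefore the hash that the chkhsh comparisons above rely on.
    token_ids = tokenizer.encode(probe_text)
    return hashlib.sha256(str(token_ids).encode()).hexdigest()
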
@@ -2743,6 +2768,52 @@ class Qwen2Model(TextModel):
         yield from super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register("Ernie4_5_ForCausalLM")
+class Ernie4_5Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.ERNIE4_5
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        num_heads = self.hparams["num_attention_heads"]
+        num_kv_heads = self.hparams["num_key_value_heads"]
+        head_dim = self.hparams["head_dim"]
+
+        if "ernie." in name:
+            name = name.replace("ernie.", "model.")
+        # split the qkv weights
+        # qkv_proj shape: [(num_heads + 2 * num_kv_heads) * head_dim, hidden_size]
+        if "qkv_proj" in name:
+            name_q = name.replace("qkv_proj.weight", "q_proj.weight")
+            name_k = name.replace("qkv_proj.weight", "k_proj.weight")
+            name_v = name.replace("qkv_proj.weight", "v_proj.weight")
+            total_q_dim = num_heads * head_dim
+            total_k_dim = num_kv_heads * head_dim
+            total_v_dim = num_kv_heads * head_dim
+            q_proj_weight, k_proj_weight, v_proj_weight = data_torch.split([total_q_dim, total_k_dim, total_v_dim], dim=0)
+            return [
+                (self.map_tensor_name(name_q), q_proj_weight),
+                (self.map_tensor_name(name_k), k_proj_weight),
+                (self.map_tensor_name(name_v), v_proj_weight)
+            ]
+        # split the up_gate_proj into gate and up
+        # up_gate_proj shape: [2 * intermediate_size, hidden_size]
+        if "up_gate_proj" in name:
+            name_up = name.replace("up_gate_proj.weight", "up_proj.weight")
+            name_gate = name.replace("up_gate_proj.weight", "gate_proj.weight")
+            dim_half = data_torch.shape[0] // 2
+            gate_proj_weight, up_proj_weight = data_torch.split(dim_half, dim=0)
+            return [
+                (self.map_tensor_name(name_gate), gate_proj_weight),
+                (self.map_tensor_name(name_up), up_proj_weight)
+            ]
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 @ModelBase.register(
     "Qwen2VLModel",
     "Qwen2VLForConditionalGeneration",
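
The qkv_proj handling in the Ernie4_5Model hunk above assumes the fused weight's rows are laid out [Q | K | V]. A self-contained sketch of that split with made-up dimensions (not taken from any real ERNIE checkpoint):

import torch

num_heads, num_kv_heads, head_dim, hidden = 8, 2, 16, 64
# fused projection: (num_heads + 2 * num_kv_heads) * head_dim rows
qkv = torch.randn((num_heads + 2 * num_kv_heads) * head_dim, hidden)

q, k, v = qkv.split([num_heads * head_dim,
                     num_kv_heads * head_dim,
                     num_kv_heads * head_dim], dim=0)
assert q.shape == (num_heads * head_dim, hidden)
assert k.shape == v.shape == (num_kv_heads * head_dim, hidden)
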
@@ -4362,9 +4433,6 @@ class Gemma3NModel(Gemma3Model):
     ]
 
     def set_vocab(self):
-        with open(self.dir_model / "chat_template.jinja") as f:
-            # quick hack to make sure chat template is added
-            self.gguf_writer.add_chat_template(f.read())
         super().set_vocab()
 
     def set_gguf_parameters(self):
@@ -4735,6 +4803,14 @@ class ARwkv7Model(Rwkv7Model):
 class MambaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.MAMBA
 
+    def __init__(self, dir_model: Path, *args, **kwargs):
+        # Avoid using AutoConfig for hparams
+        hparams = kwargs.pop("hparams", None)
+        if hparams is None:
+            with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+                hparams = json.load(f)
+        super().__init__(dir_model, *args, hparams=hparams, **kwargs)
+
     def set_vocab(self):
         vocab_size = self.hparams["vocab_size"]
         # Round vocab size to next multiple of 8
@@ -4809,6 +4885,216 @@ class MambaModel(TextModel):
         return [(new_name, data_torch)]
 
 
+@ModelBase.register("Mamba2ForCausalLM")
+class Mamba2Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.MAMBA2
+
+    def __init__(self, dir_model: Path, *args, **kwargs):
+        # Avoid using AutoConfig for hparams
+        # It wrongly assumes all Mamba2 models are Mamba-Codestral-7B-v0.1
+        hparams = kwargs.pop("hparams", None)
+        if hparams is None:
+            with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+                hparams = json.load(f)
+        super().__init__(dir_model, *args, hparams=hparams, **kwargs)
+        self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
+        self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model
+        self.n_group = self.find_hparam(["n_groups"], optional=True) or 1
+
+    def set_vocab(self):
+        vocab_size = self.hparams["vocab_size"]
+        # Round vocab size to next multiple of 16
+        pad_vocab = self.hparams.get("pad_vocab_size_multiple", 16)
+        # pad using ceiling division
+        # ref: https://stackoverflow.com/a/17511341/22827863
+        vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
+        self.hparams["vocab_size"] = vocab_size
+
+        if (self.dir_model / "tokenizer.model").is_file():
+            self._set_vocab_sentencepiece()
+        elif (self.dir_model / "tokenizer.model.v3").is_file():
+            # mamba-codestral
+            raise NotImplementedError(f"Please rename {self.dir_model / 'tokenizer.model.v3'} to {self.dir_model / 'tokenizer.model'}")
+        elif (self.dir_model / "tokenizer.json").is_file():
+            self._set_vocab_gpt2()
+        else:
+            # Use the GPT-NeoX tokenizer when no tokenizer files are present
+            self._set_vocab_builtin("gpt-neox", vocab_size)
+
+    def set_gguf_parameters(self):
+        d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
+        d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128
+        head_dim = self.find_hparam(["mamba_d_head", "head_dim"], optional=True) or 64
+
+        rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
+
+        # Fail early for models which don't have a block expansion factor of 2
+        # TODO: does this really matter?
+        # skip the assertion for FalconH1 Model
+        if self.model_arch != gguf.MODEL_ARCH.FALCON_H1:
+            assert self.d_inner == 2 * self.d_model
+            assert self.d_inner % head_dim == 0
+
+        self.gguf_writer.add_context_length(2**20)  # arbitrary value; for those who use the default
+        self.gguf_writer.add_embedding_length(self.d_model)
+        self.gguf_writer.add_feed_forward_length(0)  # unused, but seemingly required when loading
+        self.gguf_writer.add_head_count(0)  # unused, but seemingly required when loading
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_ssm_conv_kernel(d_conv)
+        self.gguf_writer.add_ssm_inner_size(self.d_inner)
+        self.gguf_writer.add_ssm_state_size(d_state)
+        self.gguf_writer.add_ssm_time_step_rank(self.d_inner // head_dim)
+        self.gguf_writer.add_ssm_group_count(self.n_group)
+        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+
+        if name.startswith("model.backbone") or name.startswith("model.lm_head"):
+            # map Mamba-Codestral-7B-v0.1 tensor names to the names used by Mamba-2
+            name = name.removeprefix("model.")
+
+        if name.endswith(".dt_bias"):
+            name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
+
+        new_name = self.map_tensor_name(name)
+
+        if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
+            data_torch = data_torch.squeeze()
+        elif any(self.match_model_tensor_name(new_name, t, bid, suffix="") for t in [
+            gguf.MODEL_TENSOR.SSM_A,
+            gguf.MODEL_TENSOR.SSM_D,
+        ]):
+            # unsqueeze A to use similar shape semantics as Mamba-1
+            # (D is also unsqueezed, but for more straightforward broadcast internally)
+            data_torch = data_torch.reshape((*data_torch.shape, 1))
+        elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid):
+            data_torch = data_torch.reshape((self.n_group, self.d_inner // self.n_group))
+
+        if name.endswith(".A_log"):
+            logger.debug("A_log --> A ==> " + new_name)
+            data_torch = -torch.exp(data_torch)
+
+        yield (new_name, data_torch)
+
+
+@ModelBase.register("JambaForCausalLM")
+class JambaModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.JAMBA
+
+    def get_vocab_base_pre(self, tokenizer) -> str:
+        del tokenizer  # unused
+
+        return "gpt-2"
+
+    def set_vocab(self):
+        if (self.dir_model / "tokenizer.model").is_file():
+            # Using Jamba's tokenizer.json causes errors on model load
+            # (something about "byte not found in vocab"),
+            # but there's a working tokenizer.model
+            self._set_vocab_sentencepiece()
+        else:
+            # Some Jamba models only have a tokenizer.json, which works.
+            self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        d_model = self.find_hparam(["hidden_size", "mamba_d_model"])
+        d_conv = self.find_hparam(["mamba_d_conv"], optional=True) or 4
+        d_inner = self.hparams["mamba_expand"] * d_model
+        d_state = self.find_hparam(["mamba_d_state"], optional=True) or 16
+        # ceiling division
+        # ref: https://stackoverflow.com/a/17511341/22827863
+        # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
+        dt_rank = self.find_hparam(["mamba_dt_rank"], optional=True) or -(d_model // -16)
+        rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-6
+        n_kv_head = self.hparams["num_key_value_heads"]
+        attn_offset = self.hparams["attn_layer_offset"]
+        attn_period = self.hparams["attn_layer_period"]
+        n_kv_vec = [0 for _ in range(attn_offset)] + [
+            n_kv_head if (i - attn_offset) % attn_period == 0 else 0 for i in range(attn_offset, self.block_count)
+        ]
+
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_context_length(self.find_hparam(["max_position_embeddings", "n_ctx"]))
+        self.gguf_writer.add_embedding_length(d_model)
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(n_kv_vec)
+        self.gguf_writer.add_ssm_conv_kernel(d_conv)
+        self.gguf_writer.add_ssm_inner_size(d_inner)
+        self.gguf_writer.add_ssm_state_size(d_state)
+        self.gguf_writer.add_ssm_time_step_rank(dt_rank)
+        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+        self.gguf_writer.add_expert_count(self.hparams["num_experts"])
+        self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+
+        # Mini-Jamba
+        name = name.replace(".moe.", ".feed_forward.")
+        if bid is not None:
+            moe_offset = self.hparams["expert_layer_offset"]
+            moe_period = self.hparams["expert_layer_period"]
+
+            if not (bid >= moe_offset and (bid - moe_offset) % moe_period == 0):
+                name = name.replace(".experts.0.", ".")
+
+        # process the experts separately
+        if ".feed_forward.experts." in name:
+            n_experts = self.hparams["num_experts"]
+
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+
+                # merge the experts into a single 3d tensor
+                for wid in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{wid}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    # using the same merged name as qwen2moe
+                    merged_name = f"model.layers.{bid}.mlp.experts.{wid}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    yield new_name, data_torch
+            return
+
+        new_name = self.map_tensor_name(name)
+
+        if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
+            data_torch = data_torch.squeeze()
+
+        if name.endswith(".A_log"):
+            logger.debug("A_log --> A ==> " + new_name)
+            data_torch = -torch.exp(data_torch)
+
+        yield (new_name, data_torch)
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @ModelBase.register("CohereForCausalLM")
 class CommandR2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.COMMAND_R
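
Mamba2Model.set_vocab in the hunk above rounds the vocab size up with -(vocab_size // -pad_vocab) * pad_vocab: negating the numerator turns Python's floor division into ceiling division without going through floats. A quick standalone check (the sample sizes are illustrative):

def pad_vocab_size(vocab_size: int, multiple: int = 16) -> int:
    # ceiling division, then scale back up to the nearest multiple
    return -(vocab_size // -multiple) * multiple

assert pad_vocab_size(50277) == 50288  # rounded up to the next multiple of 16
assert pad_vocab_size(50288) == 50288  # already aligned, unchanged
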
@@ -6170,18 +6456,148 @@ class GraniteMoeModel(GraniteModel):
                 (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
             ]
 
+        has_experts = bool(self.hparams.get('num_local_experts'))
+
         if name.endswith("shared_mlp.input_linear.weight"):
             ffn_dim = self.hparams["shared_intermediate_size"]
             assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size"
             gate, up = data_torch.split(ffn_dim, dim=-2)
+            if has_experts:
+                return [
+                    (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
+                    (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
+                ]
             return [
-                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
-                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), gate),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), up),
+            ]
+
+        if not has_experts and name.endswith("shared_mlp.output_linear.weight"):
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), data_torch)
             ]
 
         return super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register("GraniteMoeHybridForCausalLM", "BambaForCausalLM")
+class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
+    """GraniteHybrid is a hybrid SSM + Attention model that uses Mamba2 SSM
+    layers and optionally uses MoE w/ a shared expert"""
+    model_arch = gguf.MODEL_ARCH.GRANITE_HYBRID
+    undo_permute = True
+
+    def __init__(self, *args, **kwargs):
+
+        # Hybrid mamba models use a prefix for the mamba-specific params.
+        # TODO: Extend this if the prefix(es) need to be configurable
+        self.hparam_prefixes = ["mamba"]
+
+        super().__init__(*args, **kwargs)
+
+        # Lists of which layers use ssm vs attention
+        self._attn_layers = self.get_attn_layers()
+        self._ssm_layers = [
+            i for i in range(self.block_count)
+            if i not in self._attn_layers
+        ]
+
+        # n_group and d_inner are used during reshape_tensors for mamba2
+        self.d_model = self.find_hparam(["hidden_size", "d_model"])
+        self.n_group = self.find_hparam(["n_groups"])
+        self.d_inner = self.find_hparam(["expand"]) * self.d_model
+
+    def get_attn_layers(self):
+        # Explicit list of layer type names
+        if layer_types := self.hparams.get("layer_types"):
+            return [
+                i for i, typ in enumerate(layer_types)
+                if typ == "attention"
+            ]
+
+        # Layer types indicated by index or period
+        attn_layers = self.hparams.get("attn_layer_indices", [])
+        if not attn_layers:
+            attn_period = self.hparams.get("attn_layer_period")
+            assert attn_period, "Didn't find attn_layer_indices or attn_layer_period"
+            attn_offset = self.hparams.get("attn_layer_offset")
+            assert attn_offset is not None, "No attention layer offset set with attn_layer_period"
+            attn_layers = [
+                i for i in range(self.block_count)
+                if i % attn_period == attn_offset
+            ]
+        return attn_layers
+
+    def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any:
+        prefixed = []
+        for pfx in self.hparam_prefixes:
+            prefixed.extend(
+                "_".join([pfx, k])
+                for k in keys
+            )
+        keys = list(keys) + prefixed
+        return Mamba2Model.find_hparam(self, keys, *args, **kwargs)
+
+    def modify_tensors(
+        self, data_torch: Tensor, name: str, bid: int | None
+    ) -> Iterable[tuple[str, Tensor]]:
+        if (
+            name.endswith("block_sparse_moe.input_linear.weight")
+            or "shared_mlp" in name
+        ):
+            return GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
+
+        # Determine whether this is a mamba layer or an attention layer
+        if bid in self._ssm_layers:
+            return Mamba2Model.modify_tensors(self, data_torch, name, bid)
+        elif bid in self._attn_layers:
+            return GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def set_gguf_parameters(self):
+        """This method merges params from both parents and some that are
+        specific to this model. The result is some duplication of how the params
+        get set. The following warnings are expected during conversion:
+
+        WARNING:Duplicated key name 'granitehybrid.attention.head_count_kv'
+        WARNING:Duplicated key name 'granitehybrid.context_length'
+        """
+        GraniteMoeModel.set_gguf_parameters(self)
+
+        ## Mamba mixer params ##
+        self.gguf_writer.add_ssm_conv_kernel(self.find_hparam(["conv_kernel", "d_conv"]))
+        self.gguf_writer.add_ssm_state_size(self.find_hparam(["state_size", "d_state"]))
+        self.gguf_writer.add_ssm_group_count(self.n_group)
+        self.gguf_writer.add_ssm_inner_size(self.d_inner)
+        # NOTE: The mamba_dt_rank is _not_ the right field for how this is used
+        # in llama.cpp
+        self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["n_heads"]))
+
+        ## Attention params ##
+        head_count_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
+        head_count_kv_vec = [
+            head_count_kv if i in self._attn_layers else 0 for i in range(self.block_count)
+        ]
+        if rope_dim := self.hparams.get("attn_rotary_emb"):
+            self.gguf_writer.add_rope_dimension_count(rope_dim)
+        self.gguf_writer.add_head_count_kv(head_count_kv_vec)
+
+        ## If Bamba, use rope, otherwise don't
+        use_rope = "BambaForCausalLM" in self.hparams["architectures"]
+        self.gguf_writer.add_rope_scaling_finetuned(use_rope)
+        if not use_rope:
+            self.gguf_writer.add_context_length(2**20)
+
+        ## Validation ##
+        d_head = self.find_hparam(["d_head"], optional=True) or 64
+        assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported"
+        assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}"
+
+    def set_vocab(self):
+        self.hparams["pad_vocab_size_multiple"] = 8
+        Mamba2Model.set_vocab(self)
+
+
 @ModelBase.register("BailingMoeForCausalLM")
 class BailingMoeModel(TextModel):
     model_arch = gguf.MODEL_ARCH.BAILINGMOE
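
GraniteHybridModel.get_attn_layers above resolves the attention/SSM layer split either from an explicit layer_types list or from a period/offset pair. A small sketch of the periodic case with illustrative numbers (not from any shipped config):

block_count, attn_period, attn_offset = 8, 4, 1
attn_layers = [i for i in range(block_count) if i % attn_period == attn_offset]
ssm_layers = [i for i in range(block_count) if i not in attn_layers]
assert attn_layers == [1, 5]             # every 4th layer, starting at layer 1
assert ssm_layers == [0, 2, 3, 4, 6, 7]  # the rest run the Mamba2 mixer
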
@@ -6390,6 +6806,321 @@ class UltravoxWhisperEncoderModel(WhisperEncoderModel):
         super().set_gguf_parameters()
         self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
 
+
+@ModelBase.register("FalconH1ForCausalLM")
+class FalconH1Model(Mamba2Model):
+    model_arch = gguf.MODEL_ARCH.FALCON_H1
+
+    def __init__(self, *args, **kwargs):
+        # Set the hparam prefixes for Falcon Mamba2
+        self.hparam_prefixes = ["mamba"]
+
+        # Initialize the base Mamba2Model
+        super().__init__(*args, **kwargs)
+
+        # Use Llama conversion for attention
+        self._transformer_model_class = LlamaModel
+
+        # n_group and d_inner are used during reshape_tensors for mamba2
+        self.n_group = self.find_hparam(["n_groups"])
+        self.d_inner = self.find_hparam(["mamba_d_ssm"])
+        self.d_head = self.find_hparam(["d_head"])
+
+        # Initialize any Falcon Mamba2 specific attributes
+        self.has_attention = True  # Falcon Mamba2 has attention components
+
+        # Load Falcon-H1 multipliers from hyperparameters
+        self.attention_in_multiplier = self.find_hparam(["attention_in_multiplier"], optional=True)
+        self.attention_out_multiplier = self.find_hparam(["attention_out_multiplier"], optional=True)
+        self.ssm_in_multiplier = self.find_hparam(["ssm_in_multiplier"], optional=True)
+        self.ssm_out_multiplier = self.find_hparam(["ssm_out_multiplier"], optional=True)
+        self.mlp_multipliers = self.find_hparam(["mlp_multipliers"], optional=True)
+        self.ssm_multipliers = self.find_hparam(["ssm_multipliers"], optional=True)
+        self.intermediate_size = self.find_hparam(["intermediate_size"])
+        self.key_multiplier = self.find_hparam(["key_multiplier"], optional=True)
+
+    def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any:
+        prefixed = []
+        for pfx in self.hparam_prefixes:
+            prefixed.extend(
+                "_".join([pfx, k])
+                for k in keys
+            )
+        keys = list(keys) + prefixed
+        return super().find_hparam(keys, *args, **kwargs)
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        tensors = list(super().modify_tensors(data_torch, name, bid))
+        tensor = tensors[0][1]
+
+        if "down_proj" in name:
+            tensor = tensor * self.mlp_multipliers[1]
+        elif "gate_proj" in name:
+            tensor = tensor * self.mlp_multipliers[0]
+        elif "k_proj" in name:
+            tensor = tensor * self.key_multiplier * self.attention_in_multiplier
+        elif "q_proj" in name:
+            tensor = tensor * self.attention_in_multiplier
+        elif "v_proj" in name:
+            tensor = tensor * self.attention_in_multiplier
+        elif "o_proj" in name:
+            tensor = tensor * self.attention_out_multiplier
+        elif "out_proj" in name:
+            tensor = tensor * self.ssm_out_multiplier
+        elif "in_proj" in name:
+            tensor = tensor * self.ssm_in_multiplier
+            zxbcdt_multipliers = self.hparams["ssm_multipliers"]
+            intermediate_size = self.hparams["mamba_d_ssm"]
+            groups_time_state_size = self.hparams["mamba_n_groups"] * self.hparams["mamba_d_state"]
+            tensor[:intermediate_size, :] *= zxbcdt_multipliers[0]
+            tensor[intermediate_size:2 * intermediate_size, :] *= zxbcdt_multipliers[1]
+            tensor[2 * intermediate_size:2 * intermediate_size + groups_time_state_size, :] *= zxbcdt_multipliers[2]
+            tensor[2 * intermediate_size + groups_time_state_size:2 * intermediate_size + 2 * groups_time_state_size, :] *= zxbcdt_multipliers[3]
+            tensor[2 * intermediate_size + 2 * groups_time_state_size:, :] *= zxbcdt_multipliers[4]
+        elif "lm_head" in name:
+            tensor = tensor * self.hparams["lm_head_multiplier"]
+        elif "embed_tokens" in name:
+            tensor = tensor * self.hparams["embedding_multiplier"]
+        elif "mamba.norm" in name:
+            tensor = tensor.reshape(self.n_group, self.d_inner // self.n_group)
+
+        tensors = [(tensors[0][0], tensor)]
+        return tensors
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        ## General Params ##
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+        # Override some Mamba2 defaults
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_context_length(self.hparams.get("max_position_embeddings", 0))
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+
+        ## Attention params ##
+        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])  # Override value 0 from Mamba2
+        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
+        self.gguf_writer.add_key_length(self.hparams["head_dim"])
+        self.gguf_writer.add_value_length(self.hparams["head_dim"])
+
+        ## Validation ##
+        assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported"
+        assert self.d_inner % self.d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {self.d_head}"
+
+        # Add any other Falcon Mamba2 specific configuration
+        self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
+
+
+@ModelBase.register("HunYuanMoEV1ForCausalLM")
+class HunYuanMoEModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # For handling tied embeddings
+        self._tok_embd = None
+
+    def set_vocab(self):
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+
+        # 1. Get the pre-tokenizer identifier hash
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        # 2. Reverse-engineer the merges list from mergeable_ranks
+        merges = []
+        vocab = {}
+        mergeable_ranks = tokenizer.mergeable_ranks
+        for token, rank in mergeable_ranks.items():
+            vocab[QwenModel.token_bytes_to_string(token)] = rank
+            if len(token) == 1:
+                continue
+            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+            if len(merged) == 2:  # todo this is an assert in Qwen, why?
+                merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
+
+        # 3. Generate the tokens and toktypes lists
+        vocab_size = self.hparams["vocab_size"]
+        assert tokenizer.vocab_size == vocab_size
+        special_tokens = tokenizer.special_tokens
+        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
+        tokens: list[str] = []
+        toktypes: list[int] = []
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                token = reverse_vocab[i]
+                tokens.append(token)
+                if i in special_tokens.values():
+                    toktypes.append(gguf.TokenType.CONTROL)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+
+        # 4. Write all vocab-related fields to the GGUF writer
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_token_merges(merges)
+
+        # 5. Add special tokens and chat templates
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+        special_vocab.add_to_gguf(self.gguf_writer)
+        # FIX for BOS token: Overwrite incorrect id read from config.json
+        self.gguf_writer.add_bos_token_id(127959)  # <|bos|>
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+
+        self.gguf_writer.add_expert_count(hparams["num_experts"])
+        self.gguf_writer.add_expert_shared_feed_forward_length(hparams["intermediate_size"])
+
+        moe_intermediate_size = hparams["moe_intermediate_size"]
+        assert all(n == moe_intermediate_size[0] for n in moe_intermediate_size)
+        self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size[0])
+
+        moe_topk = hparams["moe_topk"]
+        assert all(topk == moe_topk[0] for topk in moe_topk)
+        self.gguf_writer.add_expert_used_count(moe_topk[0])
+
+        moe_shared_expert = hparams["num_shared_expert"]
+        assert all(n == moe_shared_expert[0] for n in moe_shared_expert)
+        self.gguf_writer.add_expert_shared_count(moe_shared_expert[0])
+
+        # Rope
+        rope_scaling = hparams.get("rope_scaling", {})
+        if rope_scaling.get("type") == "dynamic":
+            # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
+            # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
+            alpha = rope_scaling.get("alpha", 1000)
+            base = hparams.get("rope_theta", 10000.0)
+            dim = (hparams["hidden_size"] // hparams["num_attention_heads"])  # 128
+            scaled_base = base * (alpha ** (dim / (dim - 2)))  # 10000 * (1000 ** (128 / 126)) = 11158839.9251
+            self.gguf_writer.add_rope_freq_base(scaled_base)
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+            self.gguf_writer.add_rope_scaling_factor(1)
+            # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024)  # 256k context length
+            self.gguf_writer.add_context_length(256 * 1024)  # 256k context length
+
+            # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
+            assert alpha == 1000 and base == 10000.0 and dim == 128 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
+                "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name == "model.embed_tokens.weight":
+            self._tok_embd = data_torch.clone()
+
+        if name == "lm_head.weight":
+            if self.hparams.get("tie_word_embeddings", False):
+                logger.info("Skipping tied output layer 'lm_head.weight'")
+                return []
+
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                # merge the experts into a single 3d tensor
+                tensors: list[tuple[str, Tensor]] = []
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+                    new_name = self.map_tensor_name(merged_name)
+                    tensors.append((new_name, data_torch))
+
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+        if self._experts is not None:
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@ModelBase.register("SmolLM3ForCausalLM")
+class SmolLM3Model(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.SMOLLM3
+
+    def set_vocab(self):
+        super().set_vocab()
+        # remove unsupported array slicing in chat template
+        # ref: https://huggingface.co/ggml-org/SmolLM3-3B-GGUF/discussions/1
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        if tokenizer.chat_template is not None:
+            chat_template = tokenizer.chat_template.replace("[:]", "")
+            self.gguf_writer.add_chat_template(chat_template)
+
+
+@ModelBase.register("Lfm2ForCausalLM")
+@ModelBase.register("LFM2ForCausalLM")
+class LFM2Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.LFM2
+
+    def _add_feed_forward_length(self):
+        ff_dim = self.hparams["block_ff_dim"]
+
+        auto_adjust_ff_dim = self.hparams["block_auto_adjust_ff_dim"]
+        ff_dim = self.hparams["block_ff_dim"]
+        ffn_dim_multiplier = self.hparams["block_ffn_dim_multiplier"]
+        multiple_of = self.hparams["block_multiple_of"]
+
+        if auto_adjust_ff_dim:
+            ff_dim = int(2 * ff_dim / 3)
+            # custom dim factor multiplier
+            if ffn_dim_multiplier is not None:
+                ff_dim = int(ffn_dim_multiplier * ff_dim)
+            ff_dim = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of)
+
+        self.gguf_writer.add_feed_forward_length(ff_dim)
+
+    def set_gguf_parameters(self):
+        # set num_key_value_heads only for attention layers
+        self.hparams["num_key_value_heads"] = [
+            self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0
+            for layer_type in self.hparams["layer_types"]
+        ]
+
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+        self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["norm_eps"])
+        self._add_feed_forward_length()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # conv op requires 2d tensor
+        if 'conv.conv' in name:
+            data_torch = data_torch.squeeze(1)
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 ###### CONVERSION LOGIC ######
 
 
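
The NTK-aware RoPE scaling in the HunYuanMoEModel hunk above can be reproduced standalone with the values the hunk asserts (alpha=1000, base=10000.0, dim=128):

alpha, base = 1000, 10000.0
dim = 128  # hidden_size // num_attention_heads
scaled_base = base * (alpha ** (dim / (dim - 2)))
print(f"{scaled_base:.4f}")  # 11158839.9251, matching the inline comment
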
@@ -6569,12 +7300,20 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str:
     # maybe we should fallback to text model's arch in that case, since not many models have both
     text_config = hparams.get("text_config", {})
     vision_config = hparams.get("vision_config", {})
-    arch = hparams["architectures"][0]
+    arch = None
+    if (arches := hparams.get("architectures")) is not None and len(arches) > 0:
+        arch = arches[0]
+    elif "ssm_cfg" in hparams:
+        # For non-hf Mamba and Mamba2 models
+        arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM"
+
     # if "architectures" is found in the sub-config, use that instead
     if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
         arch = text_config["architectures"][0]
     elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None:
         arch = vision_config["architectures"][0]
+    if arch is None:
+        raise ValueError("Failed to detect model architecture")
     return arch
 
 
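
The get_model_architecture change in the final hunk adds a fallback for non-HF Mamba checkpoints whose config carries an ssm_cfg block instead of an architectures list. Exercising just that branch (the sample config dict is illustrative):

hparams = {"ssm_cfg": {"layer": "Mamba2"}, "d_model": 2560}
arch = None
if (arches := hparams.get("architectures")) is not None and len(arches) > 0:
    arch = arches[0]
elif "ssm_cfg" in hparams:
    # "layer" defaults to "Mamba" for original Mamba-1 checkpoints
    arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM"
assert arch == "Mamba2ForCausalLM"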