@novastera-oss/llamarn 0.2.9 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +0 -1
- package/cpp/llama.cpp/README.md +4 -5
- package/cpp/llama.cpp/build-xcframework.sh +1 -1
- package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
- package/cpp/llama.cpp/common/arg.cpp +17 -0
- package/cpp/llama.cpp/common/chat.cpp +37 -20
- package/cpp/llama.cpp/common/chat.h +2 -0
- package/cpp/llama.cpp/common/common.h +4 -0
- package/cpp/llama.cpp/convert_hf_to_gguf.py +745 -6
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +9 -0
- package/cpp/llama.cpp/ggml/CMakeLists.txt +7 -2
- package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/cpp/llama.cpp/ggml/include/ggml.h +173 -10
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +36 -18
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +68 -5
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +16 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +28 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1203 -163
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +142 -9
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +8 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +14 -12
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +5 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +15 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +8 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +185 -79
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +2 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
- package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +11 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -5
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +64 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +35 -9
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +167 -39
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +254 -57
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +505 -40
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +693 -1034
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +60 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +15 -18
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +711 -292
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +58 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +23 -3
- package/cpp/llama.cpp/ggml/src/ggml.c +382 -61
- package/cpp/llama.cpp/ggml/src/gguf.cpp +8 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +209 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +8 -2
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +73 -21
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +12 -3
- package/cpp/llama.cpp/include/llama.h +0 -40
- package/cpp/llama.cpp/src/llama-arch.cpp +210 -3
- package/cpp/llama.cpp/src/llama-arch.h +18 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +27 -1
- package/cpp/llama.cpp/src/llama-batch.h +8 -1
- package/cpp/llama.cpp/src/llama-chat.cpp +15 -0
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-graph.cpp +119 -184
- package/cpp/llama.cpp/src/llama-graph.h +47 -60
- package/cpp/llama.cpp/src/llama-hparams.cpp +7 -1
- package/cpp/llama.cpp/src/llama-hparams.h +3 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +28 -18
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +4 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +214 -65
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +62 -24
- package/cpp/llama.cpp/src/llama-kv-cells.h +62 -10
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +9 -4
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +3 -1
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +20 -10
- package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
- package/cpp/llama.cpp/src/llama-memory.h +3 -0
- package/cpp/llama.cpp/src/llama-model.cpp +2530 -685
- package/cpp/llama.cpp/src/llama-model.h +18 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +1 -0
- package/cpp/llama.cpp/src/llama-vocab.cpp +13 -2
- package/cpp/llama.cpp/src/llama-vocab.h +41 -0
- package/ios/include/chat.h +2 -0
- package/ios/include/common.h +4 -0
- package/ios/include/llama.h +0 -40
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5055 -4886
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3764
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3891 -3766
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4890
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3764
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5091 -4922
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5066 -4897
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3919 -3794
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
- package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
|
@@ -40,16 +40,21 @@ const char * llm_type_name(llm_type type) {
|
|
|
40
40
|
case LLM_TYPE_190M: return "190M";
|
|
41
41
|
case LLM_TYPE_220M: return "220M";
|
|
42
42
|
case LLM_TYPE_250M: return "250M";
|
|
43
|
+
case LLM_TYPE_256M: return "256M";
|
|
43
44
|
case LLM_TYPE_270M: return "270M";
|
|
44
45
|
case LLM_TYPE_335M: return "335M";
|
|
46
|
+
case LLM_TYPE_350M: return "350M";
|
|
45
47
|
case LLM_TYPE_410M: return "410M";
|
|
46
48
|
case LLM_TYPE_450M: return "450M";
|
|
47
49
|
case LLM_TYPE_475M: return "475M";
|
|
50
|
+
case LLM_TYPE_700M: return "700M";
|
|
48
51
|
case LLM_TYPE_770M: return "770M";
|
|
49
52
|
case LLM_TYPE_780M: return "780M";
|
|
53
|
+
case LLM_TYPE_0_3B: return "0.3B";
|
|
50
54
|
case LLM_TYPE_0_5B: return "0.5B";
|
|
51
55
|
case LLM_TYPE_0_6B: return "0.6B";
|
|
52
56
|
case LLM_TYPE_1B: return "1B";
|
|
57
|
+
case LLM_TYPE_1_2B: return "1.2B";
|
|
53
58
|
case LLM_TYPE_1_3B: return "1.3B";
|
|
54
59
|
case LLM_TYPE_1_4B: return "1.4B";
|
|
55
60
|
case LLM_TYPE_1_5B: return "1.5B";
|
|
@@ -101,6 +106,7 @@ const char * llm_type_name(llm_type type) {
|
|
|
101
106
|
case LLM_TYPE_57B_A14B: return "57B.A14B";
|
|
102
107
|
case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
|
|
103
108
|
case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
|
|
109
|
+
case LLM_TYPE_A13B: return "A13B";
|
|
104
110
|
case LLM_TYPE_30B_A3B: return "30B.A3B";
|
|
105
111
|
case LLM_TYPE_235B_A22B: return "235B.A22B";
|
|
106
112
|
case LLM_TYPE_E2B: return "E2B";
|
|
@@ -207,23 +213,27 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
|
|
|
207
213
|
} break;
|
|
208
214
|
case GGML_OP_SSM_CONV:
|
|
209
215
|
{
|
|
210
|
-
|
|
211
|
-
|
|
216
|
+
const int64_t n_seq_tokens = 512;
|
|
217
|
+
const int64_t n_seqs = 3;
|
|
218
|
+
ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
|
|
212
219
|
op_tensor = ggml_ssm_conv(ctx, conv_x, w);
|
|
213
220
|
} break;
|
|
214
221
|
case GGML_OP_SSM_SCAN:
|
|
215
222
|
{
|
|
216
|
-
//
|
|
217
|
-
const int64_t d_state = w->ne[0];
|
|
218
|
-
const int64_t
|
|
223
|
+
// w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
|
|
224
|
+
const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
|
|
225
|
+
const int64_t n_head = w->ne[1];
|
|
226
|
+
const int64_t head_dim = hparams.ssm_d_inner / n_head;
|
|
227
|
+
const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
|
|
219
228
|
const int64_t n_seq_tokens = 512;
|
|
220
|
-
const int64_t n_seqs =
|
|
221
|
-
ggml_tensor * s
|
|
222
|
-
ggml_tensor * x
|
|
223
|
-
ggml_tensor * dt
|
|
224
|
-
ggml_tensor * B
|
|
225
|
-
ggml_tensor * C
|
|
226
|
-
|
|
229
|
+
const int64_t n_seqs = 3;
|
|
230
|
+
ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
|
|
231
|
+
ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
|
|
232
|
+
ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
|
|
233
|
+
ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
|
|
234
|
+
ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
|
|
235
|
+
ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
|
|
236
|
+
op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
|
|
227
237
|
} break;
|
|
228
238
|
case GGML_OP_RWKV_WKV6:
|
|
229
239
|
{
|
|
@@ -575,6 +585,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
575
585
|
case 22: type = LLM_TYPE_1B; break;
|
|
576
586
|
case 26: type = LLM_TYPE_3B; break;
|
|
577
587
|
case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
|
|
588
|
+
case 30: type = LLM_TYPE_256M; break; // smoldocling 256M
|
|
578
589
|
// granite uses a vocab with len 49152
|
|
579
590
|
case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
|
|
580
591
|
case 36: type = LLM_TYPE_8B; break; // granite
|
|
@@ -1080,6 +1091,58 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1080
1091
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1081
1092
|
}
|
|
1082
1093
|
} break;
|
|
1094
|
+
case LLM_ARCH_MAMBA2:
|
|
1095
|
+
{
|
|
1096
|
+
ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
|
|
1097
|
+
ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
|
|
1098
|
+
ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
|
|
1099
|
+
ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
|
|
1100
|
+
ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
|
|
1101
|
+
|
|
1102
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1103
|
+
|
|
1104
|
+
switch (hparams.n_layer) {
|
|
1105
|
+
case 24:
|
|
1106
|
+
switch (hparams.n_embd) {
|
|
1107
|
+
case 768: type = LLM_TYPE_SMALL; break;
|
|
1108
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1109
|
+
} break;
|
|
1110
|
+
case 48:
|
|
1111
|
+
switch (hparams.n_embd) {
|
|
1112
|
+
case 1024: type = LLM_TYPE_MEDIUM; break;
|
|
1113
|
+
case 1536: type = LLM_TYPE_LARGE; break;
|
|
1114
|
+
case 2048: type = LLM_TYPE_XL; break;
|
|
1115
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1116
|
+
} break;
|
|
1117
|
+
case 64:
|
|
1118
|
+
switch (hparams.n_embd) {
|
|
1119
|
+
case 2560: type = LLM_TYPE_3B; break;
|
|
1120
|
+
case 4096: type = LLM_TYPE_7B; break;
|
|
1121
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1122
|
+
} break;
|
|
1123
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1124
|
+
}
|
|
1125
|
+
} break;
|
|
1126
|
+
case LLM_ARCH_JAMBA:
|
|
1127
|
+
{
|
|
1128
|
+
ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
|
|
1129
|
+
ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
|
|
1130
|
+
ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
|
|
1131
|
+
ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
|
|
1132
|
+
|
|
1133
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1134
|
+
|
|
1135
|
+
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
|
|
1136
|
+
hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
|
|
1137
|
+
}
|
|
1138
|
+
|
|
1139
|
+
switch (hparams.n_layer) {
|
|
1140
|
+
// TODO: Jamba layers are a bit heterogenous, so naming this is hard.
|
|
1141
|
+
case 12: // 900M 8x???M
|
|
1142
|
+
case 32: // 51B 16x?B
|
|
1143
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1144
|
+
}
|
|
1145
|
+
} break;
|
|
1083
1146
|
case LLM_ARCH_XVERSE:
|
|
1084
1147
|
{
|
|
1085
1148
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
@@ -1446,6 +1509,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1446
1509
|
ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
|
|
1447
1510
|
ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
|
|
1448
1511
|
|
|
1512
|
+
// Granite uses rope_finetuned as a switch for rope, so default to true
|
|
1513
|
+
bool rope_finetuned = true;
|
|
1514
|
+
ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
|
|
1515
|
+
hparams.rope_finetuned = rope_finetuned;
|
|
1516
|
+
|
|
1449
1517
|
switch (hparams.n_layer) {
|
|
1450
1518
|
case 32: type = LLM_TYPE_3B; break;
|
|
1451
1519
|
case 40: type = LLM_TYPE_3B; break;
|
|
@@ -1453,6 +1521,40 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1453
1521
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1454
1522
|
}
|
|
1455
1523
|
|
|
1524
|
+
// For Granite MoE Shared
|
|
1525
|
+
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
|
|
1526
|
+
} break;
|
|
1527
|
+
case LLM_ARCH_GRANITE_HYBRID:
|
|
1528
|
+
{
|
|
1529
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1530
|
+
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /* required */ false);
|
|
1531
|
+
ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /* required */ false);
|
|
1532
|
+
ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /* required */ false);
|
|
1533
|
+
ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale, /* required */ false);
|
|
1534
|
+
|
|
1535
|
+
ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
|
|
1536
|
+
ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
|
|
1537
|
+
ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
|
|
1538
|
+
ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
|
|
1539
|
+
ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
|
|
1540
|
+
|
|
1541
|
+
// Granite uses rope_finetuned as a switch for rope, so default to true
|
|
1542
|
+
bool rope_finetuned = true;
|
|
1543
|
+
ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
|
|
1544
|
+
hparams.rope_finetuned = rope_finetuned;
|
|
1545
|
+
|
|
1546
|
+
// A layer is recurrent IFF the n_head_kv value is set to 0
|
|
1547
|
+
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
|
|
1548
|
+
hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
|
|
1549
|
+
}
|
|
1550
|
+
|
|
1551
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1552
|
+
|
|
1553
|
+
switch (hparams.n_layer) {
|
|
1554
|
+
// TODO: Add llm type label (not sure this is useful)
|
|
1555
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1556
|
+
}
|
|
1557
|
+
|
|
1456
1558
|
// For Granite MoE Shared
|
|
1457
1559
|
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
|
|
1458
1560
|
} break;
|
|
@@ -1504,6 +1606,80 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1504
1606
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1505
1607
|
}
|
|
1506
1608
|
} break;
|
|
1609
|
+
case LLM_ARCH_ERNIE4_5:
|
|
1610
|
+
{
|
|
1611
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1612
|
+
switch (hparams.n_layer) {
|
|
1613
|
+
case 18: type = LLM_TYPE_0_3B; break;
|
|
1614
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1615
|
+
}
|
|
1616
|
+
} break;
|
|
1617
|
+
case LLM_ARCH_FALCON_H1:
|
|
1618
|
+
{
|
|
1619
|
+
// Common parameters
|
|
1620
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1621
|
+
|
|
1622
|
+
// SSM parameters
|
|
1623
|
+
ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
|
|
1624
|
+
ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
|
|
1625
|
+
ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
|
|
1626
|
+
ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
|
|
1627
|
+
ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
|
|
1628
|
+
|
|
1629
|
+
std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true);
|
|
1630
|
+
|
|
1631
|
+
switch (hparams.n_layer) {
|
|
1632
|
+
case 36:
|
|
1633
|
+
type = LLM_TYPE_0_5B; break;
|
|
1634
|
+
case 24:
|
|
1635
|
+
type = LLM_TYPE_1_5B; break;
|
|
1636
|
+
case 66:
|
|
1637
|
+
type = LLM_TYPE_1B; break;
|
|
1638
|
+
case 32:
|
|
1639
|
+
type = LLM_TYPE_3B; break;
|
|
1640
|
+
case 44:
|
|
1641
|
+
type = LLM_TYPE_7B; break;
|
|
1642
|
+
case 72:
|
|
1643
|
+
type = LLM_TYPE_34B; break;
|
|
1644
|
+
default:
|
|
1645
|
+
type = LLM_TYPE_UNKNOWN;
|
|
1646
|
+
}
|
|
1647
|
+
} break;
|
|
1648
|
+
case LLM_ARCH_HUNYUAN_MOE:
|
|
1649
|
+
{
|
|
1650
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1651
|
+
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
1652
|
+
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
|
|
1653
|
+
|
|
1654
|
+
switch (hparams.n_layer) {
|
|
1655
|
+
case 32: type = LLM_TYPE_A13B; break;
|
|
1656
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1657
|
+
}
|
|
1658
|
+
} break;
|
|
1659
|
+
case LLM_ARCH_SMOLLM3:
|
|
1660
|
+
{
|
|
1661
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1662
|
+
hparams.n_no_rope_layer_step = 4;
|
|
1663
|
+
|
|
1664
|
+
switch (hparams.n_layer) {
|
|
1665
|
+
case 36: type = LLM_TYPE_3B; break;
|
|
1666
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1667
|
+
}
|
|
1668
|
+
} break;
|
|
1669
|
+
case LLM_ARCH_LFM2:
|
|
1670
|
+
{
|
|
1671
|
+
ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
|
|
1672
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1673
|
+
for (uint32_t il = 0; il < hparams.n_layer; ++il) {
|
|
1674
|
+
hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
|
|
1675
|
+
}
|
|
1676
|
+
switch (hparams.n_embd) {
|
|
1677
|
+
case 1024: type = LLM_TYPE_350M; break;
|
|
1678
|
+
case 1536: type = LLM_TYPE_700M; break;
|
|
1679
|
+
case 2048: type = LLM_TYPE_1_2B; break;
|
|
1680
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1681
|
+
}
|
|
1682
|
+
} break;
|
|
1507
1683
|
default: throw std::runtime_error("unsupported model architecture");
|
|
1508
1684
|
}
|
|
1509
1685
|
|
|
@@ -3115,6 +3291,228 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
3115
3291
|
layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
|
|
3116
3292
|
}
|
|
3117
3293
|
} break;
|
|
3294
|
+
case LLM_ARCH_MAMBA2:
|
|
3295
|
+
{
|
|
3296
|
+
const int64_t d_conv = hparams.ssm_d_conv;
|
|
3297
|
+
const int64_t d_inner = hparams.ssm_d_inner;
|
|
3298
|
+
const int64_t d_state = hparams.ssm_d_state;
|
|
3299
|
+
const int64_t n_head = hparams.ssm_dt_rank;
|
|
3300
|
+
const int64_t n_group = hparams.ssm_n_group;
|
|
3301
|
+
const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;
|
|
3302
|
+
|
|
3303
|
+
// only an expansion factor of 2 is supported for now
|
|
3304
|
+
GGML_ASSERT(2 * n_embd == d_inner);
|
|
3305
|
+
|
|
3306
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
3307
|
+
|
|
3308
|
+
// output
|
|
3309
|
+
{
|
|
3310
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
3311
|
+
|
|
3312
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
3313
|
+
// if output is NULL, init from the input tok embed, duplicated to allow offloading
|
|
3314
|
+
if (output == NULL) {
|
|
3315
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
3316
|
+
}
|
|
3317
|
+
}
|
|
3318
|
+
|
|
3319
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
3320
|
+
auto & layer = layers[i];
|
|
3321
|
+
|
|
3322
|
+
// norm
|
|
3323
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
3324
|
+
|
|
3325
|
+
layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
|
|
3326
|
+
|
|
3327
|
+
layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
|
|
3328
|
+
layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0);
|
|
3329
|
+
|
|
3330
|
+
layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0);
|
|
3331
|
+
|
|
3332
|
+
// no "weight" suffix for these
|
|
3333
|
+
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
|
|
3334
|
+
layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0);
|
|
3335
|
+
|
|
3336
|
+
layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
|
|
3337
|
+
|
|
3338
|
+
// out_proj
|
|
3339
|
+
layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
|
|
3340
|
+
}
|
|
3341
|
+
} break;
|
|
3342
|
+
case LLM_ARCH_JAMBA:
|
|
3343
|
+
{
|
|
3344
|
+
const int64_t d_conv = hparams.ssm_d_conv;
|
|
3345
|
+
const int64_t d_inner = hparams.ssm_d_inner;
|
|
3346
|
+
const int64_t d_state = hparams.ssm_d_state;
|
|
3347
|
+
const int64_t dt_rank = hparams.ssm_dt_rank;
|
|
3348
|
+
|
|
3349
|
+
// only an expansion factor of 2 is supported for now
|
|
3350
|
+
GGML_ASSERT(2 * n_embd == d_inner);
|
|
3351
|
+
|
|
3352
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
3353
|
+
|
|
3354
|
+
// output
|
|
3355
|
+
{
|
|
3356
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
3357
|
+
|
|
3358
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
3359
|
+
// if output is NULL, init from the input tok embed, duplicated to allow offloading
|
|
3360
|
+
if (output == NULL) {
|
|
3361
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
3362
|
+
}
|
|
3363
|
+
}
|
|
3364
|
+
|
|
3365
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
3366
|
+
const int64_t n_head_kv = hparams.n_head_kv(i);
|
|
3367
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
|
|
3368
|
+
|
|
3369
|
+
auto & layer = layers[i];
|
|
3370
|
+
|
|
3371
|
+
// norm
|
|
3372
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
3373
|
+
|
|
3374
|
+
if (n_head_kv == 0) {
|
|
3375
|
+
// Mamba layer
|
|
3376
|
+
layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
|
|
3377
|
+
|
|
3378
|
+
layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
|
|
3379
|
+
layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
|
|
3380
|
+
|
|
3381
|
+
layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
|
|
3382
|
+
|
|
3383
|
+
layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, "weight", i), {dt_rank}, 0);
|
|
3384
|
+
|
|
3385
|
+
layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
|
|
3386
|
+
layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
|
|
3387
|
+
|
|
3388
|
+
layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, "weight", i), {d_state}, 0);
|
|
3389
|
+
layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, "weight", i), {d_state}, 0);
|
|
3390
|
+
|
|
3391
|
+
// no "weight" suffix for these
|
|
3392
|
+
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
|
|
3393
|
+
layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
|
|
3394
|
+
|
|
3395
|
+
// out_proj
|
|
3396
|
+
layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
|
|
3397
|
+
} else {
|
|
3398
|
+
// Attention layers
|
|
3399
|
+
|
|
3400
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
|
3401
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
3402
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
3403
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
3404
|
+
}
|
|
3405
|
+
|
|
3406
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
3407
|
+
|
|
3408
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
|
|
3409
|
+
|
|
3410
|
+
if (layer.ffn_gate_inp) {
|
|
3411
|
+
// MoE
|
|
3412
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
|
|
3413
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
|
|
3414
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
|
|
3415
|
+
} else {
|
|
3416
|
+
// FFN (no MoE)
|
|
3417
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
3418
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
3419
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
3420
|
+
}
|
|
3421
|
+
}
|
|
3422
|
+
} break;
|
|
3423
|
+
case LLM_ARCH_GRANITE_HYBRID:
|
|
3424
|
+
{
|
|
3425
|
+
// mamba2 Mixer SSM params
|
|
3426
|
+
// NOTE: int64_t for tensor dimensions
|
|
3427
|
+
const int64_t d_conv = hparams.ssm_d_conv;
|
|
3428
|
+
const int64_t d_inner = hparams.ssm_d_inner;
|
|
3429
|
+
const int64_t d_state = hparams.ssm_d_state;
|
|
3430
|
+
const int64_t n_ssm_head = hparams.ssm_dt_rank;
|
|
3431
|
+
const int64_t n_group = hparams.ssm_n_group;
|
|
3432
|
+
const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
|
|
3433
|
+
|
|
3434
|
+
// only an expansion factor of 2 is supported for now
|
|
3435
|
+
GGML_ASSERT(2 * n_embd == d_inner);
|
|
3436
|
+
|
|
3437
|
+
// embeddings
|
|
3438
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
3439
|
+
|
|
3440
|
+
// output
|
|
3441
|
+
{
|
|
3442
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
3443
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
3444
|
+
// if output is NULL, init from the input tok embed, duplicated to allow offloading
|
|
3445
|
+
if (output == NULL) {
|
|
3446
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
3447
|
+
}
|
|
3448
|
+
}
|
|
3449
|
+
|
|
3450
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
3451
|
+
auto & layer = layers[i];
|
|
3452
|
+
|
|
3453
|
+
// norm
|
|
3454
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
3455
|
+
|
|
3456
|
+
if (hparams.is_recurrent(i)) {
|
|
3457
|
+
// ssm layers
|
|
3458
|
+
layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
|
|
3459
|
+
|
|
3460
|
+
layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
|
|
3461
|
+
layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
|
|
3462
|
+
|
|
3463
|
+
layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
|
|
3464
|
+
|
|
3465
|
+
// no "weight" suffix for these
|
|
3466
|
+
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
|
|
3467
|
+
layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
|
|
3468
|
+
|
|
3469
|
+
layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
|
|
3470
|
+
|
|
3471
|
+
// out_proj
|
|
3472
|
+
layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
|
|
3473
|
+
} else {
|
|
3474
|
+
// attention layers (with optional bias)
|
|
3475
|
+
const int64_t n_head_i = hparams.n_head(i);
|
|
3476
|
+
const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
|
|
3477
|
+
const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
|
|
3478
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
|
|
3479
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
|
|
3480
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
|
|
3481
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
|
|
3482
|
+
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
3483
|
+
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
|
|
3484
|
+
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
|
|
3485
|
+
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
3486
|
+
}
|
|
3487
|
+
|
|
3488
|
+
// feed forward (w/ optional biases)
|
|
3489
|
+
if (n_expert > 0) {
|
|
3490
|
+
// MoE FFN
|
|
3491
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
3492
|
+
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
|
3493
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
|
3494
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
|
|
3495
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
|
|
3496
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
|
|
3497
|
+
|
|
3498
|
+
// For Granite MoE Shared
|
|
3499
|
+
if (hparams.n_ff_shexp > 0) {
|
|
3500
|
+
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
|
|
3501
|
+
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
|
|
3502
|
+
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
|
|
3503
|
+
}
|
|
3504
|
+
} else {
|
|
3505
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
3506
|
+
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
|
3507
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
3508
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
3509
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
3510
|
+
layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
|
|
3511
|
+
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
3512
|
+
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
|
|
3513
|
+
}
|
|
3514
|
+
}
|
|
3515
|
+
} break;
|
|
3118
3516
|
case LLM_ARCH_XVERSE:
|
|
3119
3517
|
{
|
|
3120
3518
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
@@ -4348,16 +4746,226 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
4348
4746
|
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
4349
4747
|
}
|
|
4350
4748
|
} break;
|
|
4351
|
-
|
|
4352
|
-
|
|
4353
|
-
|
|
4749
|
+
case LLM_ARCH_ERNIE4_5:
|
|
4750
|
+
{
|
|
4751
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
4354
4752
|
|
|
4355
|
-
|
|
4356
|
-
|
|
4357
|
-
|
|
4358
|
-
|
|
4359
|
-
|
|
4360
|
-
|
|
4753
|
+
// output
|
|
4754
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
4755
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
4756
|
+
// if output is NULL, init from the input tok embed
|
|
4757
|
+
if (output == NULL) {
|
|
4758
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
4759
|
+
}
|
|
4760
|
+
|
|
4761
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
4762
|
+
auto & layer = layers[i];
|
|
4763
|
+
|
|
4764
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
4765
|
+
|
|
4766
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
4767
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
4768
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
4769
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
|
4770
|
+
|
|
4771
|
+
// optional bias tensors
|
|
4772
|
+
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
4773
|
+
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
4774
|
+
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
4775
|
+
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
4776
|
+
|
|
4777
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
4778
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
4779
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
4780
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
4781
|
+
}
|
|
4782
|
+
} break;
|
|
4783
|
+
case LLM_ARCH_FALCON_H1:
|
|
4784
|
+
{
|
|
4785
|
+
// Common
|
|
4786
|
+
const int64_t hidden_size = hparams.n_embd; // hidden_size
|
|
4787
|
+
|
|
4788
|
+
// mamba2 Mixer SSM params
|
|
4789
|
+
const int64_t ssm_conv_kernel_size = hparams.ssm_d_conv; // ssm_conv_kernel_size
|
|
4790
|
+
const int64_t ssm_n_groups = hparams.ssm_n_group; // ssm_n_groups
|
|
4791
|
+
const int64_t ssm_state_size = hparams.ssm_d_state; // ssm_state_size
|
|
4792
|
+
const int64_t ssm_intermediate_size = hparams.ssm_d_inner; // TODO expand
|
|
4793
|
+
const int64_t ssm_num_heads = hparams.ssm_dt_rank; // ssm_num_heads
|
|
4794
|
+
const int64_t ssm_conv_dim = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size;
|
|
4795
|
+
const int64_t ssm_projection_size = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads;
|
|
4796
|
+
|
|
4797
|
+
// attn params
|
|
4798
|
+
const int64_t attn_num_attention_head = hparams.n_head(0); // rename to: attn_num_attention_head
|
|
4799
|
+
const int64_t attn_num_key_value_head = hparams.n_head_kv(0);
|
|
4800
|
+
|
|
4801
|
+
// ffn params
|
|
4802
|
+
const int64_t ffn_intermediate_size = hparams.n_ff(0);
|
|
4803
|
+
|
|
4804
|
+
// embeddings
|
|
4805
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0);
|
|
4806
|
+
|
|
4807
|
+
// output
|
|
4808
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
4809
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0);
|
|
4810
|
+
|
|
4811
|
+
// if output is NULL, init from the input tok embed
|
|
4812
|
+
if (output == NULL) {
|
|
4813
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED);
|
|
4814
|
+
}
|
|
4815
|
+
|
|
4816
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
4817
|
+
auto & layer = layers[i];
|
|
4818
|
+
|
|
4819
|
+
/*SSM LAYERS*/
|
|
4820
|
+
// ssm in
|
|
4821
|
+
layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0);
|
|
4822
|
+
// ssm 1d conv
|
|
4823
|
+
layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0);
|
|
4824
|
+
layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, TENSOR_NOT_REQUIRED);
|
|
4825
|
+
// ssm_dt
|
|
4826
|
+
layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {ssm_num_heads}, 0);
|
|
4827
|
+
// no "weight" suffix for these
|
|
4828
|
+
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0);
|
|
4829
|
+
layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0);
|
|
4830
|
+
// ssm_norm
|
|
4831
|
+
layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, TENSOR_NOT_REQUIRED);
|
|
4832
|
+
// out_proj
|
|
4833
|
+
layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0);
|
|
4834
|
+
|
|
4835
|
+
/*ATTENTION LAYERS*/
|
|
4836
|
+
// attention layers (with optional bias)
|
|
4837
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {hidden_size, n_embd_head_k * attn_num_attention_head}, 0);
|
|
4838
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_k}, 0);
|
|
4839
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_v}, 0);
|
|
4840
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0);
|
|
4841
|
+
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
|
|
4842
|
+
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {attn_num_key_value_head * n_embd_head_k}, TENSOR_NOT_REQUIRED);
|
|
4843
|
+
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {attn_num_key_value_head * n_embd_head_v}, TENSOR_NOT_REQUIRED);
|
|
4844
|
+
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
|
|
4845
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0);
|
|
4846
|
+
|
|
4847
|
+
|
|
4848
|
+
// feed forward (w/ optional biases)
|
|
4849
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, i), {hidden_size}, 0);
|
|
4850
|
+
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
|
4851
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
|
|
4852
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { ffn_intermediate_size, hidden_size}, 0);
|
|
4853
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
|
|
4854
|
+
|
|
4855
|
+
layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
|
|
4856
|
+
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
|
|
4857
|
+
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
|
|
4858
|
+
}
|
|
4859
|
+
} break;
|
|
4860
|
+
case LLM_ARCH_HUNYUAN_MOE:
|
|
4861
|
+
{
|
|
4862
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
4863
|
+
|
|
4864
|
+
// output
|
|
4865
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
4866
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
4867
|
+
// if output is NULL, init from the input tok embed
|
|
4868
|
+
if (output == NULL) {
|
|
4869
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
4870
|
+
}
|
|
4871
|
+
|
|
4872
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
4873
|
+
auto & layer = layers[i];
|
|
4874
|
+
|
|
4875
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
4876
|
+
|
|
4877
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
4878
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
4879
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
|
4880
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
|
4881
|
+
|
|
4882
|
+
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
4883
|
+
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
4884
|
+
|
|
4885
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
4886
|
+
|
|
4887
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
|
4888
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
|
|
4889
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
|
|
4890
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
|
|
4891
|
+
|
|
4892
|
+
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
|
|
4893
|
+
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
|
|
4894
|
+
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
|
|
4895
|
+
}
|
|
4896
|
+
} break;
|
|
4897
|
+
case LLM_ARCH_SMOLLM3:
|
|
4898
|
+
{
|
|
4899
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
4900
|
+
|
|
4901
|
+
// output
|
|
4902
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
4903
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
4904
|
+
|
|
4905
|
+
// if output is NULL, init from the input tok embed
|
|
4906
|
+
if (output == NULL) {
|
|
4907
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
4908
|
+
}
|
|
4909
|
+
|
|
4910
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
4911
|
+
auto & layer = layers[i];
|
|
4912
|
+
|
|
4913
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
4914
|
+
|
|
4915
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
4916
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
4917
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
|
4918
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
|
4919
|
+
|
|
4920
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
4921
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
4922
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
4923
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
4924
|
+
}
|
|
4925
|
+
} break;
|
|
4926
|
+
case LLM_ARCH_LFM2:
|
|
4927
|
+
{
|
|
4928
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
4929
|
+
tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
|
|
4930
|
+
|
|
4931
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
4932
|
+
auto & layer = layers[i];
|
|
4933
|
+
// ffn is same for transformer and conv layers
|
|
4934
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
4935
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
4936
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
4937
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
4938
|
+
|
|
4939
|
+
// for operator_norm
|
|
4940
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
4941
|
+
|
|
4942
|
+
if (!hparams.is_recurrent(i)) {
|
|
4943
|
+
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
4944
|
+
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
4945
|
+
GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa);
|
|
4946
|
+
|
|
4947
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
|
4948
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, hparams.n_embd_k_gqa(i)}, 0);
|
|
4949
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, hparams.n_embd_v_gqa(i)}, 0);
|
|
4950
|
+
|
|
4951
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
4952
|
+
} else {
|
|
4953
|
+
layer.shortconv.conv = create_tensor(tn(LLM_TENSOR_SHORTCONV_CONV, "weight", i), {hparams.n_shortconv_l_cache, n_embd}, 0);
|
|
4954
|
+
layer.shortconv.in_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_INPROJ, "weight", i), {n_embd, 3 * n_embd}, 0);
|
|
4955
|
+
layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
|
|
4956
|
+
}
|
|
4957
|
+
}
|
|
4958
|
+
} break;
|
|
4959
|
+
default:
|
|
4960
|
+
throw std::runtime_error("unknown architecture");
|
|
4961
|
+
}
|
|
4962
|
+
|
|
4963
|
+
if (n_moved_tensors > 0) {
|
|
4964
|
+
LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
|
|
4965
|
+
__func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
|
|
4966
|
+
ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
|
|
4967
|
+
}
|
|
4968
|
+
}
|
|
4361
4969
|
|
|
4362
4970
|
ml.done_getting_tensors();
|
|
4363
4971
|
|
|
@@ -4587,12 +5195,6 @@ void llama_model::print_info() const {
|
|
|
4587
5195
|
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
|
4588
5196
|
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
|
|
4589
5197
|
LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
|
|
4590
|
-
LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
|
|
4591
|
-
LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
|
|
4592
|
-
LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
|
|
4593
|
-
LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
|
|
4594
|
-
LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
|
|
4595
|
-
|
|
4596
5198
|
if (!classifier_labels.empty()) {
|
|
4597
5199
|
LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
|
|
4598
5200
|
|
|
@@ -4603,6 +5205,19 @@ void llama_model::print_info() const {
|
|
|
4603
5205
|
}
|
|
4604
5206
|
}
|
|
4605
5207
|
|
|
5208
|
+
if (arch == LLM_ARCH_MAMBA ||
|
|
5209
|
+
arch == LLM_ARCH_MAMBA2 ||
|
|
5210
|
+
arch == LLM_ARCH_JAMBA ||
|
|
5211
|
+
arch == LLM_ARCH_FALCON_H1 ||
|
|
5212
|
+
arch == LLM_ARCH_GRANITE_HYBRID) {
|
|
5213
|
+
LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
|
|
5214
|
+
LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
|
|
5215
|
+
LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
|
|
5216
|
+
LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
|
|
5217
|
+
LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
|
|
5218
|
+
LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
|
|
5219
|
+
}
|
|
5220
|
+
|
|
4606
5221
|
LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
|
|
4607
5222
|
if (pimpl->n_elements >= 1e12) {
|
|
4608
5223
|
LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
|
|
@@ -4649,7 +5264,8 @@ void llama_model::print_info() const {
|
|
|
4649
5264
|
|
|
4650
5265
|
if (arch == LLM_ARCH_MINICPM ||
|
|
4651
5266
|
arch == LLM_ARCH_GRANITE ||
|
|
4652
|
-
arch == LLM_ARCH_GRANITE_MOE
|
|
5267
|
+
arch == LLM_ARCH_GRANITE_MOE ||
|
|
5268
|
+
arch == LLM_ARCH_GRANITE_HYBRID) {
|
|
4653
5269
|
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
|
|
4654
5270
|
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
|
|
4655
5271
|
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
|
|
@@ -5539,12 +6155,10 @@ struct llm_build_falcon : public llm_graph_context {
|
|
|
5539
6155
|
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
|
5540
6156
|
cb(cur, "wqkv", il);
|
|
5541
6157
|
|
|
5542
|
-
ggml_tensor * Qcur =
|
|
5543
|
-
ggml_tensor * Kcur =
|
|
6158
|
+
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
6159
|
+
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
5544
6160
|
ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
5545
6161
|
|
|
5546
|
-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
5547
|
-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
5548
6162
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
5549
6163
|
|
|
5550
6164
|
// using mode = 2 for neox mode
|
|
@@ -5821,12 +6435,10 @@ struct llm_build_dbrx : public llm_graph_context {
|
|
|
5821
6435
|
cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
|
5822
6436
|
cb(cur, "wqkv_clamped", il);
|
|
5823
6437
|
|
|
5824
|
-
Qcur =
|
|
5825
|
-
Kcur =
|
|
6438
|
+
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
6439
|
+
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
5826
6440
|
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
5827
6441
|
|
|
5828
|
-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
5829
|
-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
5830
6442
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
5831
6443
|
|
|
5832
6444
|
Qcur = ggml_rope_ext(
|
|
@@ -6337,12 +6949,10 @@ struct llm_build_neo_bert : public llm_graph_context {
|
|
|
6337
6949
|
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
|
6338
6950
|
cb(cur, "wqkv", il);
|
|
6339
6951
|
|
|
6340
|
-
Qcur =
|
|
6341
|
-
Kcur =
|
|
6952
|
+
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
6953
|
+
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
6342
6954
|
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
6343
6955
|
|
|
6344
|
-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
6345
|
-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
6346
6956
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
6347
6957
|
|
|
6348
6958
|
// RoPE
|
|
@@ -6572,8 +7182,8 @@ struct llm_build_mpt : public llm_graph_context {
|
|
|
6572
7182
|
cb(cur, "wqkv_clamped", il);
|
|
6573
7183
|
}
|
|
6574
7184
|
|
|
6575
|
-
ggml_tensor * Qcur =
|
|
6576
|
-
ggml_tensor * Kcur =
|
|
7185
|
+
ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
7186
|
+
ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
6577
7187
|
ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
6578
7188
|
|
|
6579
7189
|
cb(Qcur, "Qcur", il);
|
|
@@ -6593,6 +7203,12 @@ struct llm_build_mpt : public llm_graph_context {
|
|
|
6593
7203
|
model.layers[il].attn_k_norm_b,
|
|
6594
7204
|
LLM_NORM, il);
|
|
6595
7205
|
cb(Kcur, "Kcur", il);
|
|
7206
|
+
} else {
|
|
7207
|
+
Qcur = ggml_cont(ctx0, Qcur);
|
|
7208
|
+
cb(Qcur, "Qcur", il);
|
|
7209
|
+
|
|
7210
|
+
Kcur = ggml_cont(ctx0, Kcur);
|
|
7211
|
+
cb(Kcur, "Kcur", il);
|
|
6596
7212
|
}
|
|
6597
7213
|
|
|
6598
7214
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
@@ -6847,12 +7463,10 @@ struct llm_build_qwen : public llm_graph_context {
|
|
|
6847
7463
|
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
6848
7464
|
cb(cur, "bqkv", il);
|
|
6849
7465
|
|
|
6850
|
-
ggml_tensor * Qcur =
|
|
6851
|
-
ggml_tensor * Kcur =
|
|
7466
|
+
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
7467
|
+
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
6852
7468
|
ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
|
|
6853
7469
|
|
|
6854
|
-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
6855
|
-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
6856
7470
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
6857
7471
|
|
|
6858
7472
|
// using mode = 2 for neox mode
|
|
@@ -7617,21 +8231,21 @@ struct llm_build_phi2 : public llm_graph_context {
|
|
|
7617
8231
|
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
7618
8232
|
cb(cur, "bqkv", il);
|
|
7619
8233
|
|
|
7620
|
-
Qcur =
|
|
7621
|
-
Kcur =
|
|
8234
|
+
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
8235
|
+
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
7622
8236
|
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
7623
8237
|
} else {
|
|
7624
8238
|
Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
|
|
7625
8239
|
Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
|
|
7626
8240
|
Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
|
|
8241
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
8242
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
7627
8243
|
}
|
|
7628
8244
|
|
|
7629
8245
|
cb(Qcur, "Qcur", il);
|
|
7630
8246
|
cb(Kcur, "Kcur", il);
|
|
7631
8247
|
cb(Vcur, "Vcur", il);
|
|
7632
8248
|
|
|
7633
|
-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
7634
|
-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
7635
8249
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
7636
8250
|
|
|
7637
8251
|
Qcur = ggml_rope_ext(
|
|
@@ -7755,21 +8369,21 @@ struct llm_build_phi3 : public llm_graph_context {
|
|
|
7755
8369
|
cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
|
|
7756
8370
|
cb(cur, "wqkv", il);
|
|
7757
8371
|
|
|
7758
|
-
Qcur =
|
|
7759
|
-
Kcur =
|
|
8372
|
+
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
|
|
8373
|
+
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
|
|
7760
8374
|
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
|
|
7761
8375
|
} else {
|
|
7762
8376
|
Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
|
|
7763
8377
|
Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
|
|
7764
8378
|
Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
|
|
8379
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
8380
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
7765
8381
|
}
|
|
7766
8382
|
|
|
7767
8383
|
cb(Qcur, "Qcur", il);
|
|
7768
8384
|
cb(Kcur, "Kcur", il);
|
|
7769
8385
|
cb(Vcur, "Vcur", il);
|
|
7770
8386
|
|
|
7771
|
-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
7772
|
-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
7773
8387
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
7774
8388
|
|
|
7775
8389
|
Qcur = ggml_rope_ext(
|
|
@@ -8125,12 +8739,10 @@ struct llm_build_codeshell : public llm_graph_context {
|
|
|
8125
8739
|
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
8126
8740
|
cb(cur, "bqkv", il);
|
|
8127
8741
|
|
|
8128
|
-
ggml_tensor * Qcur =
|
|
8129
|
-
ggml_tensor * Kcur =
|
|
8742
|
+
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
8743
|
+
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
8130
8744
|
ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
8131
8745
|
|
|
8132
|
-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
8133
|
-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
8134
8746
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
8135
8747
|
|
|
8136
8748
|
Qcur = ggml_rope_ext(
|
|
@@ -8546,8 +9158,6 @@ struct llm_build_minicpm3 : public llm_graph_context {
|
|
|
8546
9158
|
ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
|
|
8547
9159
|
cb(k_pe, "k_pe", il);
|
|
8548
9160
|
|
|
8549
|
-
// TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
|
|
8550
|
-
kv_compressed = ggml_cont(ctx0, kv_compressed);
|
|
8551
9161
|
kv_compressed = build_norm(kv_compressed,
|
|
8552
9162
|
model.layers[il].attn_kv_a_norm, NULL,
|
|
8553
9163
|
LLM_NORM_RMS, il);
|
|
@@ -8574,12 +9184,6 @@ struct llm_build_minicpm3 : public llm_graph_context {
|
|
|
8574
9184
|
v_states = ggml_cont(ctx0, v_states);
|
|
8575
9185
|
cb(v_states, "v_states", il);
|
|
8576
9186
|
|
|
8577
|
-
v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
|
|
8578
|
-
ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
|
|
8579
|
-
0);
|
|
8580
|
-
cb(v_states, "v_states", il);
|
|
8581
|
-
|
|
8582
|
-
q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
|
|
8583
9187
|
q_pe = ggml_rope_ext(
|
|
8584
9188
|
ctx0, q_pe, inp_pos, rope_factors,
|
|
8585
9189
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
@@ -8588,7 +9192,6 @@ struct llm_build_minicpm3 : public llm_graph_context {
|
|
|
8588
9192
|
cb(q_pe, "q_pe", il);
|
|
8589
9193
|
|
|
8590
9194
|
// shared RoPE key
|
|
8591
|
-
k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
|
|
8592
9195
|
k_pe = ggml_rope_ext(
|
|
8593
9196
|
ctx0, k_pe, inp_pos, rope_factors,
|
|
8594
9197
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
@@ -9068,8 +9671,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
|
|
|
9068
9671
|
const int n_layer_sparsity = 10; // number of layers using activation sparsity
|
|
9069
9672
|
const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
|
|
9070
9673
|
|
|
9071
|
-
ggml_tensor * one; // containing single element 1.0f
|
|
9072
|
-
|
|
9073
9674
|
llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf)
|
|
9074
9675
|
: llm_graph_context(params),
|
|
9075
9676
|
model(model),
|
|
@@ -9081,14 +9682,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
|
|
|
9081
9682
|
ggml_tensor * cur;
|
|
9082
9683
|
ggml_tensor * inpL;
|
|
9083
9684
|
|
|
9084
|
-
// TODO: remove this when ggml_scale_add is implemented
|
|
9085
|
-
one = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
|
9086
|
-
{
|
|
9087
|
-
auto inp = std::make_unique<llm_graph_input_one>();
|
|
9088
|
-
inp->one = one;
|
|
9089
|
-
res->add_input(std::move(inp));
|
|
9090
|
-
}
|
|
9091
|
-
|
|
9092
9685
|
inpL = build_inp_embd(model.tok_embd);
|
|
9093
9686
|
|
|
9094
9687
|
// important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
|
|
@@ -9478,7 +10071,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
|
|
|
9478
10071
|
cb(innovation, "innovation", il);
|
|
9479
10072
|
|
|
9480
10073
|
ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens]
|
|
9481
|
-
all_coefs =
|
|
10074
|
+
all_coefs = ggml_scale_bias(ctx0, all_coefs, 1.0f, 1.0f); // + 1.0
|
|
9482
10075
|
cb(all_coefs, "all_coefs", il);
|
|
9483
10076
|
all_coefs = ggml_cont(ctx0, ggml_transpose(ctx0, all_coefs)); // [n_tokens, n_altup]
|
|
9484
10077
|
all_coefs = ggml_reshape_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]
|
|
@@ -9621,81 +10214,32 @@ struct llm_build_starcoder2 : public llm_graph_context {
|
|
|
9621
10214
|
}
|
|
9622
10215
|
};
|
|
9623
10216
|
|
|
9624
|
-
struct
|
|
9625
|
-
const
|
|
9626
|
-
|
|
9627
|
-
llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) {
|
|
9628
|
-
ggml_tensor * cur;
|
|
9629
|
-
ggml_tensor * inpL;
|
|
9630
|
-
|
|
9631
|
-
// {n_embd, n_tokens}
|
|
9632
|
-
inpL = build_inp_embd(model.tok_embd);
|
|
9633
|
-
|
|
9634
|
-
auto * rs_inp = build_rs_inp();
|
|
9635
|
-
|
|
9636
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
9637
|
-
|
|
9638
|
-
for (int il = 0; il < n_layer; ++il) {
|
|
9639
|
-
// norm
|
|
9640
|
-
cur = build_norm(inpL,
|
|
9641
|
-
model.layers[il].attn_norm, NULL,
|
|
9642
|
-
LLM_NORM_RMS, il);
|
|
9643
|
-
cb(cur, "attn_norm", il);
|
|
9644
|
-
|
|
9645
|
-
cur = build_mamba_layer(rs_inp, gf, cur, ubatch, il);
|
|
9646
|
-
|
|
9647
|
-
if (il == n_layer - 1 && inp_out_ids) {
|
|
9648
|
-
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
9649
|
-
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
9650
|
-
}
|
|
9651
|
-
|
|
9652
|
-
// residual
|
|
9653
|
-
cur = ggml_add(ctx0, cur, inpL);
|
|
9654
|
-
|
|
9655
|
-
cur = build_cvec(cur, il);
|
|
9656
|
-
cb(cur, "l_out", il);
|
|
9657
|
-
|
|
9658
|
-
// input for next layer
|
|
9659
|
-
inpL = cur;
|
|
9660
|
-
}
|
|
10217
|
+
struct llm_graph_context_mamba : public llm_graph_context {
|
|
10218
|
+
llm_graph_context_mamba(const llm_graph_params & params) : llm_graph_context(params) {}
|
|
9661
10219
|
|
|
9662
|
-
// final rmsnorm
|
|
9663
|
-
cur = build_norm(inpL,
|
|
9664
|
-
model.output_norm, NULL,
|
|
9665
|
-
LLM_NORM_RMS, -1);
|
|
9666
|
-
|
|
9667
|
-
cb(cur, "result_norm", -1);
|
|
9668
|
-
res->t_embd = cur;
|
|
9669
|
-
|
|
9670
|
-
// lm_head
|
|
9671
|
-
cur = build_lora_mm(model.output, cur);
|
|
9672
|
-
|
|
9673
|
-
cb(cur, "result_output", -1);
|
|
9674
|
-
res->t_logits = cur;
|
|
9675
|
-
|
|
9676
|
-
ggml_build_forward_expand(gf, cur);
|
|
9677
|
-
}
|
|
9678
|
-
|
|
9679
|
-
// TODO: split
|
|
9680
10220
|
ggml_tensor * build_mamba_layer(
|
|
9681
10221
|
llm_graph_input_rs * inp,
|
|
9682
10222
|
ggml_cgraph * gf,
|
|
9683
10223
|
ggml_tensor * cur,
|
|
10224
|
+
const llama_model & model,
|
|
9684
10225
|
const llama_ubatch & ubatch,
|
|
9685
|
-
int il)
|
|
9686
|
-
|
|
10226
|
+
int il) {
|
|
10227
|
+
|
|
10228
|
+
const auto * mctx_cur = inp->mctx;
|
|
9687
10229
|
|
|
9688
10230
|
const auto kv_head = mctx_cur->get_head();
|
|
9689
10231
|
|
|
10232
|
+
const auto & layer = model.layers[il];
|
|
10233
|
+
|
|
9690
10234
|
const int64_t d_conv = hparams.ssm_d_conv;
|
|
9691
10235
|
const int64_t d_inner = hparams.ssm_d_inner;
|
|
9692
10236
|
const int64_t d_state = hparams.ssm_d_state;
|
|
9693
10237
|
const int64_t dt_rank = hparams.ssm_dt_rank;
|
|
10238
|
+
const int64_t n_head = d_inner;
|
|
10239
|
+
const int64_t head_dim = 1;
|
|
9694
10240
|
const int64_t n_seqs = ubatch.n_seqs;
|
|
9695
10241
|
// Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers)
|
|
9696
10242
|
const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
|
|
9697
|
-
// Use the same RMS norm as the final layer norm
|
|
9698
|
-
const float norm_rms_eps = hparams.f_norm_rms_eps;
|
|
9699
10243
|
|
|
9700
10244
|
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
|
|
9701
10245
|
|
|
@@ -9706,21 +10250,14 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
9706
10250
|
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
|
|
9707
10251
|
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
|
|
9708
10252
|
|
|
9709
|
-
|
|
9710
|
-
ggml_tensor * conv = build_rs(
|
|
9711
|
-
inp, gf, conv_states_all,
|
|
9712
|
-
hparams.n_embd_r(), n_seqs);
|
|
10253
|
+
ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs);
|
|
9713
10254
|
conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
|
|
9714
|
-
ggml_tensor * ssm = build_rs(
|
|
9715
|
-
inp, gf, ssm_states_all,
|
|
9716
|
-
hparams.n_embd_s(), n_seqs);
|
|
9717
|
-
ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs);
|
|
9718
10255
|
|
|
9719
10256
|
// {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
|
|
9720
10257
|
cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
|
|
9721
10258
|
|
|
9722
10259
|
// {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
|
|
9723
|
-
ggml_tensor * xz = build_lora_mm(
|
|
10260
|
+
ggml_tensor * xz = build_lora_mm(layer.ssm_in, cur);
|
|
9724
10261
|
// split the above in two
|
|
9725
10262
|
// => {d_inner, n_seq_tokens, n_seqs}
|
|
9726
10263
|
ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0);
|
|
@@ -9749,10 +10286,10 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
9749
10286
|
// then permute away the ne[0] dimension,
|
|
9750
10287
|
// and then you're left with the resulting x tensor.
|
|
9751
10288
|
// For simultaneous sequences, all sequences need to have the same length.
|
|
9752
|
-
x = ggml_ssm_conv(ctx0, conv_x,
|
|
10289
|
+
x = ggml_ssm_conv(ctx0, conv_x, layer.ssm_conv1d);
|
|
9753
10290
|
|
|
9754
10291
|
// bias
|
|
9755
|
-
x = ggml_add(ctx0, x,
|
|
10292
|
+
x = ggml_add(ctx0, x, layer.ssm_conv1d_b);
|
|
9756
10293
|
|
|
9757
10294
|
x = ggml_silu(ctx0, x);
|
|
9758
10295
|
}
|
|
@@ -9760,55 +10297,366 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
9760
10297
|
// ssm
|
|
9761
10298
|
{
|
|
9762
10299
|
// {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
|
|
9763
|
-
ggml_tensor * x_db = build_lora_mm(
|
|
10300
|
+
ggml_tensor * x_db = build_lora_mm(layer.ssm_x, x);
|
|
9764
10301
|
// split
|
|
9765
10302
|
ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0);
|
|
9766
|
-
ggml_tensor * B =
|
|
9767
|
-
ggml_tensor * C =
|
|
9768
|
-
|
|
9769
|
-
// Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers
|
|
9770
|
-
if (ssm_dt_b_c_rms) {
|
|
9771
|
-
dt =
|
|
9772
|
-
B
|
|
9773
|
-
C
|
|
10303
|
+
ggml_tensor * B = ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state*x_db->nb[0], x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank);
|
|
10304
|
+
ggml_tensor * C = ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state*x_db->nb[0], x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state));
|
|
10305
|
+
|
|
10306
|
+
// Some Mamba variants (e.g. FalconMamba, Jamba) apply RMS norm in B, C & Dt layers
|
|
10307
|
+
if (ssm_dt_b_c_rms || (layer.ssm_dt_norm && layer.ssm_b_norm && layer.ssm_c_norm)) {
|
|
10308
|
+
dt = build_norm(dt, layer.ssm_dt_norm, NULL, LLM_NORM_RMS, il);
|
|
10309
|
+
B = build_norm(B, layer.ssm_b_norm, NULL, LLM_NORM_RMS, il);
|
|
10310
|
+
C = build_norm(C, layer.ssm_c_norm, NULL, LLM_NORM_RMS, il);
|
|
9774
10311
|
}
|
|
9775
10312
|
|
|
9776
10313
|
// {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
|
|
9777
|
-
dt = build_lora_mm(
|
|
9778
|
-
dt = ggml_add(ctx0, dt,
|
|
10314
|
+
dt = build_lora_mm(layer.ssm_dt, dt);
|
|
10315
|
+
dt = ggml_add(ctx0, dt, layer.ssm_dt_b);
|
|
10316
|
+
|
|
10317
|
+
cur = x;
|
|
10318
|
+
x = ggml_reshape_4d(ctx0, x, head_dim, n_head, n_seq_tokens, n_seqs);
|
|
9779
10319
|
|
|
9780
|
-
|
|
9781
|
-
|
|
9782
|
-
//
|
|
9783
|
-
|
|
10320
|
+
ggml_tensor * A = layer.ssm_a;
|
|
10321
|
+
|
|
10322
|
+
// use the states and the indices provided by build_recurrent_state
|
|
10323
|
+
// (this is necessary in order to properly use the states before they are overwritten,
|
|
10324
|
+
// while avoiding to make unnecessary copies of the states)
|
|
10325
|
+
auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
|
|
10326
|
+
ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
|
|
10327
|
+
|
|
10328
|
+
// Custom operator to optimize the parallel associative scan
|
|
10329
|
+
// as described in the Annex D of the Mamba paper.
|
|
10330
|
+
// => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
|
|
10331
|
+
return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
|
|
10332
|
+
};
|
|
10333
|
+
|
|
10334
|
+
ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
|
|
9784
10335
|
|
|
9785
10336
|
// store last states
|
|
9786
10337
|
ggml_build_forward_expand(gf,
|
|
9787
10338
|
ggml_cpy(ctx0,
|
|
9788
|
-
ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]),
|
|
10339
|
+
ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]*x->ne[3]),
|
|
9789
10340
|
ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
|
|
9790
10341
|
|
|
9791
|
-
ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[
|
|
10342
|
+
ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[2], x->nb[3], 0);
|
|
9792
10343
|
|
|
9793
10344
|
// TODO: skip computing output earlier for unused tokens
|
|
9794
10345
|
|
|
9795
|
-
|
|
9796
|
-
y =
|
|
9797
|
-
y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z)));
|
|
10346
|
+
y = ggml_add(ctx0, y, ggml_mul(ctx0, cur, layer.ssm_d));
|
|
10347
|
+
y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
|
|
9798
10348
|
|
|
9799
10349
|
// {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
|
|
9800
|
-
cur = build_lora_mm(
|
|
10350
|
+
cur = build_lora_mm(layer.ssm_out, y);
|
|
9801
10351
|
}
|
|
9802
10352
|
|
|
9803
10353
|
// {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
|
|
9804
10354
|
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
|
|
9805
|
-
//cb(cur, "mamba_out", il);
|
|
9806
10355
|
|
|
9807
10356
|
return cur;
|
|
9808
10357
|
}
|
|
9809
|
-
};
|
|
9810
10358
|
|
|
9811
|
-
|
|
10359
|
+
ggml_tensor * build_mamba2_layer(
|
|
10360
|
+
llm_graph_input_rs * inp,
|
|
10361
|
+
ggml_cgraph * gf,
|
|
10362
|
+
ggml_tensor * cur,
|
|
10363
|
+
const llama_model & model,
|
|
10364
|
+
const llama_ubatch & ubatch,
|
|
10365
|
+
int il) const {
|
|
10366
|
+
|
|
10367
|
+
const auto * mctx_cur = inp->mctx;
|
|
10368
|
+
|
|
10369
|
+
const auto kv_head = mctx_cur->get_head();
|
|
10370
|
+
|
|
10371
|
+
const int64_t d_conv = hparams.ssm_d_conv;
|
|
10372
|
+
const int64_t d_inner = hparams.ssm_d_inner;
|
|
10373
|
+
const int64_t d_state = hparams.ssm_d_state;
|
|
10374
|
+
const int64_t n_head = hparams.ssm_dt_rank;
|
|
10375
|
+
const int64_t head_dim = d_inner / n_head;
|
|
10376
|
+
const int64_t n_group = hparams.ssm_n_group;
|
|
10377
|
+
const int64_t n_seqs = ubatch.n_seqs;
|
|
10378
|
+
|
|
10379
|
+
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
|
|
10380
|
+
|
|
10381
|
+
GGML_ASSERT(n_seqs != 0);
|
|
10382
|
+
GGML_ASSERT(ubatch.equal_seqs);
|
|
10383
|
+
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
|
|
10384
|
+
|
|
10385
|
+
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
|
|
10386
|
+
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
|
|
10387
|
+
|
|
10388
|
+
ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs);
|
|
10389
|
+
conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);
|
|
10390
|
+
|
|
10391
|
+
// {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
|
|
10392
|
+
cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
|
|
10393
|
+
|
|
10394
|
+
// d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
|
|
10395
|
+
|
|
10396
|
+
// {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
|
|
10397
|
+
ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur);
|
|
10398
|
+
|
|
10399
|
+
// split the above in three
|
|
10400
|
+
ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*zxBCdt->nb[0], zxBCdt->nb[1], zxBCdt->nb[2], 0);
|
|
10401
|
+
ggml_tensor * xBC = ggml_view_3d(ctx0, zxBCdt, d_inner + 2*n_group*d_state, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], d_inner*ggml_element_size(zxBCdt));
|
|
10402
|
+
ggml_tensor * dt = ggml_view_3d(ctx0, zxBCdt, n_head, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], (2*d_inner + 2*n_group*d_state)*ggml_element_size(zxBCdt));
|
|
10403
|
+
|
|
10404
|
+
// conv
|
|
10405
|
+
{
|
|
10406
|
+
// => {d_conv - 1 + n_seq_tokens, d_inner + 2*n_group*d_state, n_seqs}
|
|
10407
|
+
ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, xBC), 0);
|
|
10408
|
+
|
|
10409
|
+
// copy last (d_conv - 1) columns back into the state cache
|
|
10410
|
+
ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
|
|
10411
|
+
|
|
10412
|
+
ggml_build_forward_expand(gf,
|
|
10413
|
+
ggml_cpy(ctx0, last_conv,
|
|
10414
|
+
ggml_view_1d(ctx0, conv_states_all,
|
|
10415
|
+
(d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
|
|
10416
|
+
kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
|
|
10417
|
+
|
|
10418
|
+
// 1D convolution
|
|
10419
|
+
// The equivalent is to make a self-overlapping view of conv_x
|
|
10420
|
+
// over d_conv columns at each stride in the 3rd dimension,
|
|
10421
|
+
// then element-wise multiply that with the conv1d weight,
|
|
10422
|
+
// then sum the elements of each row,
|
|
10423
|
+
// (the last two steps are a dot product over rows (also doable with mul_mat))
|
|
10424
|
+
// then permute away the ne[0] dimension,
|
|
10425
|
+
// and then you're left with the resulting x tensor.
|
|
10426
|
+
// For simultaneous sequences, all sequences need to have the same length.
|
|
10427
|
+
xBC = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
|
|
10428
|
+
|
|
10429
|
+
// bias
|
|
10430
|
+
xBC = ggml_add(ctx0, xBC, model.layers[il].ssm_conv1d_b);
|
|
10431
|
+
|
|
10432
|
+
xBC = ggml_silu(ctx0, xBC);
|
|
10433
|
+
}
|
|
10434
|
+
|
|
10435
|
+
// ssm
|
|
10436
|
+
{
|
|
10437
|
+
// These correspond to V K Q in SSM/attention duality
|
|
10438
|
+
ggml_tensor * x = ggml_view_4d(ctx0, xBC, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*xBC->nb[0], xBC->nb[1], xBC->nb[2], 0);
|
|
10439
|
+
ggml_tensor * B = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], d_inner*ggml_element_size(xBC));
|
|
10440
|
+
ggml_tensor * C = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], (d_inner + n_group*d_state)*ggml_element_size(xBC));
|
|
10441
|
+
|
|
10442
|
+
// {n_head, n_seq_tokens, n_seqs}
|
|
10443
|
+
dt = ggml_add(ctx0, ggml_cont(ctx0, dt), model.layers[il].ssm_dt_b);
|
|
10444
|
+
|
|
10445
|
+
ggml_tensor * A = model.layers[il].ssm_a;
|
|
10446
|
+
|
|
10447
|
+
// use the states and the indices provided by build_recurrent_state
|
|
10448
|
+
// (this is necessary in order to properly use the states before they are overwritten,
|
|
10449
|
+
// while avoiding to make unnecessary copies of the states)
|
|
10450
|
+
auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
|
|
10451
|
+
ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
|
|
10452
|
+
|
|
10453
|
+
// TODO: use semistructured matrices to implement state-space duality
|
|
10454
|
+
// => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
|
|
10455
|
+
return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
|
|
10456
|
+
};
|
|
10457
|
+
|
|
10458
|
+
ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
|
|
10459
|
+
|
|
10460
|
+
// store last states
|
|
10461
|
+
ggml_build_forward_expand(gf,
|
|
10462
|
+
ggml_cpy(ctx0,
|
|
10463
|
+
ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, ggml_nelements(x)*x->nb[0]),
|
|
10464
|
+
ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
|
|
10465
|
+
|
|
10466
|
+
ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_head, n_seq_tokens, n_seqs, x->nb[1], n_head*x->nb[1], n_seq_tokens*n_head*x->nb[1], 0);
|
|
10467
|
+
|
|
10468
|
+
// TODO: skip computing output earlier for unused tokens
|
|
10469
|
+
|
|
10470
|
+
y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
|
|
10471
|
+
y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
|
|
10472
|
+
|
|
10473
|
+
// grouped RMS norm
|
|
10474
|
+
if (model.layers[il].ssm_norm) {
|
|
10475
|
+
y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
|
|
10476
|
+
y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
|
|
10477
|
+
}
|
|
10478
|
+
|
|
10479
|
+
y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
|
|
10480
|
+
|
|
10481
|
+
// {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
|
|
10482
|
+
cur = build_lora_mm(model.layers[il].ssm_out, y);
|
|
10483
|
+
}
|
|
10484
|
+
|
|
10485
|
+
// {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
|
|
10486
|
+
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
|
|
10487
|
+
cb(cur, "mamba_out", il);
|
|
10488
|
+
|
|
10489
|
+
return cur;
|
|
10490
|
+
}
|
|
10491
|
+
};
|
|
10492
|
+
|
|
10493
|
+
struct llm_build_mamba : public llm_graph_context_mamba {
|
|
10494
|
+
llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) {
|
|
10495
|
+
ggml_tensor * cur;
|
|
10496
|
+
ggml_tensor * inpL;
|
|
10497
|
+
|
|
10498
|
+
// {n_embd, n_tokens}
|
|
10499
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
10500
|
+
|
|
10501
|
+
auto * rs_inp = build_rs_inp();
|
|
10502
|
+
|
|
10503
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10504
|
+
|
|
10505
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
10506
|
+
// norm
|
|
10507
|
+
cur = build_norm(inpL,
|
|
10508
|
+
model.layers[il].attn_norm, NULL,
|
|
10509
|
+
LLM_NORM_RMS, il);
|
|
10510
|
+
cb(cur, "attn_norm", il);
|
|
10511
|
+
|
|
10512
|
+
if (model.arch == LLM_ARCH_MAMBA2) {
|
|
10513
|
+
cur = build_mamba2_layer(rs_inp, gf, cur, model, ubatch, il);
|
|
10514
|
+
} else {
|
|
10515
|
+
cur = build_mamba_layer(rs_inp, gf, cur, model, ubatch, il);
|
|
10516
|
+
}
|
|
10517
|
+
|
|
10518
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
10519
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
10520
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
10521
|
+
}
|
|
10522
|
+
|
|
10523
|
+
// residual
|
|
10524
|
+
cur = ggml_add(ctx0, cur, inpL);
|
|
10525
|
+
|
|
10526
|
+
cur = build_cvec(cur, il);
|
|
10527
|
+
cb(cur, "l_out", il);
|
|
10528
|
+
|
|
10529
|
+
// input for next layer
|
|
10530
|
+
inpL = cur;
|
|
10531
|
+
}
|
|
10532
|
+
|
|
10533
|
+
// final rmsnorm
|
|
10534
|
+
cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
|
|
10535
|
+
|
|
10536
|
+
cb(cur, "result_norm", -1);
|
|
10537
|
+
res->t_embd = cur;
|
|
10538
|
+
|
|
10539
|
+
// lm_head
|
|
10540
|
+
cur = build_lora_mm(model.output, cur);
|
|
10541
|
+
|
|
10542
|
+
cb(cur, "result_output", -1);
|
|
10543
|
+
res->t_logits = cur;
|
|
10544
|
+
|
|
10545
|
+
ggml_build_forward_expand(gf, cur);
|
|
10546
|
+
}
|
|
10547
|
+
|
|
10548
|
+
};
|
|
10549
|
+
|
|
10550
|
+
struct llm_build_jamba : public llm_graph_context_mamba {
|
|
10551
|
+
llm_build_jamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) {
|
|
10552
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
10553
|
+
|
|
10554
|
+
ggml_tensor * cur;
|
|
10555
|
+
ggml_tensor * inpL;
|
|
10556
|
+
|
|
10557
|
+
// {n_embd, n_tokens}
|
|
10558
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
10559
|
+
|
|
10560
|
+
auto * inp_hybrid = build_inp_mem_hybrid();
|
|
10561
|
+
|
|
10562
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10563
|
+
|
|
10564
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
10565
|
+
const int64_t n_head_kv = hparams.n_head_kv(il);
|
|
10566
|
+
|
|
10567
|
+
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
|
10568
|
+
cb(cur, "attn_norm", il);
|
|
10569
|
+
|
|
10570
|
+
if (n_head_kv == 0) {
|
|
10571
|
+
cur = build_mamba_layer(inp_hybrid->get_recr(), gf, cur, model, ubatch, il);
|
|
10572
|
+
} else {
|
|
10573
|
+
// Attention
|
|
10574
|
+
|
|
10575
|
+
struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
10576
|
+
struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
10577
|
+
struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
10578
|
+
|
|
10579
|
+
cb(Qcur, "Qcur", il);
|
|
10580
|
+
cb(Kcur, "Kcur", il);
|
|
10581
|
+
cb(Vcur, "Vcur", il);
|
|
10582
|
+
|
|
10583
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
10584
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
10585
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
10586
|
+
|
|
10587
|
+
cb(Qcur, "Qcur", il);
|
|
10588
|
+
cb(Kcur, "Kcur", il);
|
|
10589
|
+
cb(Vcur, "Vcur", il);
|
|
10590
|
+
|
|
10591
|
+
// No RoPE :)
|
|
10592
|
+
cur = build_attn(inp_hybrid->get_attn(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
10593
|
+
}
|
|
10594
|
+
|
|
10595
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
10596
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
10597
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
10598
|
+
}
|
|
10599
|
+
|
|
10600
|
+
// residual
|
|
10601
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, inpL, cur);
|
|
10602
|
+
cb(cur, "ffn_inp", il);
|
|
10603
|
+
|
|
10604
|
+
cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
|
|
10605
|
+
cb(cur, "ffn_norm", il);
|
|
10606
|
+
|
|
10607
|
+
// feed-forward network
|
|
10608
|
+
if (model.layers[il].ffn_gate_inp == nullptr) {
|
|
10609
|
+
// FFN
|
|
10610
|
+
cur = build_ffn(cur,
|
|
10611
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
10612
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
|
10613
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
10614
|
+
NULL,
|
|
10615
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
10616
|
+
cb(cur, "ffn_out", il);
|
|
10617
|
+
} else {
|
|
10618
|
+
// MoE branch
|
|
10619
|
+
cur = build_moe_ffn(cur,
|
|
10620
|
+
model.layers[il].ffn_gate_inp,
|
|
10621
|
+
model.layers[il].ffn_up_exps,
|
|
10622
|
+
model.layers[il].ffn_gate_exps,
|
|
10623
|
+
model.layers[il].ffn_down_exps,
|
|
10624
|
+
nullptr,
|
|
10625
|
+
n_expert, n_expert_used,
|
|
10626
|
+
LLM_FFN_SILU, false,
|
|
10627
|
+
false, 0.0,
|
|
10628
|
+
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
|
10629
|
+
il);
|
|
10630
|
+
cb(cur, "ffn_moe_out", il);
|
|
10631
|
+
}
|
|
10632
|
+
|
|
10633
|
+
// residual
|
|
10634
|
+
cur = ggml_add(ctx0, ffn_inp, cur);
|
|
10635
|
+
|
|
10636
|
+
cur = build_cvec(cur, il);
|
|
10637
|
+
cb(cur, "l_out", il);
|
|
10638
|
+
|
|
10639
|
+
// input for next layer
|
|
10640
|
+
inpL = cur;
|
|
10641
|
+
}
|
|
10642
|
+
|
|
10643
|
+
// final rmsnorm
|
|
10644
|
+
cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
|
|
10645
|
+
|
|
10646
|
+
cb(cur, "result_norm", -1);
|
|
10647
|
+
res->t_embd = cur;
|
|
10648
|
+
|
|
10649
|
+
// lm_head
|
|
10650
|
+
cur = build_lora_mm(model.output, cur);
|
|
10651
|
+
|
|
10652
|
+
cb(cur, "result_output", -1);
|
|
10653
|
+
res->t_logits = cur;
|
|
10654
|
+
|
|
10655
|
+
ggml_build_forward_expand(gf, cur);
|
|
10656
|
+
}
|
|
10657
|
+
};
|
|
10658
|
+
|
|
10659
|
+
struct llm_build_command_r : public llm_graph_context {
|
|
9812
10660
|
llm_build_command_r(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
9813
10661
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
9814
10662
|
|
|
@@ -10514,10 +11362,10 @@ struct llm_build_openelm : public llm_graph_context {
|
|
|
10514
11362
|
|
|
10515
11363
|
cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
|
|
10516
11364
|
|
|
10517
|
-
ggml_tensor * Qcur =
|
|
11365
|
+
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0);
|
|
10518
11366
|
cb(Qcur, "Qcur", il);
|
|
10519
11367
|
|
|
10520
|
-
ggml_tensor * Kcur =
|
|
11368
|
+
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head);
|
|
10521
11369
|
cb(Kcur, "Kcur", il);
|
|
10522
11370
|
|
|
10523
11371
|
ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)));
|
|
@@ -10639,12 +11487,10 @@ struct llm_build_gptneox : public llm_graph_context {
|
|
|
10639
11487
|
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
10640
11488
|
cb(cur, "bqkv", il);
|
|
10641
11489
|
|
|
10642
|
-
ggml_tensor * Qcur =
|
|
10643
|
-
ggml_tensor * Kcur =
|
|
11490
|
+
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
11491
|
+
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
10644
11492
|
ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
10645
11493
|
|
|
10646
|
-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
10647
|
-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
10648
11494
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
10649
11495
|
|
|
10650
11496
|
Qcur = ggml_rope_ext(
|
|
@@ -11889,6 +12735,8 @@ struct llm_build_chatglm : public llm_graph_context {
|
|
|
11889
12735
|
if (model.layers[il].bv) {
|
|
11890
12736
|
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
11891
12737
|
}
|
|
12738
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
12739
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
11892
12740
|
} else {
|
|
11893
12741
|
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
|
11894
12742
|
cb(cur, "wqkv", il);
|
|
@@ -11896,13 +12744,11 @@ struct llm_build_chatglm : public llm_graph_context {
|
|
|
11896
12744
|
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
11897
12745
|
cb(cur, "bqkv", il);
|
|
11898
12746
|
}
|
|
11899
|
-
Qcur =
|
|
11900
|
-
Kcur =
|
|
12747
|
+
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
12748
|
+
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
11901
12749
|
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
11902
12750
|
}
|
|
11903
12751
|
|
|
11904
|
-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
11905
|
-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
11906
12752
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
11907
12753
|
|
|
11908
12754
|
//printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
|
|
@@ -12023,6 +12869,8 @@ struct llm_build_glm4 : public llm_graph_context {
|
|
|
12023
12869
|
if (model.layers[il].bv) {
|
|
12024
12870
|
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
12025
12871
|
}
|
|
12872
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
12873
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
12026
12874
|
} else {
|
|
12027
12875
|
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
|
12028
12876
|
cb(cur, "wqkv", il);
|
|
@@ -12030,13 +12878,11 @@ struct llm_build_glm4 : public llm_graph_context {
|
|
|
12030
12878
|
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
12031
12879
|
cb(cur, "bqkv", il);
|
|
12032
12880
|
}
|
|
12033
|
-
Qcur =
|
|
12034
|
-
Kcur =
|
|
12881
|
+
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
12882
|
+
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
12035
12883
|
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
12036
12884
|
}
|
|
12037
12885
|
|
|
12038
|
-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
12039
|
-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
12040
12886
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
12041
12887
|
|
|
12042
12888
|
Qcur = ggml_rope_ext(
|
|
@@ -13135,13 +13981,11 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
|
|
|
13135
13981
|
}
|
|
13136
13982
|
};
|
|
13137
13983
|
|
|
13138
|
-
|
|
13139
13984
|
struct llm_build_granite : public llm_graph_context {
|
|
13140
13985
|
llm_build_granite(
|
|
13141
13986
|
const llama_model & model,
|
|
13142
13987
|
const llm_graph_params & params,
|
|
13143
|
-
ggml_cgraph * gf
|
|
13144
|
-
const bool use_rope = true)
|
|
13988
|
+
ggml_cgraph * gf)
|
|
13145
13989
|
: llm_graph_context(params) {
|
|
13146
13990
|
|
|
13147
13991
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
@@ -13156,14 +14000,12 @@ struct llm_build_granite : public llm_graph_context {
|
|
|
13156
14000
|
|
|
13157
14001
|
// inp_pos - built only if rope enabled
|
|
13158
14002
|
ggml_tensor * inp_pos = nullptr;
|
|
13159
|
-
if (
|
|
14003
|
+
if (hparams.rope_finetuned) {
|
|
13160
14004
|
inp_pos = build_inp_pos();
|
|
13161
14005
|
}
|
|
13162
14006
|
|
|
13163
14007
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
13164
14008
|
|
|
13165
|
-
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
13166
|
-
|
|
13167
14009
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13168
14010
|
|
|
13169
14011
|
for (int il = 0; il < n_layer; ++il) {
|
|
@@ -13176,21 +14018,956 @@ struct llm_build_granite : public llm_graph_context {
|
|
|
13176
14018
|
cb(cur, "attn_norm", il);
|
|
13177
14019
|
|
|
13178
14020
|
// self-attention
|
|
13179
|
-
|
|
13180
|
-
|
|
13181
|
-
|
|
13182
|
-
|
|
13183
|
-
|
|
13184
|
-
|
|
13185
|
-
|
|
13186
|
-
|
|
13187
|
-
|
|
13188
|
-
|
|
13189
|
-
|
|
13190
|
-
|
|
13191
|
-
|
|
13192
|
-
|
|
13193
|
-
|
|
14021
|
+
cur = build_attention_layer(
|
|
14022
|
+
gf, cur, inp_pos, inp_attn,
|
|
14023
|
+
model, n_embd_head, il);
|
|
14024
|
+
|
|
14025
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
14026
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
14027
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
14028
|
+
}
|
|
14029
|
+
|
|
14030
|
+
// ffn
|
|
14031
|
+
cur = build_layer_ffn(cur, inpSA, model, il);
|
|
14032
|
+
|
|
14033
|
+
// input for next layer
|
|
14034
|
+
inpL = cur;
|
|
14035
|
+
}
|
|
14036
|
+
|
|
14037
|
+
cur = inpL;
|
|
14038
|
+
|
|
14039
|
+
cur = build_norm(cur,
|
|
14040
|
+
model.output_norm, NULL,
|
|
14041
|
+
LLM_NORM_RMS, -1);
|
|
14042
|
+
|
|
14043
|
+
cb(cur, "result_norm", -1);
|
|
14044
|
+
res->t_embd = cur;
|
|
14045
|
+
|
|
14046
|
+
// lm_head
|
|
14047
|
+
cur = build_lora_mm(model.output, cur);
|
|
14048
|
+
|
|
14049
|
+
// For Granite architectures - scale logits
|
|
14050
|
+
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
|
|
14051
|
+
cb(cur, "result_output", -1);
|
|
14052
|
+
res->t_logits = cur;
|
|
14053
|
+
|
|
14054
|
+
ggml_build_forward_expand(gf, cur);
|
|
14055
|
+
}
|
|
14056
|
+
|
|
14057
|
+
ggml_tensor * build_attention_layer(
        ggml_cgraph * gf,
        ggml_tensor * cur,
        ggml_tensor * inp_pos,
        llm_graph_input_attn_kv_unified * inp_attn,
        const llama_model & model,
        const int64_t n_embd_head,
        const int il) {
    // Builds the self-attention sub-graph of layer `il` and returns its output tensor.
    //   gf          - graph handed through to build_attn()
    //   cur         - layer input, used as the source of the Q/K/V projections
    //   inp_pos     - positions tensor; only read when hparams.rope_finetuned is set
    //   inp_attn    - attention input handed through to build_attn()
    //   n_embd_head - per-head size used for the reshape and the default scale
    const auto & layer = model.layers[il];

    // weight projection of `cur`, plus the bias when the layer has one;
    // the callback runs after each of the two steps
    const auto project = [&](ggml_tensor * w, ggml_tensor * b, const char * name) {
        ggml_tensor * t = build_lora_mm(w, cur);
        cb(t, name, il);
        if (b) {
            t = ggml_add(ctx0, t, b);
            cb(t, name, il);
        }
        return t;
    };

    ggml_tensor * Qcur = project(layer.wq, layer.bq, "Qcur");
    ggml_tensor * Kcur = project(layer.wk, layer.bk, "Kcur");
    ggml_tensor * Vcur = project(layer.wv, layer.bv, "Vcur");

    // split into heads: {n_embd_head, n_head(_kv), n_tokens}
    const int64_t n_head_il    = hparams.n_head(il);
    const int64_t n_head_kv_il = hparams.n_head_kv(il);

    Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head_il,    n_tokens);
    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv_il, n_tokens);
    Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv_il, n_tokens);

    // RoPE is applied to Q and K only when hparams.rope_finetuned is set
    if (hparams.rope_finetuned) {
        ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

        const auto rope = [&](ggml_tensor * t) {
            return ggml_rope_ext(
                    ctx0, t, inp_pos, rope_factors,
                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow);
        };

        Qcur = rope(Qcur);
        Kcur = rope(Kcur);
    }

    cb(Qcur, "Qcur", il);
    cb(Kcur, "Kcur", il);
    cb(Vcur, "Vcur", il);

    // an attention scale of exactly 0 means "use 1/sqrt(head size)"
    float kq_scale = hparams.f_attention_scale;
    if (kq_scale == 0.0f) {
        kq_scale = 1.0f/sqrtf(float(n_embd_head));
    }

    ggml_tensor * attn_out = build_attn(inp_attn, gf,
            layer.wo, layer.bo,
            Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
    cb(attn_out, "attn_out", il);

    return attn_out;
}
|
|
14119
|
+
|
|
14120
|
+
ggml_tensor * build_layer_ffn(
        ggml_tensor * cur,
        ggml_tensor * inpSA,
        const llama_model & model,
        const int il) {
    // Builds everything after the mixer for layer `il`: first residual add, FFN
    // (dense or MoE, with an optional shared expert), second residual add and the
    // control vector. Returns the layer output.
    //   cur   - mixer output for this layer
    //   inpSA - layer input, used as the skip connection of the first residual
    const auto & layer = model.layers[il];

    // Granite scales each residual branch; a scale of 0 leaves the branch unscaled
    const bool scale_residual = hparams.f_residual_scale != 0.0f;

    if (scale_residual) {
        cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
    }
    ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
    cb(ffn_inp, "ffn_inp", il);

    // both FFN variants start from the same RMS-normed input
    cur = build_norm(ffn_inp, layer.ffn_norm, nullptr, LLM_NORM_RMS, il);
    cb(cur, "ffn_norm", il);

    if (layer.ffn_gate_inp != nullptr) {
        // MoE: routed experts
        ggml_tensor * moe_out = build_moe_ffn(cur,
                layer.ffn_gate_inp,
                layer.ffn_up_exps,
                layer.ffn_gate_exps,
                layer.ffn_down_exps,
                nullptr,
                n_expert, n_expert_used,
                LLM_FFN_SILU, true,
                false, 0.0,
                LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                il);
        cb(moe_out, "ffn_moe_out", il);

        if (hparams.n_ff_shexp > 0) {
            // Granite MoE Shared: add the shared expert, computed from the same normed input
            ggml_tensor * ffn_shexp = build_ffn(cur,
                    layer.ffn_up_shexp,   nullptr, nullptr,
                    layer.ffn_gate_shexp, nullptr, nullptr,
                    layer.ffn_down_shexp, nullptr, nullptr,
                    nullptr,
                    LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(ffn_shexp, "ffn_shexp", il);

            cur = ggml_add(ctx0, moe_out, ffn_shexp);
            cb(cur, "ffn_out", il);
        } else {
            cur = moe_out;
        }
    } else {
        // dense feed-forward network (non-MoE)
        cur = build_ffn(cur,
                layer.ffn_up,   layer.ffn_up_b,   nullptr,
                layer.ffn_gate, layer.ffn_gate_b, nullptr,
                layer.ffn_down, layer.ffn_down_b, nullptr,
                nullptr,
                LLM_FFN_SILU, LLM_FFN_PAR, il);
        cb(cur, "ffn_out", il);
    }

    // second residual branch, scaled the same way as the first
    if (scale_residual) {
        cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
    }
    cur = ggml_add(ctx0, cur, ffn_inp);
    cb(cur, "ffn_out", il);

    cur = build_cvec(cur, il);
    cb(cur, "l_out", il);

    return cur;
}
|
|
14198
|
+
};
|
|
14199
|
+
|
|
14200
|
+
struct llm_build_granite_hybrid : public llm_graph_context_mamba {
|
|
14201
|
+
|
|
14202
|
+
llm_build_granite_hybrid(
|
|
14203
|
+
const llama_model & model,
|
|
14204
|
+
const llm_graph_params & params,
|
|
14205
|
+
ggml_cgraph * gf) :
|
|
14206
|
+
llm_graph_context_mamba(params) {
|
|
14207
|
+
|
|
14208
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
14209
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
14210
|
+
|
|
14211
|
+
ggml_tensor * cur;
|
|
14212
|
+
ggml_tensor * inpL;
|
|
14213
|
+
|
|
14214
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
14215
|
+
|
|
14216
|
+
auto * inp = build_inp_mem_hybrid();
|
|
14217
|
+
|
|
14218
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
14219
|
+
|
|
14220
|
+
// Positional embeddings populated if rope enabled
|
|
14221
|
+
ggml_tensor * inp_pos = nullptr;
|
|
14222
|
+
if (hparams.rope_finetuned) {
|
|
14223
|
+
inp_pos = build_inp_pos();
|
|
14224
|
+
}
|
|
14225
|
+
|
|
14226
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
14227
|
+
struct ggml_tensor * inpSA = inpL;
|
|
14228
|
+
|
|
14229
|
+
// norm
|
|
14230
|
+
cur = build_norm(inpL,
|
|
14231
|
+
model.layers[il].attn_norm, NULL,
|
|
14232
|
+
LLM_NORM_RMS, il);
|
|
14233
|
+
cb(cur, "attn_norm", il);
|
|
14234
|
+
|
|
14235
|
+
if (hparams.is_recurrent(il)) {
|
|
14236
|
+
// ssm layer //
|
|
14237
|
+
cur = build_mamba2_layer(inp->get_recr(), gf, cur, model, ubatch, il);
|
|
14238
|
+
} else {
|
|
14239
|
+
// attention layer //
|
|
14240
|
+
cur = build_attention_layer(
|
|
14241
|
+
gf, cur, inp_pos, inp->get_attn(), model,
|
|
14242
|
+
n_embd_head, il);
|
|
14243
|
+
}
|
|
14244
|
+
|
|
14245
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
14246
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
14247
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
14248
|
+
}
|
|
14249
|
+
|
|
14250
|
+
// ffn
|
|
14251
|
+
cur = build_layer_ffn(cur, inpSA, model, il);
|
|
14252
|
+
|
|
14253
|
+
// input for next layer
|
|
14254
|
+
inpL = cur;
|
|
14255
|
+
}
|
|
14256
|
+
|
|
14257
|
+
cur = inpL;
|
|
14258
|
+
|
|
14259
|
+
cur = build_norm(cur,
|
|
14260
|
+
model.output_norm, NULL,
|
|
14261
|
+
LLM_NORM_RMS, -1);
|
|
14262
|
+
|
|
14263
|
+
cb(cur, "result_norm", -1);
|
|
14264
|
+
res->t_embd = cur;
|
|
14265
|
+
|
|
14266
|
+
// lm_head
|
|
14267
|
+
cur = build_lora_mm(model.output, cur);
|
|
14268
|
+
|
|
14269
|
+
// For Granite architectures - scale logits
|
|
14270
|
+
if (hparams.f_logit_scale) {
|
|
14271
|
+
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
|
|
14272
|
+
}
|
|
14273
|
+
cb(cur, "result_output", -1);
|
|
14274
|
+
res->t_logits = cur;
|
|
14275
|
+
|
|
14276
|
+
ggml_build_forward_expand(gf, cur);
|
|
14277
|
+
}
|
|
14278
|
+
|
|
14279
|
+
ggml_tensor * build_attention_layer(
|
|
14280
|
+
ggml_cgraph * gf,
|
|
14281
|
+
ggml_tensor * cur,
|
|
14282
|
+
ggml_tensor * inp_pos,
|
|
14283
|
+
llm_graph_input_attn_kv_unified * inp_attn,
|
|
14284
|
+
const llama_model & model,
|
|
14285
|
+
const int64_t n_embd_head,
|
|
14286
|
+
const int il) {
|
|
14287
|
+
|
|
14288
|
+
// compute Q and K and (optionally) RoPE them
|
|
14289
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
14290
|
+
cb(Qcur, "Qcur", il);
|
|
14291
|
+
if (model.layers[il].bq) {
|
|
14292
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
14293
|
+
cb(Qcur, "Qcur", il);
|
|
14294
|
+
}
|
|
14295
|
+
|
|
14296
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
14297
|
+
cb(Kcur, "Kcur", il);
|
|
14298
|
+
if (model.layers[il].bk) {
|
|
14299
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
14300
|
+
cb(Kcur, "Kcur", il);
|
|
14301
|
+
}
|
|
14302
|
+
|
|
14303
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
14304
|
+
cb(Vcur, "Vcur", il);
|
|
14305
|
+
if (model.layers[il].bv) {
|
|
14306
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
14307
|
+
cb(Vcur, "Vcur", il);
|
|
14308
|
+
}
|
|
14309
|
+
|
|
14310
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
|
|
14311
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
|
|
14312
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
|
|
14313
|
+
|
|
14314
|
+
const bool use_rope = hparams.rope_finetuned;
|
|
14315
|
+
if (use_rope) {
|
|
14316
|
+
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
|
14317
|
+
Qcur = ggml_rope_ext(
|
|
14318
|
+
ctx0, Qcur, inp_pos, rope_factors,
|
|
14319
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
14320
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
14321
|
+
);
|
|
14322
|
+
|
|
14323
|
+
Kcur = ggml_rope_ext(
|
|
14324
|
+
ctx0, Kcur, inp_pos, rope_factors,
|
|
14325
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
14326
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
14327
|
+
);
|
|
14328
|
+
}
|
|
14329
|
+
|
|
14330
|
+
cb(Qcur, "Qcur", il);
|
|
14331
|
+
cb(Kcur, "Kcur", il);
|
|
14332
|
+
cb(Vcur, "Vcur", il);
|
|
14333
|
+
|
|
14334
|
+
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
14335
|
+
cur = build_attn(inp_attn, gf,
|
|
14336
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
14337
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
14338
|
+
cb(cur, "attn_out", il);
|
|
14339
|
+
return cur;
|
|
14340
|
+
}
|
|
14341
|
+
|
|
14342
|
+
ggml_tensor * build_layer_ffn(
|
|
14343
|
+
ggml_tensor * cur,
|
|
14344
|
+
ggml_tensor * inpSA,
|
|
14345
|
+
const llama_model & model,
|
|
14346
|
+
const int il) {
|
|
14347
|
+
|
|
14348
|
+
// For Granite architectures - scale residual
|
|
14349
|
+
if (hparams.f_residual_scale) {
|
|
14350
|
+
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
|
|
14351
|
+
}
|
|
14352
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
14353
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
14354
|
+
|
|
14355
|
+
// feed-forward network (non-MoE)
|
|
14356
|
+
if (model.layers[il].ffn_gate_inp == nullptr) {
|
|
14357
|
+
|
|
14358
|
+
cur = build_norm(ffn_inp,
|
|
14359
|
+
model.layers[il].ffn_norm, NULL,
|
|
14360
|
+
LLM_NORM_RMS, il);
|
|
14361
|
+
cb(cur, "ffn_norm", il);
|
|
14362
|
+
|
|
14363
|
+
cur = build_ffn(cur,
|
|
14364
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
|
14365
|
+
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
|
|
14366
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
|
14367
|
+
NULL,
|
|
14368
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
14369
|
+
cb(cur, "ffn_out", il);
|
|
14370
|
+
|
|
14371
|
+
} else {
|
|
14372
|
+
// MoE branch
|
|
14373
|
+
cur = build_norm(ffn_inp,
|
|
14374
|
+
model.layers[il].ffn_norm, NULL,
|
|
14375
|
+
LLM_NORM_RMS, il);
|
|
14376
|
+
cb(cur, "ffn_norm", il);
|
|
14377
|
+
|
|
14378
|
+
ggml_tensor * moe_out = build_moe_ffn(cur,
|
|
14379
|
+
model.layers[il].ffn_gate_inp,
|
|
14380
|
+
model.layers[il].ffn_up_exps,
|
|
14381
|
+
model.layers[il].ffn_gate_exps,
|
|
14382
|
+
model.layers[il].ffn_down_exps,
|
|
14383
|
+
nullptr,
|
|
14384
|
+
n_expert, n_expert_used,
|
|
14385
|
+
LLM_FFN_SILU, true,
|
|
14386
|
+
false, 0.0,
|
|
14387
|
+
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
|
14388
|
+
il);
|
|
14389
|
+
cb(moe_out, "ffn_moe_out", il);
|
|
14390
|
+
|
|
14391
|
+
// For Granite MoE Shared
|
|
14392
|
+
if (hparams.n_ff_shexp > 0) {
|
|
14393
|
+
ggml_tensor * ffn_shexp = build_ffn(cur,
|
|
14394
|
+
model.layers[il].ffn_up_shexp, NULL, NULL,
|
|
14395
|
+
model.layers[il].ffn_gate_shexp, NULL, NULL,
|
|
14396
|
+
model.layers[il].ffn_down_shexp, NULL, NULL,
|
|
14397
|
+
NULL,
|
|
14398
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
14399
|
+
cb(ffn_shexp, "ffn_shexp", il);
|
|
14400
|
+
|
|
14401
|
+
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
|
14402
|
+
cb(cur, "ffn_out", il);
|
|
14403
|
+
} else {
|
|
14404
|
+
cur = moe_out;
|
|
14405
|
+
}
|
|
14406
|
+
}
|
|
14407
|
+
|
|
14408
|
+
// For Granite architectures - scale residual
|
|
14409
|
+
if (hparams.f_residual_scale) {
|
|
14410
|
+
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
|
|
14411
|
+
}
|
|
14412
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
14413
|
+
cb(cur, "ffn_out", il);
|
|
14414
|
+
|
|
14415
|
+
cur = build_cvec(cur, il);
|
|
14416
|
+
cb(cur, "l_out", il);
|
|
14417
|
+
|
|
14418
|
+
return cur;
|
|
14419
|
+
}
|
|
14420
|
+
};
|
|
14421
|
+
|
|
14422
|
+
// ref: https://github.com/facebookresearch/chameleon
|
|
14423
|
+
// based on the original build_llama() function, changes:
|
|
14424
|
+
// * qk-norm
|
|
14425
|
+
// * swin-norm
|
|
14426
|
+
// * removed bias
|
|
14427
|
+
// * removed MoE
|
|
14428
|
+
struct llm_build_chameleon : public llm_graph_context {
|
|
14429
|
+
llm_build_chameleon(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
14430
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
14431
|
+
|
|
14432
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
14433
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
14434
|
+
|
|
14435
|
+
ggml_tensor * cur;
|
|
14436
|
+
ggml_tensor * inpL;
|
|
14437
|
+
|
|
14438
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
14439
|
+
|
|
14440
|
+
// inp_pos - contains the positions
|
|
14441
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
14442
|
+
|
|
14443
|
+
auto * inp_attn = build_attn_inp_kv_unified();
|
|
14444
|
+
|
|
14445
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
14446
|
+
|
|
14447
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
14448
|
+
ggml_tensor * inpSA = inpL;
|
|
14449
|
+
|
|
14450
|
+
// norm
|
|
14451
|
+
if (hparams.swin_norm) {
|
|
14452
|
+
cur = inpL;
|
|
14453
|
+
} else {
|
|
14454
|
+
cur = build_norm(inpL,
|
|
14455
|
+
model.layers[il].attn_norm, NULL,
|
|
14456
|
+
LLM_NORM_RMS, il);
|
|
14457
|
+
cb(cur, "attn_norm", il);
|
|
14458
|
+
}
|
|
14459
|
+
|
|
14460
|
+
// self-attention
|
|
14461
|
+
{
|
|
14462
|
+
// compute Q and K and RoPE them
|
|
14463
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
14464
|
+
cb(Qcur, "Qcur", il);
|
|
14465
|
+
|
|
14466
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
14467
|
+
cb(Kcur, "Kcur", il);
|
|
14468
|
+
|
|
14469
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
14470
|
+
cb(Vcur, "Vcur", il);
|
|
14471
|
+
|
|
14472
|
+
if (model.layers[il].attn_q_norm) {
|
|
14473
|
+
Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
|
|
14474
|
+
ggml_element_size(Qcur) * n_embd_head,
|
|
14475
|
+
ggml_element_size(Qcur) * n_embd_head * n_head,
|
|
14476
|
+
0);
|
|
14477
|
+
cb(Qcur, "Qcur", il);
|
|
14478
|
+
|
|
14479
|
+
Qcur = build_norm(Qcur,
|
|
14480
|
+
model.layers[il].attn_q_norm,
|
|
14481
|
+
model.layers[il].attn_q_norm_b,
|
|
14482
|
+
LLM_NORM, il);
|
|
14483
|
+
cb(Qcur, "Qcur", il);
|
|
14484
|
+
}
|
|
14485
|
+
|
|
14486
|
+
if (model.layers[il].attn_k_norm) {
|
|
14487
|
+
Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
|
|
14488
|
+
ggml_element_size(Kcur) * n_embd_head,
|
|
14489
|
+
ggml_element_size(Kcur) * n_embd_head * n_head_kv,
|
|
14490
|
+
0);
|
|
14491
|
+
cb(Kcur, "Kcur", il);
|
|
14492
|
+
|
|
14493
|
+
Kcur = build_norm(Kcur,
|
|
14494
|
+
model.layers[il].attn_k_norm,
|
|
14495
|
+
model.layers[il].attn_k_norm_b,
|
|
14496
|
+
LLM_NORM, il);
|
|
14497
|
+
cb(Kcur, "Kcur", il);
|
|
14498
|
+
}
|
|
14499
|
+
|
|
14500
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
14501
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
14502
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
14503
|
+
|
|
14504
|
+
Qcur = ggml_rope_ext(
|
|
14505
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
14506
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
14507
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
14508
|
+
);
|
|
14509
|
+
|
|
14510
|
+
Kcur = ggml_rope_ext(
|
|
14511
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
14512
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
14513
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
14514
|
+
);
|
|
14515
|
+
|
|
14516
|
+
cb(Qcur, "Qcur", il);
|
|
14517
|
+
cb(Kcur, "Kcur", il);
|
|
14518
|
+
cb(Vcur, "Vcur", il);
|
|
14519
|
+
|
|
14520
|
+
cur = build_attn(inp_attn, gf,
|
|
14521
|
+
model.layers[il].wo, nullptr,
|
|
14522
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
14523
|
+
}
|
|
14524
|
+
|
|
14525
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
14526
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
14527
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
14528
|
+
}
|
|
14529
|
+
|
|
14530
|
+
if (hparams.swin_norm) {
|
|
14531
|
+
cur = build_norm(cur,
|
|
14532
|
+
model.layers[il].attn_norm, NULL,
|
|
14533
|
+
LLM_NORM_RMS, il);
|
|
14534
|
+
}
|
|
14535
|
+
|
|
14536
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
14537
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
14538
|
+
|
|
14539
|
+
// feed-forward network
|
|
14540
|
+
if (!hparams.swin_norm) {
|
|
14541
|
+
cur = build_norm(ffn_inp,
|
|
14542
|
+
model.layers[il].ffn_norm, NULL,
|
|
14543
|
+
LLM_NORM_RMS, il);
|
|
14544
|
+
cb(cur, "ffn_norm", il);
|
|
14545
|
+
}
|
|
14546
|
+
|
|
14547
|
+
cur = build_ffn(cur,
|
|
14548
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
14549
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
|
14550
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
14551
|
+
NULL,
|
|
14552
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
14553
|
+
cb(cur, "ffn_out", il);
|
|
14554
|
+
|
|
14555
|
+
if (hparams.swin_norm) {
|
|
14556
|
+
cur = build_norm(cur,
|
|
14557
|
+
model.layers[il].ffn_norm, NULL,
|
|
14558
|
+
LLM_NORM_RMS, il);
|
|
14559
|
+
cb(cur, "ffn_norm", il);
|
|
14560
|
+
}
|
|
14561
|
+
|
|
14562
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
14563
|
+
cb(cur, "ffn_out", il);
|
|
14564
|
+
|
|
14565
|
+
cur = build_cvec(cur, il);
|
|
14566
|
+
cb(cur, "l_out", il);
|
|
14567
|
+
|
|
14568
|
+
// input for next layer
|
|
14569
|
+
inpL = cur;
|
|
14570
|
+
}
|
|
14571
|
+
|
|
14572
|
+
cur = inpL;
|
|
14573
|
+
|
|
14574
|
+
cur = build_norm(cur,
|
|
14575
|
+
model.output_norm, NULL,
|
|
14576
|
+
LLM_NORM_RMS, -1);
|
|
14577
|
+
|
|
14578
|
+
cb(cur, "result_norm", -1);
|
|
14579
|
+
res->t_embd = cur;
|
|
14580
|
+
|
|
14581
|
+
// lm_head
|
|
14582
|
+
cur = build_lora_mm(model.output, cur);
|
|
14583
|
+
cb(cur, "result_output_with_img_logits", -1);
|
|
14584
|
+
|
|
14585
|
+
// TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
|
|
14586
|
+
// Needs to be removed once image outputs are supported.
|
|
14587
|
+
int img_token_end_idx = 8196;
|
|
14588
|
+
int img_token_start_idx = 4;
|
|
14589
|
+
int num_img_tokens = img_token_end_idx - img_token_start_idx;
|
|
14590
|
+
// creates 1d tensor of size num_img_tokens and values -FLT_MAX,
|
|
14591
|
+
// which ensures that text token values are always at least larger than image token values
|
|
14592
|
+
ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
|
|
14593
|
+
img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX);
|
|
14594
|
+
cb(img_logits, "img_logits", -1);
|
|
14595
|
+
|
|
14596
|
+
cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
|
|
14597
|
+
|
|
14598
|
+
cb(cur, "result_output", -1);
|
|
14599
|
+
res->t_logits = cur;
|
|
14600
|
+
|
|
14601
|
+
ggml_build_forward_expand(gf, cur);
|
|
14602
|
+
}
|
|
14603
|
+
};
|
|
14604
|
+
|
|
14605
|
+
struct llm_build_wavtokenizer_dec : public llm_graph_context {
|
|
14606
|
+
llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
14607
|
+
ggml_tensor * cur;
|
|
14608
|
+
ggml_tensor * inpL;
|
|
14609
|
+
|
|
14610
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
14611
|
+
|
|
14612
|
+
cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL));
|
|
14613
|
+
|
|
14614
|
+
cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1);
|
|
14615
|
+
cur = ggml_add(ctx0, cur, model.conv1d_b);
|
|
14616
|
+
|
|
14617
|
+
// posnet
|
|
14618
|
+
for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
|
|
14619
|
+
const auto & layer = model.layers[il].posnet;
|
|
14620
|
+
|
|
14621
|
+
inpL = cur;
|
|
14622
|
+
|
|
14623
|
+
switch (il) {
|
|
14624
|
+
case 0:
|
|
14625
|
+
case 1:
|
|
14626
|
+
case 3:
|
|
14627
|
+
case 4:
|
|
14628
|
+
{
|
|
14629
|
+
cur = build_norm(cur,
|
|
14630
|
+
layer.norm1,
|
|
14631
|
+
layer.norm1_b,
|
|
14632
|
+
LLM_NORM_GROUP, 0);
|
|
14633
|
+
|
|
14634
|
+
cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
|
|
14635
|
+
|
|
14636
|
+
cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1);
|
|
14637
|
+
cur = ggml_add(ctx0, cur, layer.conv1_b);
|
|
14638
|
+
|
|
14639
|
+
cur = build_norm(cur,
|
|
14640
|
+
layer.norm2,
|
|
14641
|
+
layer.norm2_b,
|
|
14642
|
+
LLM_NORM_GROUP, 0);
|
|
14643
|
+
|
|
14644
|
+
cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
|
|
14645
|
+
|
|
14646
|
+
cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1);
|
|
14647
|
+
cur = ggml_add(ctx0, cur, layer.conv2_b);
|
|
14648
|
+
|
|
14649
|
+
cur = ggml_add(ctx0, cur, inpL);
|
|
14650
|
+
} break;
|
|
14651
|
+
case 2:
|
|
14652
|
+
{
|
|
14653
|
+
cur = build_norm(cur,
|
|
14654
|
+
layer.attn_norm,
|
|
14655
|
+
layer.attn_norm_b,
|
|
14656
|
+
LLM_NORM_GROUP, 0);
|
|
14657
|
+
|
|
14658
|
+
ggml_tensor * q;
|
|
14659
|
+
ggml_tensor * k;
|
|
14660
|
+
ggml_tensor * v;
|
|
14661
|
+
|
|
14662
|
+
q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1);
|
|
14663
|
+
k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
|
|
14664
|
+
v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1);
|
|
14665
|
+
|
|
14666
|
+
q = ggml_add(ctx0, q, layer.attn_q_b);
|
|
14667
|
+
k = ggml_add(ctx0, k, layer.attn_k_b);
|
|
14668
|
+
v = ggml_add(ctx0, v, layer.attn_v_b);
|
|
14669
|
+
|
|
14670
|
+
q = ggml_cont(ctx0, ggml_transpose(ctx0, q));
|
|
14671
|
+
k = ggml_cont(ctx0, ggml_transpose(ctx0, k));
|
|
14672
|
+
|
|
14673
|
+
ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
|
|
14674
|
+
|
|
14675
|
+
kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f);
|
|
14676
|
+
|
|
14677
|
+
cur = ggml_mul_mat(ctx0, kq, v);
|
|
14678
|
+
|
|
14679
|
+
cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1);
|
|
14680
|
+
cur = ggml_add(ctx0, cur, layer.attn_o_b);
|
|
14681
|
+
|
|
14682
|
+
cur = ggml_add(ctx0, cur, inpL);
|
|
14683
|
+
} break;
|
|
14684
|
+
case 5:
|
|
14685
|
+
{
|
|
14686
|
+
cur = build_norm(cur,
|
|
14687
|
+
layer.norm,
|
|
14688
|
+
layer.norm_b,
|
|
14689
|
+
LLM_NORM_GROUP, 0);
|
|
14690
|
+
} break;
|
|
14691
|
+
default: GGML_ABORT("unknown posnet layer");
|
|
14692
|
+
};
|
|
14693
|
+
}
|
|
14694
|
+
|
|
14695
|
+
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
|
|
14696
|
+
|
|
14697
|
+
cur = build_norm(cur,
|
|
14698
|
+
model.tok_norm,
|
|
14699
|
+
model.tok_norm_b,
|
|
14700
|
+
LLM_NORM, -1);
|
|
14701
|
+
|
|
14702
|
+
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
|
|
14703
|
+
|
|
14704
|
+
inpL = cur;
|
|
14705
|
+
|
|
14706
|
+
// convnext
|
|
14707
|
+
for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) {
|
|
14708
|
+
const auto & layer = model.layers[il].convnext;
|
|
14709
|
+
|
|
14710
|
+
cur = inpL;
|
|
14711
|
+
|
|
14712
|
+
cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1);
|
|
14713
|
+
cur = ggml_add(ctx0, cur, layer.dw_b);
|
|
14714
|
+
|
|
14715
|
+
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
|
|
14716
|
+
|
|
14717
|
+
cur = build_norm(cur,
|
|
14718
|
+
layer.norm,
|
|
14719
|
+
layer.norm_b,
|
|
14720
|
+
LLM_NORM, -1);
|
|
14721
|
+
|
|
14722
|
+
cur = build_ffn(cur,
|
|
14723
|
+
layer.pw1, layer.pw1_b, NULL,
|
|
14724
|
+
NULL, NULL, NULL,
|
|
14725
|
+
layer.pw2, layer.pw2_b, NULL,
|
|
14726
|
+
NULL,
|
|
14727
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, il);
|
|
14728
|
+
|
|
14729
|
+
cur = ggml_mul(ctx0, cur, layer.gamma);
|
|
14730
|
+
|
|
14731
|
+
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
|
|
14732
|
+
|
|
14733
|
+
inpL = ggml_add(ctx0, cur, inpL);
|
|
14734
|
+
}
|
|
14735
|
+
|
|
14736
|
+
cur = inpL;
|
|
14737
|
+
|
|
14738
|
+
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
|
|
14739
|
+
|
|
14740
|
+
cur = build_norm(cur,
|
|
14741
|
+
model.output_norm,
|
|
14742
|
+
model.output_norm_b,
|
|
14743
|
+
LLM_NORM, -1);
|
|
14744
|
+
|
|
14745
|
+
// lm_head
|
|
14746
|
+
cur = build_lora_mm(model.output, cur);
|
|
14747
|
+
|
|
14748
|
+
cur = ggml_add(ctx0, cur, model.output_b);
|
|
14749
|
+
|
|
14750
|
+
cb(cur, "result_embd", -1);
|
|
14751
|
+
res->t_embd = cur;
|
|
14752
|
+
|
|
14753
|
+
ggml_build_forward_expand(gf, cur);
|
|
14754
|
+
}
|
|
14755
|
+
};
|
|
14756
|
+
|
|
14757
|
+
struct llm_build_plm : public llm_graph_context {
|
|
14758
|
+
llm_build_plm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
14759
|
+
const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));
|
|
14760
|
+
|
|
14761
|
+
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
|
|
14762
|
+
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
|
|
14763
|
+
const uint32_t kv_lora_rank = hparams.n_lora_kv;
|
|
14764
|
+
|
|
14765
|
+
ggml_tensor * cur;
|
|
14766
|
+
ggml_tensor * inpL;
|
|
14767
|
+
|
|
14768
|
+
// {n_embd, n_tokens}
|
|
14769
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
14770
|
+
|
|
14771
|
+
// inp_pos - contains the positions
|
|
14772
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
14773
|
+
|
|
14774
|
+
auto * inp_attn = build_attn_inp_kv_unified();
|
|
14775
|
+
|
|
14776
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
14777
|
+
|
|
14778
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
14779
|
+
ggml_tensor * inpSA = inpL;
|
|
14780
|
+
|
|
14781
|
+
// norm
|
|
14782
|
+
cur = build_norm(inpL,
|
|
14783
|
+
model.layers[il].attn_norm, NULL,
|
|
14784
|
+
LLM_NORM_RMS, il);
|
|
14785
|
+
cb(cur, "attn_norm", il);
|
|
14786
|
+
|
|
14787
|
+
// self_attention
|
|
14788
|
+
{
|
|
14789
|
+
ggml_tensor * q = NULL;
|
|
14790
|
+
q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
|
14791
|
+
cb(q, "q", il);
|
|
14792
|
+
|
|
14793
|
+
// split into {n_head * n_embd_head_qk_nope, n_tokens}
|
|
14794
|
+
ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
|
|
14795
|
+
ggml_row_size(q->type, hparams.n_embd_head_k),
|
|
14796
|
+
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
|
|
14797
|
+
0);
|
|
14798
|
+
cb(q_nope, "q_nope", il);
|
|
14799
|
+
|
|
14800
|
+
// and {n_head * n_embd_head_qk_rope, n_tokens}
|
|
14801
|
+
ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
|
|
14802
|
+
ggml_row_size(q->type, hparams.n_embd_head_k),
|
|
14803
|
+
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
|
|
14804
|
+
ggml_row_size(q->type, n_embd_head_qk_nope));
|
|
14805
|
+
cb(q_pe, "q_pe", il);
|
|
14806
|
+
|
|
14807
|
+
// {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
|
|
14808
|
+
ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
|
|
14809
|
+
cb(kv_pe_compresseed, "kv_pe_compresseed", il);
|
|
14810
|
+
|
|
14811
|
+
// split into {kv_lora_rank, n_tokens}
|
|
14812
|
+
ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
|
|
14813
|
+
kv_pe_compresseed->nb[1],
|
|
14814
|
+
0);
|
|
14815
|
+
cb(kv_compressed, "kv_compressed", il);
|
|
14816
|
+
|
|
14817
|
+
// and {n_embd_head_qk_rope, n_tokens}
|
|
14818
|
+
ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
|
|
14819
|
+
kv_pe_compresseed->nb[1],
|
|
14820
|
+
kv_pe_compresseed->nb[1],
|
|
14821
|
+
ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
|
|
14822
|
+
cb(k_pe, "k_pe", il);
|
|
14823
|
+
|
|
14824
|
+
kv_compressed = build_norm(kv_compressed,
|
|
14825
|
+
model.layers[il].attn_kv_a_norm, NULL,
|
|
14826
|
+
LLM_NORM_RMS, il);
|
|
14827
|
+
cb(kv_compressed, "kv_compressed", il);
|
|
14828
|
+
|
|
14829
|
+
// {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
|
|
14830
|
+
ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
|
|
14831
|
+
cb(kv, "kv", il);
|
|
14832
|
+
|
|
14833
|
+
// split into {n_head * n_embd_head_qk_nope, n_tokens}
|
|
14834
|
+
ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
|
|
14835
|
+
ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
|
|
14836
|
+
ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
|
|
14837
|
+
0);
|
|
14838
|
+
cb(k_nope, "k_nope", il);
|
|
14839
|
+
|
|
14840
|
+
// and {n_head * n_embd_head_v, n_tokens}
|
|
14841
|
+
ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
|
|
14842
|
+
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
|
|
14843
|
+
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
|
|
14844
|
+
ggml_row_size(kv->type, (n_embd_head_qk_nope)));
|
|
14845
|
+
cb(v_states, "v_states", il);
|
|
14846
|
+
|
|
14847
|
+
v_states = ggml_cont(ctx0, v_states);
|
|
14848
|
+
cb(v_states, "v_states", il);
|
|
14849
|
+
|
|
14850
|
+
v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
|
|
14851
|
+
ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
|
|
14852
|
+
0);
|
|
14853
|
+
cb(v_states, "v_states", il);
|
|
14854
|
+
|
|
14855
|
+
q_pe = ggml_rope_ext(
|
|
14856
|
+
ctx0, q_pe, inp_pos, nullptr,
|
|
14857
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
14858
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
14859
|
+
);
|
|
14860
|
+
cb(q_pe, "q_pe", il);
|
|
14861
|
+
|
|
14862
|
+
// shared RoPE key
|
|
14863
|
+
k_pe = ggml_rope_ext(
|
|
14864
|
+
ctx0, k_pe, inp_pos, nullptr,
|
|
14865
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
14866
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
14867
|
+
);
|
|
14868
|
+
cb(k_pe, "k_pe", il);
|
|
14869
|
+
|
|
14870
|
+
ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
|
|
14871
|
+
cb(q_states, "q_states", il);
|
|
14872
|
+
|
|
14873
|
+
ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
|
|
14874
|
+
cb(k_states, "k_states", il);
|
|
14875
|
+
|
|
14876
|
+
cur = build_attn(inp_attn, gf,
|
|
14877
|
+
model.layers[il].wo, NULL,
|
|
14878
|
+
q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
|
|
14879
|
+
}
|
|
14880
|
+
|
|
14881
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
14882
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
14883
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
14884
|
+
}
|
|
14885
|
+
|
|
14886
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
14887
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
14888
|
+
|
|
14889
|
+
cur = build_norm(ffn_inp,
|
|
14890
|
+
model.layers[il].ffn_norm, NULL,
|
|
14891
|
+
LLM_NORM_RMS, il);
|
|
14892
|
+
cb(cur, "ffn_norm", il);
|
|
14893
|
+
|
|
14894
|
+
cur = build_ffn(cur,
|
|
14895
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
14896
|
+
NULL, NULL, NULL,
|
|
14897
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
14898
|
+
NULL,
|
|
14899
|
+
LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
|
|
14900
|
+
cb(cur, "ffn_out", il);
|
|
14901
|
+
|
|
14902
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
14903
|
+
|
|
14904
|
+
cur = build_cvec(cur, il);
|
|
14905
|
+
cb(cur, "l_out", il);
|
|
14906
|
+
|
|
14907
|
+
// input for next layer
|
|
14908
|
+
inpL = cur;
|
|
14909
|
+
}
|
|
14910
|
+
|
|
14911
|
+
cur = inpL;
|
|
14912
|
+
|
|
14913
|
+
cur = build_norm(cur,
|
|
14914
|
+
model.output_norm, NULL,
|
|
14915
|
+
LLM_NORM_RMS, -1);
|
|
14916
|
+
|
|
14917
|
+
cb(cur, "result_norm", -1);
|
|
14918
|
+
res->t_embd = cur;
|
|
14919
|
+
|
|
14920
|
+
cur = build_lora_mm(model.output, cur);
|
|
14921
|
+
|
|
14922
|
+
cb(cur, "result_output", -1);
|
|
14923
|
+
res->t_logits = cur;
|
|
14924
|
+
|
|
14925
|
+
ggml_build_forward_expand(gf, cur);
|
|
14926
|
+
}
|
|
14927
|
+
};
|
|
14928
|
+
|
|
14929
|
+
struct llm_build_bailingmoe : public llm_graph_context {
|
|
14930
|
+
llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
14931
|
+
ggml_tensor * cur;
|
|
14932
|
+
ggml_tensor * inpL;
|
|
14933
|
+
|
|
14934
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
14935
|
+
|
|
14936
|
+
// inp_pos - contains the positions
|
|
14937
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
14938
|
+
|
|
14939
|
+
auto * inp_attn = build_attn_inp_kv_unified();
|
|
14940
|
+
|
|
14941
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
14942
|
+
|
|
14943
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
14944
|
+
ggml_tensor * inpSA = inpL;
|
|
14945
|
+
|
|
14946
|
+
// norm
|
|
14947
|
+
cur = build_norm(inpL,
|
|
14948
|
+
model.layers[il].attn_norm, NULL,
|
|
14949
|
+
LLM_NORM_RMS, il);
|
|
14950
|
+
cb(cur, "attn_norm", il);
|
|
14951
|
+
|
|
14952
|
+
// self-attention
|
|
14953
|
+
{
|
|
14954
|
+
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
|
14955
|
+
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
|
14956
|
+
|
|
14957
|
+
// compute Q and K and RoPE them
|
|
14958
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
14959
|
+
cb(Qcur, "Qcur", il);
|
|
14960
|
+
if (model.layers[il].bq) {
|
|
14961
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
14962
|
+
cb(Qcur, "Qcur", il);
|
|
14963
|
+
}
|
|
14964
|
+
|
|
14965
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
14966
|
+
cb(Kcur, "Kcur", il);
|
|
14967
|
+
if (model.layers[il].bk) {
|
|
14968
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
14969
|
+
cb(Kcur, "Kcur", il);
|
|
14970
|
+
}
|
|
13194
14971
|
|
|
13195
14972
|
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
13196
14973
|
cb(Vcur, "Vcur", il);
|
|
@@ -13199,24 +14976,21 @@ struct llm_build_granite : public llm_graph_context {
|
|
|
13199
14976
|
cb(Vcur, "Vcur", il);
|
|
13200
14977
|
}
|
|
13201
14978
|
|
|
13202
|
-
Qcur = ggml_reshape_3d(ctx0, Qcur,
|
|
13203
|
-
Kcur = ggml_reshape_3d(ctx0, Kcur,
|
|
13204
|
-
Vcur = ggml_reshape_3d(ctx0, Vcur,
|
|
14979
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
|
|
14980
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
|
|
14981
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
|
|
13205
14982
|
|
|
13206
|
-
|
|
13207
|
-
|
|
13208
|
-
|
|
13209
|
-
|
|
13210
|
-
|
|
13211
|
-
ext_factor, attn_factor, beta_fast, beta_slow
|
|
13212
|
-
);
|
|
14983
|
+
Qcur = ggml_rope_ext(
|
|
14984
|
+
ctx0, Qcur, inp_pos, rope_factors,
|
|
14985
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
14986
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
14987
|
+
);
|
|
13213
14988
|
|
|
13214
|
-
|
|
13215
|
-
|
|
13216
|
-
|
|
13217
|
-
|
|
13218
|
-
|
|
13219
|
-
}
|
|
14989
|
+
Kcur = ggml_rope_ext(
|
|
14990
|
+
ctx0, Kcur, inp_pos, rope_factors,
|
|
14991
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
14992
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
14993
|
+
);
|
|
13220
14994
|
|
|
13221
14995
|
cb(Qcur, "Qcur", il);
|
|
13222
14996
|
cb(Kcur, "Kcur", il);
|
|
@@ -13224,77 +14998,51 @@ struct llm_build_granite : public llm_graph_context {
|
|
|
13224
14998
|
|
|
13225
14999
|
cur = build_attn(inp_attn, gf,
|
|
13226
15000
|
model.layers[il].wo, model.layers[il].bo,
|
|
13227
|
-
Qcur, Kcur, Vcur, nullptr, nullptr,
|
|
13228
|
-
cb(cur, "attn_out", il);
|
|
15001
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
|
|
13229
15002
|
}
|
|
13230
15003
|
|
|
13231
15004
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
13232
15005
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
13233
|
-
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
13234
|
-
}
|
|
13235
|
-
|
|
13236
|
-
// For Granite architectures - scale residual
|
|
13237
|
-
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
|
|
13238
|
-
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
13239
|
-
cb(ffn_inp, "ffn_inp", il);
|
|
13240
|
-
|
|
13241
|
-
// feed-forward network (non-MoE)
|
|
13242
|
-
if (model.layers[il].ffn_gate_inp == nullptr) {
|
|
13243
|
-
|
|
13244
|
-
cur = build_norm(ffn_inp,
|
|
13245
|
-
model.layers[il].ffn_norm, NULL,
|
|
13246
|
-
LLM_NORM_RMS, il);
|
|
13247
|
-
cb(cur, "ffn_norm", il);
|
|
13248
|
-
|
|
13249
|
-
cur = build_ffn(cur,
|
|
13250
|
-
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
|
13251
|
-
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
|
|
13252
|
-
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
|
13253
|
-
NULL,
|
|
13254
|
-
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
13255
|
-
cb(cur, "ffn_out", il);
|
|
15006
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
15007
|
+
}
|
|
13256
15008
|
|
|
13257
|
-
|
|
13258
|
-
|
|
13259
|
-
|
|
13260
|
-
|
|
13261
|
-
|
|
13262
|
-
|
|
15009
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
15010
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
15011
|
+
|
|
15012
|
+
cur = build_norm(ffn_inp,
|
|
15013
|
+
model.layers[il].ffn_norm, NULL,
|
|
15014
|
+
LLM_NORM_RMS, il);
|
|
15015
|
+
cb(cur, "ffn_norm", il);
|
|
13263
15016
|
|
|
13264
|
-
|
|
15017
|
+
ggml_tensor * moe_out =
|
|
15018
|
+
build_moe_ffn(cur,
|
|
13265
15019
|
model.layers[il].ffn_gate_inp,
|
|
13266
15020
|
model.layers[il].ffn_up_exps,
|
|
13267
15021
|
model.layers[il].ffn_gate_exps,
|
|
13268
15022
|
model.layers[il].ffn_down_exps,
|
|
13269
15023
|
nullptr,
|
|
13270
15024
|
n_expert, n_expert_used,
|
|
13271
|
-
LLM_FFN_SILU,
|
|
13272
|
-
false,
|
|
15025
|
+
LLM_FFN_SILU, hparams.expert_weights_norm,
|
|
15026
|
+
false, hparams.expert_weights_scale,
|
|
13273
15027
|
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
|
13274
15028
|
il);
|
|
13275
|
-
|
|
15029
|
+
cb(moe_out, "ffn_moe_out", il);
|
|
13276
15030
|
|
|
13277
|
-
|
|
13278
|
-
|
|
13279
|
-
|
|
15031
|
+
// FFN shared expert
|
|
15032
|
+
{
|
|
15033
|
+
ggml_tensor * ffn_shexp = build_ffn(cur,
|
|
13280
15034
|
model.layers[il].ffn_up_shexp, NULL, NULL,
|
|
13281
15035
|
model.layers[il].ffn_gate_shexp, NULL, NULL,
|
|
13282
15036
|
model.layers[il].ffn_down_shexp, NULL, NULL,
|
|
13283
15037
|
NULL,
|
|
13284
15038
|
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
13285
|
-
|
|
15039
|
+
cb(ffn_shexp, "ffn_shexp", il);
|
|
13286
15040
|
|
|
13287
|
-
|
|
13288
|
-
|
|
13289
|
-
} else {
|
|
13290
|
-
cur = moe_out;
|
|
13291
|
-
}
|
|
15041
|
+
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
|
15042
|
+
cb(cur, "ffn_out", il);
|
|
13292
15043
|
}
|
|
13293
15044
|
|
|
13294
|
-
// For Granite architectures - scale residual
|
|
13295
|
-
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
|
|
13296
15045
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
13297
|
-
cb(cur, "ffn_out", il);
|
|
13298
15046
|
|
|
13299
15047
|
cur = build_cvec(cur, il);
|
|
13300
15048
|
cb(cur, "l_out", il);
|
|
@@ -13315,8 +15063,6 @@ struct llm_build_granite : public llm_graph_context {
|
|
|
13315
15063
|
// lm_head
|
|
13316
15064
|
cur = build_lora_mm(model.output, cur);
|
|
13317
15065
|
|
|
13318
|
-
// For Granite architectures - scale logits
|
|
13319
|
-
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
|
|
13320
15066
|
cb(cur, "result_output", -1);
|
|
13321
15067
|
res->t_logits = cur;
|
|
13322
15068
|
|
|
@@ -13324,14 +15070,8 @@ struct llm_build_granite : public llm_graph_context {
|
|
|
13324
15070
|
}
|
|
13325
15071
|
};
|
|
13326
15072
|
|
|
13327
|
-
|
|
13328
|
-
|
|
13329
|
-
// * qk-norm
|
|
13330
|
-
// * swin-norm
|
|
13331
|
-
// * removed bias
|
|
13332
|
-
// * removed MoE
|
|
13333
|
-
struct llm_build_chameleon : public llm_graph_context {
|
|
13334
|
-
llm_build_chameleon(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
15073
|
+
struct llm_build_dots1 : public llm_graph_context {
|
|
15074
|
+
llm_build_dots1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
13335
15075
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
13336
15076
|
|
|
13337
15077
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -13353,16 +15093,12 @@ struct llm_build_chameleon : public llm_graph_context {
|
|
|
13353
15093
|
ggml_tensor * inpSA = inpL;
|
|
13354
15094
|
|
|
13355
15095
|
// norm
|
|
13356
|
-
|
|
13357
|
-
|
|
13358
|
-
|
|
13359
|
-
|
|
13360
|
-
model.layers[il].attn_norm, NULL,
|
|
13361
|
-
LLM_NORM_RMS, il);
|
|
13362
|
-
cb(cur, "attn_norm", il);
|
|
13363
|
-
}
|
|
15096
|
+
cur = build_norm(inpL,
|
|
15097
|
+
model.layers[il].attn_norm, NULL,
|
|
15098
|
+
LLM_NORM_RMS, il);
|
|
15099
|
+
cb(cur, "attn_norm", il);
|
|
13364
15100
|
|
|
13365
|
-
//
|
|
15101
|
+
// self_attention
|
|
13366
15102
|
{
|
|
13367
15103
|
// compute Q and K and RoPE them
|
|
13368
15104
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
@@ -13374,44 +15110,22 @@ struct llm_build_chameleon : public llm_graph_context {
|
|
|
13374
15110
|
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
13375
15111
|
cb(Vcur, "Vcur", il);
|
|
13376
15112
|
|
|
13377
|
-
if (model.layers[il].attn_q_norm) {
|
|
13378
|
-
Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
|
|
13379
|
-
ggml_element_size(Qcur) * n_embd_head,
|
|
13380
|
-
ggml_element_size(Qcur) * n_embd_head * n_head,
|
|
13381
|
-
0);
|
|
13382
|
-
cb(Qcur, "Qcur", il);
|
|
13383
|
-
|
|
13384
|
-
Qcur = build_norm(Qcur,
|
|
13385
|
-
model.layers[il].attn_q_norm,
|
|
13386
|
-
model.layers[il].attn_q_norm_b,
|
|
13387
|
-
LLM_NORM, il);
|
|
13388
|
-
cb(Qcur, "Qcur", il);
|
|
13389
|
-
}
|
|
13390
|
-
|
|
13391
|
-
if (model.layers[il].attn_k_norm) {
|
|
13392
|
-
Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
|
|
13393
|
-
ggml_element_size(Kcur) * n_embd_head,
|
|
13394
|
-
ggml_element_size(Kcur) * n_embd_head * n_head_kv,
|
|
13395
|
-
0);
|
|
13396
|
-
cb(Kcur, "Kcur", il);
|
|
13397
|
-
|
|
13398
|
-
Kcur = build_norm(Kcur,
|
|
13399
|
-
model.layers[il].attn_k_norm,
|
|
13400
|
-
model.layers[il].attn_k_norm_b,
|
|
13401
|
-
LLM_NORM, il);
|
|
13402
|
-
cb(Kcur, "Kcur", il);
|
|
13403
|
-
}
|
|
13404
|
-
|
|
13405
15113
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
13406
15114
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
13407
15115
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
13408
15116
|
|
|
15117
|
+
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
|
15118
|
+
cb(Qcur, "Qcur_normed", il);
|
|
15119
|
+
|
|
13409
15120
|
Qcur = ggml_rope_ext(
|
|
13410
15121
|
ctx0, Qcur, inp_pos, nullptr,
|
|
13411
15122
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
13412
15123
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
13413
15124
|
);
|
|
13414
15125
|
|
|
15126
|
+
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
|
15127
|
+
cb(Kcur, "Kcur_normed", il);
|
|
15128
|
+
|
|
13415
15129
|
Kcur = ggml_rope_ext(
|
|
13416
15130
|
ctx0, Kcur, inp_pos, nullptr,
|
|
13417
15131
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
@@ -13423,7 +15137,7 @@ struct llm_build_chameleon : public llm_graph_context {
|
|
|
13423
15137
|
cb(Vcur, "Vcur", il);
|
|
13424
15138
|
|
|
13425
15139
|
cur = build_attn(inp_attn, gf,
|
|
13426
|
-
model.layers[il].wo,
|
|
15140
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
13427
15141
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
13428
15142
|
}
|
|
13429
15143
|
|
|
@@ -13432,40 +15146,53 @@ struct llm_build_chameleon : public llm_graph_context {
|
|
|
13432
15146
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
13433
15147
|
}
|
|
13434
15148
|
|
|
13435
|
-
if (hparams.swin_norm) {
|
|
13436
|
-
cur = build_norm(cur,
|
|
13437
|
-
model.layers[il].attn_norm, NULL,
|
|
13438
|
-
LLM_NORM_RMS, il);
|
|
13439
|
-
}
|
|
13440
|
-
|
|
13441
15149
|
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
13442
15150
|
cb(ffn_inp, "ffn_inp", il);
|
|
13443
15151
|
|
|
13444
|
-
//
|
|
13445
|
-
|
|
13446
|
-
|
|
13447
|
-
|
|
13448
|
-
|
|
13449
|
-
cb(cur, "ffn_norm", il);
|
|
13450
|
-
}
|
|
15152
|
+
// MoE branch
|
|
15153
|
+
cur = build_norm(ffn_inp,
|
|
15154
|
+
model.layers[il].ffn_norm, NULL,
|
|
15155
|
+
LLM_NORM_RMS, il);
|
|
15156
|
+
cb(cur, "ffn_norm", il);
|
|
13451
15157
|
|
|
13452
|
-
|
|
13453
|
-
|
|
13454
|
-
|
|
13455
|
-
|
|
13456
|
-
|
|
13457
|
-
|
|
13458
|
-
|
|
15158
|
+
if ((uint32_t) il < hparams.n_layer_dense_lead) {
|
|
15159
|
+
cur = build_ffn(cur,
|
|
15160
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
15161
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
|
15162
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
15163
|
+
NULL,
|
|
15164
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
15165
|
+
cb(cur, "ffn_out", il);
|
|
15166
|
+
} else {
|
|
15167
|
+
ggml_tensor * moe_out =
|
|
15168
|
+
build_moe_ffn(cur,
|
|
15169
|
+
model.layers[il].ffn_gate_inp,
|
|
15170
|
+
model.layers[il].ffn_up_exps,
|
|
15171
|
+
model.layers[il].ffn_gate_exps,
|
|
15172
|
+
model.layers[il].ffn_down_exps,
|
|
15173
|
+
model.layers[il].ffn_exp_probs_b,
|
|
15174
|
+
n_expert, n_expert_used,
|
|
15175
|
+
LLM_FFN_SILU, hparams.expert_weights_norm,
|
|
15176
|
+
true, hparams.expert_weights_scale,
|
|
15177
|
+
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
|
15178
|
+
il);
|
|
15179
|
+
cb(moe_out, "ffn_moe_out", il);
|
|
13459
15180
|
|
|
13460
|
-
|
|
13461
|
-
|
|
13462
|
-
|
|
13463
|
-
|
|
13464
|
-
|
|
15181
|
+
{
|
|
15182
|
+
ggml_tensor * ffn_shexp = build_ffn(cur,
|
|
15183
|
+
model.layers[il].ffn_up_shexp, NULL, NULL,
|
|
15184
|
+
model.layers[il].ffn_gate_shexp, NULL, NULL,
|
|
15185
|
+
model.layers[il].ffn_down_shexp, NULL, NULL,
|
|
15186
|
+
NULL,
|
|
15187
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
15188
|
+
cb(ffn_shexp, "ffn_shexp", il);
|
|
15189
|
+
|
|
15190
|
+
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
|
15191
|
+
cb(cur, "ffn_out", il);
|
|
15192
|
+
}
|
|
13465
15193
|
}
|
|
13466
15194
|
|
|
13467
15195
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
13468
|
-
cb(cur, "ffn_out", il);
|
|
13469
15196
|
|
|
13470
15197
|
cur = build_cvec(cur, il);
|
|
13471
15198
|
cb(cur, "l_out", il);
|
|
@@ -13485,20 +15212,6 @@ struct llm_build_chameleon : public llm_graph_context {
|
|
|
13485
15212
|
|
|
13486
15213
|
// lm_head
|
|
13487
15214
|
cur = build_lora_mm(model.output, cur);
|
|
13488
|
-
cb(cur, "result_output_with_img_logits", -1);
|
|
13489
|
-
|
|
13490
|
-
// TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
|
|
13491
|
-
// Needs to be removed once image outputs are supported.
|
|
13492
|
-
int img_token_end_idx = 8196;
|
|
13493
|
-
int img_token_start_idx = 4;
|
|
13494
|
-
int num_img_tokens = img_token_end_idx - img_token_start_idx;
|
|
13495
|
-
// creates 1d tensor of size num_img_tokens and values -FLT_MAX,
|
|
13496
|
-
// which ensures that text token values are always at least larger than image token values
|
|
13497
|
-
ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
|
|
13498
|
-
img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX);
|
|
13499
|
-
cb(img_logits, "img_logits", -1);
|
|
13500
|
-
|
|
13501
|
-
cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
|
|
13502
15215
|
|
|
13503
15216
|
cb(cur, "result_output", -1);
|
|
13504
15217
|
res->t_logits = cur;
|
|
@@ -13507,304 +15220,235 @@ struct llm_build_chameleon : public llm_graph_context {
|
|
|
13507
15220
|
}
|
|
13508
15221
|
};
|
|
13509
15222
|
|
|
13510
|
-
struct
|
|
13511
|
-
|
|
15223
|
+
struct llm_build_ernie4_5 : public llm_graph_context {
|
|
15224
|
+
llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
15225
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
15226
|
+
|
|
15227
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
15228
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
15229
|
+
|
|
13512
15230
|
ggml_tensor * cur;
|
|
13513
15231
|
ggml_tensor * inpL;
|
|
13514
15232
|
|
|
13515
15233
|
inpL = build_inp_embd(model.tok_embd);
|
|
13516
15234
|
|
|
13517
|
-
|
|
13518
|
-
|
|
13519
|
-
cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1);
|
|
13520
|
-
cur = ggml_add(ctx0, cur, model.conv1d_b);
|
|
13521
|
-
|
|
13522
|
-
// posnet
|
|
13523
|
-
for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
|
|
13524
|
-
const auto & layer = model.layers[il].posnet;
|
|
13525
|
-
|
|
13526
|
-
inpL = cur;
|
|
13527
|
-
|
|
13528
|
-
switch (il) {
|
|
13529
|
-
case 0:
|
|
13530
|
-
case 1:
|
|
13531
|
-
case 3:
|
|
13532
|
-
case 4:
|
|
13533
|
-
{
|
|
13534
|
-
cur = build_norm(cur,
|
|
13535
|
-
layer.norm1,
|
|
13536
|
-
layer.norm1_b,
|
|
13537
|
-
LLM_NORM_GROUP, 0);
|
|
13538
|
-
|
|
13539
|
-
cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
|
|
13540
|
-
|
|
13541
|
-
cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1);
|
|
13542
|
-
cur = ggml_add(ctx0, cur, layer.conv1_b);
|
|
13543
|
-
|
|
13544
|
-
cur = build_norm(cur,
|
|
13545
|
-
layer.norm2,
|
|
13546
|
-
layer.norm2_b,
|
|
13547
|
-
LLM_NORM_GROUP, 0);
|
|
13548
|
-
|
|
13549
|
-
cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
|
|
13550
|
-
|
|
13551
|
-
cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1);
|
|
13552
|
-
cur = ggml_add(ctx0, cur, layer.conv2_b);
|
|
13553
|
-
|
|
13554
|
-
cur = ggml_add(ctx0, cur, inpL);
|
|
13555
|
-
} break;
|
|
13556
|
-
case 2:
|
|
13557
|
-
{
|
|
13558
|
-
cur = build_norm(cur,
|
|
13559
|
-
layer.attn_norm,
|
|
13560
|
-
layer.attn_norm_b,
|
|
13561
|
-
LLM_NORM_GROUP, 0);
|
|
13562
|
-
|
|
13563
|
-
ggml_tensor * q;
|
|
13564
|
-
ggml_tensor * k;
|
|
13565
|
-
ggml_tensor * v;
|
|
13566
|
-
|
|
13567
|
-
q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1);
|
|
13568
|
-
k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
|
|
13569
|
-
v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1);
|
|
13570
|
-
|
|
13571
|
-
q = ggml_add(ctx0, q, layer.attn_q_b);
|
|
13572
|
-
k = ggml_add(ctx0, k, layer.attn_k_b);
|
|
13573
|
-
v = ggml_add(ctx0, v, layer.attn_v_b);
|
|
13574
|
-
|
|
13575
|
-
q = ggml_cont(ctx0, ggml_transpose(ctx0, q));
|
|
13576
|
-
k = ggml_cont(ctx0, ggml_transpose(ctx0, k));
|
|
15235
|
+
// inp_pos - contains the positions
|
|
15236
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
13577
15237
|
|
|
13578
|
-
|
|
15238
|
+
auto * inp_attn = build_attn_inp_kv_unified();
|
|
13579
15239
|
|
|
13580
|
-
|
|
15240
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
15241
|
+
ggml_tensor * inpSA = inpL;
|
|
13581
15242
|
|
|
13582
|
-
|
|
15243
|
+
// norm
|
|
15244
|
+
{
|
|
15245
|
+
cur = build_norm(inpL,
|
|
15246
|
+
model.layers[il].attn_norm, NULL,
|
|
15247
|
+
LLM_NORM_RMS, il);
|
|
15248
|
+
cb(cur, "attn_norm", il);
|
|
15249
|
+
}
|
|
13583
15250
|
|
|
13584
|
-
|
|
13585
|
-
|
|
15251
|
+
// self-attention
|
|
15252
|
+
{
|
|
15253
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
15254
|
+
cb(Qcur, "Qcur", il);
|
|
15255
|
+
if (model.layers[il].bq) {
|
|
15256
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
15257
|
+
cb(Qcur, "Qcur", il);
|
|
15258
|
+
}
|
|
13586
15259
|
|
|
13587
|
-
|
|
13588
|
-
|
|
13589
|
-
|
|
13590
|
-
|
|
13591
|
-
|
|
13592
|
-
|
|
13593
|
-
layer.norm_b,
|
|
13594
|
-
LLM_NORM_GROUP, 0);
|
|
13595
|
-
} break;
|
|
13596
|
-
default: GGML_ABORT("unknown posnet layer");
|
|
13597
|
-
};
|
|
13598
|
-
}
|
|
15260
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
15261
|
+
cb(Kcur, "Kcur", il);
|
|
15262
|
+
if (model.layers[il].bk) {
|
|
15263
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
15264
|
+
cb(Kcur, "Kcur", il);
|
|
15265
|
+
}
|
|
13599
15266
|
|
|
13600
|
-
|
|
15267
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
15268
|
+
cb(Vcur, "Vcur", il);
|
|
15269
|
+
if (model.layers[il].bv) {
|
|
15270
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
15271
|
+
cb(Vcur, "Vcur", il);
|
|
15272
|
+
}
|
|
13601
15273
|
|
|
13602
|
-
|
|
13603
|
-
|
|
13604
|
-
|
|
13605
|
-
LLM_NORM, -1);
|
|
15274
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
15275
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
15276
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
13606
15277
|
|
|
13607
|
-
|
|
15278
|
+
Qcur = ggml_rope_ext(
|
|
15279
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
15280
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
15281
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
15282
|
+
);
|
|
13608
15283
|
|
|
13609
|
-
|
|
15284
|
+
Kcur = ggml_rope_ext(
|
|
15285
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
15286
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
15287
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
15288
|
+
);
|
|
13610
15289
|
|
|
13611
|
-
|
|
13612
|
-
|
|
13613
|
-
|
|
15290
|
+
cb(Qcur, "Qcur", il);
|
|
15291
|
+
cb(Kcur, "Kcur", il);
|
|
15292
|
+
cb(Vcur, "Vcur", il);
|
|
13614
15293
|
|
|
13615
|
-
|
|
15294
|
+
cur = build_attn(inp_attn, gf,
|
|
15295
|
+
model.layers[il].wo, NULL,
|
|
15296
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
15297
|
+
}
|
|
13616
15298
|
|
|
13617
|
-
|
|
13618
|
-
|
|
15299
|
+
if (il == n_layer - 1) {
|
|
15300
|
+
// skip computing output for unused tokens
|
|
15301
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
15302
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
15303
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
15304
|
+
}
|
|
13619
15305
|
|
|
13620
|
-
|
|
15306
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
15307
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
13621
15308
|
|
|
13622
|
-
|
|
13623
|
-
|
|
13624
|
-
|
|
13625
|
-
|
|
15309
|
+
// feed-forward network
|
|
15310
|
+
{
|
|
15311
|
+
cur = build_norm(ffn_inp,
|
|
15312
|
+
model.layers[il].ffn_norm, NULL,
|
|
15313
|
+
LLM_NORM_RMS, il);
|
|
15314
|
+
cb(cur, "ffn_norm", il);
|
|
13626
15315
|
|
|
13627
|
-
|
|
13628
|
-
|
|
13629
|
-
|
|
13630
|
-
|
|
13631
|
-
|
|
13632
|
-
|
|
15316
|
+
cur = build_ffn(cur,
|
|
15317
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
15318
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
|
15319
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
15320
|
+
NULL,
|
|
15321
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
15322
|
+
cb(cur, "ffn_out", il);
|
|
15323
|
+
}
|
|
13633
15324
|
|
|
13634
|
-
cur =
|
|
15325
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
13635
15326
|
|
|
13636
|
-
cur =
|
|
15327
|
+
cur = build_cvec(cur, il);
|
|
15328
|
+
cb(cur, "l_out", il);
|
|
13637
15329
|
|
|
13638
|
-
|
|
15330
|
+
// input for next layer
|
|
15331
|
+
inpL = cur;
|
|
13639
15332
|
}
|
|
13640
15333
|
|
|
13641
15334
|
cur = inpL;
|
|
13642
15335
|
|
|
13643
|
-
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
|
|
13644
|
-
|
|
13645
15336
|
cur = build_norm(cur,
|
|
13646
|
-
model.output_norm,
|
|
13647
|
-
|
|
13648
|
-
|
|
15337
|
+
model.output_norm, NULL,
|
|
15338
|
+
LLM_NORM_RMS, -1);
|
|
15339
|
+
|
|
15340
|
+
cb(cur, "result_norm", -1);
|
|
15341
|
+
res->t_embd = cur;
|
|
13649
15342
|
|
|
13650
15343
|
// lm_head
|
|
13651
15344
|
cur = build_lora_mm(model.output, cur);
|
|
13652
15345
|
|
|
13653
|
-
cur
|
|
13654
|
-
|
|
13655
|
-
cb(cur, "result_embd", -1);
|
|
13656
|
-
res->t_embd = cur;
|
|
15346
|
+
cb(cur, "result_output", -1);
|
|
15347
|
+
res->t_logits = cur;
|
|
13657
15348
|
|
|
13658
15349
|
ggml_build_forward_expand(gf, cur);
|
|
13659
15350
|
}
|
|
13660
15351
|
};
|
|
13661
15352
|
|
|
13662
|
-
struct
|
|
13663
|
-
|
|
13664
|
-
const
|
|
13665
|
-
|
|
13666
|
-
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
|
|
13667
|
-
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
|
|
13668
|
-
const uint32_t kv_lora_rank = hparams.n_lora_kv;
|
|
15353
|
+
struct llm_build_falcon_h1 : public llm_graph_context_mamba {
|
|
15354
|
+
llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) {
|
|
15355
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
13669
15356
|
|
|
13670
15357
|
ggml_tensor * cur;
|
|
13671
15358
|
ggml_tensor * inpL;
|
|
13672
15359
|
|
|
13673
|
-
// {n_embd, n_tokens}
|
|
13674
15360
|
inpL = build_inp_embd(model.tok_embd);
|
|
13675
15361
|
|
|
13676
15362
|
// inp_pos - contains the positions
|
|
13677
15363
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
13678
15364
|
|
|
13679
|
-
|
|
15365
|
+
// Build the inputs in the recurrent & kv cache
|
|
15366
|
+
auto * inp = build_inp_mem_hybrid();
|
|
15367
|
+
|
|
15368
|
+
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
13680
15369
|
|
|
13681
15370
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13682
15371
|
|
|
13683
15372
|
for (int il = 0; il < n_layer; ++il) {
|
|
13684
15373
|
ggml_tensor * inpSA = inpL;
|
|
13685
15374
|
|
|
13686
|
-
// norm
|
|
13687
15375
|
cur = build_norm(inpL,
|
|
13688
15376
|
model.layers[il].attn_norm, NULL,
|
|
13689
15377
|
LLM_NORM_RMS, il);
|
|
13690
15378
|
cb(cur, "attn_norm", il);
|
|
13691
15379
|
|
|
13692
|
-
//
|
|
13693
|
-
|
|
13694
|
-
|
|
13695
|
-
q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
|
13696
|
-
cb(q, "q", il);
|
|
13697
|
-
|
|
13698
|
-
// split into {n_head * n_embd_head_qk_nope, n_tokens}
|
|
13699
|
-
ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
|
|
13700
|
-
ggml_row_size(q->type, hparams.n_embd_head_k),
|
|
13701
|
-
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
|
|
13702
|
-
0);
|
|
13703
|
-
cb(q_nope, "q_nope", il);
|
|
13704
|
-
|
|
13705
|
-
// and {n_head * n_embd_head_qk_rope, n_tokens}
|
|
13706
|
-
ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
|
|
13707
|
-
ggml_row_size(q->type, hparams.n_embd_head_k),
|
|
13708
|
-
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
|
|
13709
|
-
ggml_row_size(q->type, n_embd_head_qk_nope));
|
|
13710
|
-
cb(q_pe, "q_pe", il);
|
|
13711
|
-
|
|
13712
|
-
// {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
|
|
13713
|
-
ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
|
|
13714
|
-
cb(kv_pe_compresseed, "kv_pe_compresseed", il);
|
|
13715
|
-
|
|
13716
|
-
// split into {kv_lora_rank, n_tokens}
|
|
13717
|
-
ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
|
|
13718
|
-
kv_pe_compresseed->nb[1],
|
|
13719
|
-
0);
|
|
13720
|
-
cb(kv_compressed, "kv_compressed", il);
|
|
13721
|
-
|
|
13722
|
-
// and {n_embd_head_qk_rope, n_tokens}
|
|
13723
|
-
ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
|
|
13724
|
-
kv_pe_compresseed->nb[1],
|
|
13725
|
-
kv_pe_compresseed->nb[1],
|
|
13726
|
-
ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
|
|
13727
|
-
cb(k_pe, "k_pe", il);
|
|
15380
|
+
// self-attention
|
|
15381
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
15382
|
+
cb(Qcur, "Qcur", il);
|
|
13728
15383
|
|
|
13729
|
-
|
|
13730
|
-
|
|
13731
|
-
LLM_NORM_RMS, il);
|
|
13732
|
-
cb(kv_compressed, "kv_compressed", il);
|
|
15384
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
15385
|
+
cb(Kcur, "Kcur", il);
|
|
13733
15386
|
|
|
13734
|
-
|
|
13735
|
-
|
|
13736
|
-
cb(kv, "kv", il);
|
|
15387
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
15388
|
+
cb(Vcur, "Vcur", il);
|
|
13737
15389
|
|
|
13738
|
-
|
|
13739
|
-
|
|
13740
|
-
ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
|
|
13741
|
-
ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
|
|
13742
|
-
0);
|
|
13743
|
-
cb(k_nope, "k_nope", il);
|
|
15390
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
15391
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
13744
15392
|
|
|
13745
|
-
|
|
13746
|
-
ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
|
|
13747
|
-
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
|
|
13748
|
-
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
|
|
13749
|
-
ggml_row_size(kv->type, (n_embd_head_qk_nope)));
|
|
13750
|
-
cb(v_states, "v_states", il);
|
|
15393
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
13751
15394
|
|
|
13752
|
-
|
|
13753
|
-
|
|
15395
|
+
Qcur = ggml_rope_ext(
|
|
15396
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
15397
|
+
n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
15398
|
+
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
13754
15399
|
|
|
13755
|
-
|
|
13756
|
-
|
|
13757
|
-
|
|
13758
|
-
|
|
15400
|
+
Kcur = ggml_rope_ext(
|
|
15401
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
15402
|
+
n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
15403
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
15404
|
+
);
|
|
13759
15405
|
|
|
13760
|
-
|
|
13761
|
-
|
|
13762
|
-
|
|
13763
|
-
ext_factor, attn_factor, beta_fast, beta_slow
|
|
13764
|
-
);
|
|
13765
|
-
cb(q_pe, "q_pe", il);
|
|
15406
|
+
cb(Qcur, "Qcur-post-rope", il);
|
|
15407
|
+
cb(Kcur, "Kcur-post-rope", il);
|
|
15408
|
+
cb(Vcur, "Vcur-post-rope", il);
|
|
13766
15409
|
|
|
13767
|
-
|
|
13768
|
-
|
|
13769
|
-
|
|
13770
|
-
|
|
13771
|
-
ext_factor, attn_factor, beta_fast, beta_slow
|
|
13772
|
-
);
|
|
13773
|
-
cb(k_pe, "k_pe", il);
|
|
15410
|
+
ggml_tensor * attn_out = build_attn(inp->get_attn(), gf,
|
|
15411
|
+
model.layers[il].wo, NULL,
|
|
15412
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
15413
|
+
cb(attn_out, "attn_out", il);
|
|
13774
15414
|
|
|
13775
|
-
|
|
13776
|
-
|
|
15415
|
+
cur = build_norm(inpL,
|
|
15416
|
+
model.layers[il].attn_norm, NULL,
|
|
15417
|
+
LLM_NORM_RMS, il);
|
|
15418
|
+
// Mamba2 layer
|
|
15419
|
+
cb(cur, "ssm_in", il);
|
|
13777
15420
|
|
|
13778
|
-
|
|
13779
|
-
|
|
15421
|
+
ggml_tensor * ssm_out = build_mamba2_layer(inp->get_recr(), gf, cur, model, ubatch, il);
|
|
15422
|
+
cb(ssm_out, "ssm_out", il);
|
|
13780
15423
|
|
|
13781
|
-
|
|
13782
|
-
|
|
13783
|
-
|
|
13784
|
-
|
|
15424
|
+
// // Aggregation
|
|
15425
|
+
cur = ggml_add(ctx0, attn_out, ssm_out);
|
|
15426
|
+
inpSA = ggml_add(ctx0, cur, inpSA);
|
|
15427
|
+
cb(cur, "layer_out", il);
|
|
13785
15428
|
|
|
13786
15429
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
13787
15430
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
13788
15431
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
13789
15432
|
}
|
|
13790
15433
|
|
|
13791
|
-
ggml_tensor * ffn_inp =
|
|
15434
|
+
ggml_tensor * ffn_inp = inpSA;
|
|
13792
15435
|
cb(ffn_inp, "ffn_inp", il);
|
|
13793
15436
|
|
|
15437
|
+
// feed-forward network
|
|
13794
15438
|
cur = build_norm(ffn_inp,
|
|
13795
15439
|
model.layers[il].ffn_norm, NULL,
|
|
13796
15440
|
LLM_NORM_RMS, il);
|
|
13797
15441
|
cb(cur, "ffn_norm", il);
|
|
13798
15442
|
|
|
13799
15443
|
cur = build_ffn(cur,
|
|
13800
|
-
model.layers[il].ffn_up,
|
|
13801
|
-
|
|
13802
|
-
model.layers[il].ffn_down,
|
|
15444
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
|
15445
|
+
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
|
|
15446
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
|
13803
15447
|
NULL,
|
|
13804
|
-
|
|
15448
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
13805
15449
|
cb(cur, "ffn_out", il);
|
|
13806
15450
|
|
|
13807
|
-
cur = ggml_add(ctx0, cur,
|
|
15451
|
+
cur = ggml_add(ctx0, cur, inpSA);
|
|
13808
15452
|
|
|
13809
15453
|
cur = build_cvec(cur, il);
|
|
13810
15454
|
cb(cur, "l_out", il);
|
|
@@ -13822,6 +15466,7 @@ struct llm_build_plm : public llm_graph_context {
|
|
|
13822
15466
|
cb(cur, "result_norm", -1);
|
|
13823
15467
|
res->t_embd = cur;
|
|
13824
15468
|
|
|
15469
|
+
// lm_head
|
|
13825
15470
|
cur = build_lora_mm(model.output, cur);
|
|
13826
15471
|
|
|
13827
15472
|
cb(cur, "result_output", -1);
|
|
@@ -13831,8 +15476,13 @@ struct llm_build_plm : public llm_graph_context {
|
|
|
13831
15476
|
}
|
|
13832
15477
|
};
|
|
13833
15478
|
|
|
13834
|
-
struct
|
|
13835
|
-
|
|
15479
|
+
struct llm_build_arcee : public llm_graph_context {
|
|
15480
|
+
llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
15481
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
15482
|
+
|
|
15483
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
15484
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
15485
|
+
|
|
13836
15486
|
ggml_tensor * cur;
|
|
13837
15487
|
ggml_tensor * inpL;
|
|
13838
15488
|
|
|
@@ -13843,6 +15493,8 @@ struct llm_build_bailingmoe : public llm_graph_context {
|
|
|
13843
15493
|
|
|
13844
15494
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
13845
15495
|
|
|
15496
|
+
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
15497
|
+
|
|
13846
15498
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13847
15499
|
|
|
13848
15500
|
for (int il = 0; il < n_layer; ++il) {
|
|
@@ -13881,9 +15533,9 @@ struct llm_build_bailingmoe : public llm_graph_context {
|
|
|
13881
15533
|
cb(Vcur, "Vcur", il);
|
|
13882
15534
|
}
|
|
13883
15535
|
|
|
13884
|
-
Qcur = ggml_reshape_3d(ctx0, Qcur,
|
|
13885
|
-
Kcur = ggml_reshape_3d(ctx0, Kcur,
|
|
13886
|
-
Vcur = ggml_reshape_3d(ctx0, Vcur,
|
|
15536
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
15537
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
15538
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
13887
15539
|
|
|
13888
15540
|
Qcur = ggml_rope_ext(
|
|
13889
15541
|
ctx0, Qcur, inp_pos, rope_factors,
|
|
@@ -13903,7 +15555,8 @@ struct llm_build_bailingmoe : public llm_graph_context {
|
|
|
13903
15555
|
|
|
13904
15556
|
cur = build_attn(inp_attn, gf,
|
|
13905
15557
|
model.layers[il].wo, model.layers[il].bo,
|
|
13906
|
-
Qcur, Kcur, Vcur, nullptr, nullptr,
|
|
15558
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
15559
|
+
cb(cur, "attn_out", il);
|
|
13907
15560
|
}
|
|
13908
15561
|
|
|
13909
15562
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -13914,40 +15567,23 @@ struct llm_build_bailingmoe : public llm_graph_context {
|
|
|
13914
15567
|
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
13915
15568
|
cb(ffn_inp, "ffn_inp", il);
|
|
13916
15569
|
|
|
15570
|
+
// feed-forward network
|
|
15571
|
+
// ARCEE uses relu^2 instead of silu
|
|
13917
15572
|
cur = build_norm(ffn_inp,
|
|
13918
15573
|
model.layers[il].ffn_norm, NULL,
|
|
13919
15574
|
LLM_NORM_RMS, il);
|
|
13920
15575
|
cb(cur, "ffn_norm", il);
|
|
13921
15576
|
|
|
13922
|
-
|
|
13923
|
-
|
|
13924
|
-
|
|
13925
|
-
|
|
13926
|
-
|
|
13927
|
-
|
|
13928
|
-
|
|
13929
|
-
n_expert, n_expert_used,
|
|
13930
|
-
LLM_FFN_SILU, hparams.expert_weights_norm,
|
|
13931
|
-
false, hparams.expert_weights_scale,
|
|
13932
|
-
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
|
13933
|
-
il);
|
|
13934
|
-
cb(moe_out, "ffn_moe_out", il);
|
|
13935
|
-
|
|
13936
|
-
// FFN shared expert
|
|
13937
|
-
{
|
|
13938
|
-
ggml_tensor * ffn_shexp = build_ffn(cur,
|
|
13939
|
-
model.layers[il].ffn_up_shexp, NULL, NULL,
|
|
13940
|
-
model.layers[il].ffn_gate_shexp, NULL, NULL,
|
|
13941
|
-
model.layers[il].ffn_down_shexp, NULL, NULL,
|
|
13942
|
-
NULL,
|
|
13943
|
-
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
13944
|
-
cb(ffn_shexp, "ffn_shexp", il);
|
|
13945
|
-
|
|
13946
|
-
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
|
13947
|
-
cb(cur, "ffn_out", il);
|
|
13948
|
-
}
|
|
15577
|
+
cur = build_ffn(cur,
|
|
15578
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
15579
|
+
NULL, NULL, NULL,
|
|
15580
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
15581
|
+
NULL,
|
|
15582
|
+
LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
|
|
15583
|
+
cb(cur, "ffn_out", il);
|
|
13949
15584
|
|
|
13950
15585
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
15586
|
+
cb(cur, "ffn_out", il);
|
|
13951
15587
|
|
|
13952
15588
|
cur = build_cvec(cur, il);
|
|
13953
15589
|
cb(cur, "l_out", il);
|
|
@@ -13975,8 +15611,8 @@ struct llm_build_bailingmoe : public llm_graph_context {
|
|
|
13975
15611
|
}
|
|
13976
15612
|
};
|
|
13977
15613
|
|
|
13978
|
-
struct
|
|
13979
|
-
|
|
15614
|
+
struct llm_build_hunyuan_moe : public llm_graph_context {
|
|
15615
|
+
llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
13980
15616
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
13981
15617
|
|
|
13982
15618
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -13992,6 +15628,8 @@ struct llm_build_dots1 : public llm_graph_context {
|
|
|
13992
15628
|
|
|
13993
15629
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
13994
15630
|
|
|
15631
|
+
const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
|
|
15632
|
+
|
|
13995
15633
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13996
15634
|
|
|
13997
15635
|
for (int il = 0; il < n_layer; ++il) {
|
|
@@ -14003,47 +15641,67 @@ struct llm_build_dots1 : public llm_graph_context {
|
|
|
14003
15641
|
LLM_NORM_RMS, il);
|
|
14004
15642
|
cb(cur, "attn_norm", il);
|
|
14005
15643
|
|
|
14006
|
-
//
|
|
15644
|
+
// self-attention
|
|
14007
15645
|
{
|
|
15646
|
+
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
|
15647
|
+
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
|
15648
|
+
|
|
14008
15649
|
// compute Q and K and RoPE them
|
|
14009
15650
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
14010
15651
|
cb(Qcur, "Qcur", il);
|
|
15652
|
+
if (model.layers[il].bq) {
|
|
15653
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
15654
|
+
cb(Qcur, "Qcur", il);
|
|
15655
|
+
}
|
|
14011
15656
|
|
|
14012
15657
|
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
14013
15658
|
cb(Kcur, "Kcur", il);
|
|
15659
|
+
if (model.layers[il].bk) {
|
|
15660
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
15661
|
+
cb(Kcur, "Kcur", il);
|
|
15662
|
+
}
|
|
14014
15663
|
|
|
14015
15664
|
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
14016
15665
|
cb(Vcur, "Vcur", il);
|
|
15666
|
+
if (model.layers[il].bv) {
|
|
15667
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
15668
|
+
cb(Vcur, "Vcur", il);
|
|
15669
|
+
}
|
|
14017
15670
|
|
|
14018
15671
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
14019
15672
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
14020
15673
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
14021
15674
|
|
|
14022
|
-
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
|
14023
|
-
cb(Qcur, "Qcur_normed", il);
|
|
14024
|
-
|
|
14025
15675
|
Qcur = ggml_rope_ext(
|
|
14026
|
-
ctx0, Qcur, inp_pos,
|
|
15676
|
+
ctx0, Qcur, inp_pos, rope_factors,
|
|
14027
15677
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
14028
15678
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
14029
15679
|
);
|
|
14030
15680
|
|
|
14031
|
-
|
|
14032
|
-
cb(Kcur, "
|
|
15681
|
+
cb(Qcur, "Qcur", il);
|
|
15682
|
+
cb(Kcur, "Kcur", il);
|
|
15683
|
+
cb(Vcur, "Vcur", il);
|
|
14033
15684
|
|
|
14034
15685
|
Kcur = ggml_rope_ext(
|
|
14035
|
-
ctx0, Kcur, inp_pos,
|
|
15686
|
+
ctx0, Kcur, inp_pos, rope_factors,
|
|
14036
15687
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
14037
15688
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
14038
15689
|
);
|
|
14039
15690
|
|
|
14040
|
-
|
|
14041
|
-
|
|
14042
|
-
|
|
15691
|
+
Kcur = build_norm(Kcur,
|
|
15692
|
+
model.layers[il].attn_k_norm, nullptr,
|
|
15693
|
+
LLM_NORM_RMS, il);
|
|
15694
|
+
cb(Kcur, "Kcur_norm", il);
|
|
15695
|
+
|
|
15696
|
+
Qcur = build_norm(Qcur,
|
|
15697
|
+
model.layers[il].attn_q_norm, nullptr,
|
|
15698
|
+
LLM_NORM_RMS, il);
|
|
15699
|
+
cb(Qcur, "Qcur_norm", il);
|
|
14043
15700
|
|
|
14044
15701
|
cur = build_attn(inp_attn, gf,
|
|
14045
15702
|
model.layers[il].wo, model.layers[il].bo,
|
|
14046
|
-
Qcur, Kcur, Vcur, nullptr, nullptr,
|
|
15703
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
15704
|
+
cb(cur, "attn_out", il);
|
|
14047
15705
|
}
|
|
14048
15706
|
|
|
14049
15707
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -14054,50 +15712,40 @@ struct llm_build_dots1 : public llm_graph_context {
|
|
|
14054
15712
|
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
14055
15713
|
cb(ffn_inp, "ffn_inp", il);
|
|
14056
15714
|
|
|
14057
|
-
// MoE branch
|
|
14058
15715
|
cur = build_norm(ffn_inp,
|
|
14059
|
-
|
|
14060
|
-
|
|
15716
|
+
model.layers[il].ffn_norm, NULL,
|
|
15717
|
+
LLM_NORM_RMS, il);
|
|
14061
15718
|
cb(cur, "ffn_norm", il);
|
|
14062
15719
|
|
|
14063
|
-
|
|
14064
|
-
|
|
14065
|
-
|
|
14066
|
-
|
|
14067
|
-
|
|
14068
|
-
|
|
14069
|
-
|
|
14070
|
-
|
|
14071
|
-
} else {
|
|
14072
|
-
ggml_tensor * moe_out =
|
|
14073
|
-
build_moe_ffn(cur,
|
|
14074
|
-
model.layers[il].ffn_gate_inp,
|
|
14075
|
-
model.layers[il].ffn_up_exps,
|
|
14076
|
-
model.layers[il].ffn_gate_exps,
|
|
14077
|
-
model.layers[il].ffn_down_exps,
|
|
14078
|
-
model.layers[il].ffn_exp_probs_b,
|
|
14079
|
-
n_expert, n_expert_used,
|
|
14080
|
-
LLM_FFN_SILU, hparams.expert_weights_norm,
|
|
14081
|
-
true, hparams.expert_weights_scale,
|
|
14082
|
-
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
|
14083
|
-
il);
|
|
14084
|
-
cb(moe_out, "ffn_moe_out", il);
|
|
15720
|
+
// feed-forward network (non-MoE)
|
|
15721
|
+
ggml_tensor * cur_mlp = build_ffn(cur,
|
|
15722
|
+
model.layers[il].ffn_up_shexp, NULL, NULL,
|
|
15723
|
+
model.layers[il].ffn_gate_shexp, NULL, NULL,
|
|
15724
|
+
model.layers[il].ffn_down_shexp, NULL, NULL,
|
|
15725
|
+
NULL,
|
|
15726
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
15727
|
+
cb(cur_mlp, "ffn_mlp", il);
|
|
14085
15728
|
|
|
14086
|
-
|
|
14087
|
-
|
|
14088
|
-
|
|
14089
|
-
|
|
14090
|
-
|
|
14091
|
-
|
|
14092
|
-
|
|
14093
|
-
|
|
15729
|
+
// MoE branch
|
|
15730
|
+
ggml_tensor * cur_moe = build_moe_ffn(cur,
|
|
15731
|
+
model.layers[il].ffn_gate_inp,
|
|
15732
|
+
model.layers[il].ffn_up_exps,
|
|
15733
|
+
model.layers[il].ffn_gate_exps,
|
|
15734
|
+
model.layers[il].ffn_down_exps,
|
|
15735
|
+
nullptr,
|
|
15736
|
+
n_expert, n_expert_used,
|
|
15737
|
+
LLM_FFN_SILU,
|
|
15738
|
+
true, // norm_topk_prob
|
|
15739
|
+
false,
|
|
15740
|
+
0.0,
|
|
15741
|
+
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
|
15742
|
+
il);
|
|
15743
|
+
cb(cur_moe, "ffn_moe_out", il);
|
|
14094
15744
|
|
|
14095
|
-
|
|
14096
|
-
|
|
14097
|
-
}
|
|
14098
|
-
}
|
|
15745
|
+
ggml_tensor * ffn_out = ggml_add(ctx0, cur_moe, cur_mlp);
|
|
15746
|
+
cb(ffn_out, "ffn_out", il);
|
|
14099
15747
|
|
|
14100
|
-
cur = ggml_add(ctx0,
|
|
15748
|
+
cur = ggml_add(ctx0, ffn_out, ffn_inp);
|
|
14101
15749
|
|
|
14102
15750
|
cur = build_cvec(cur, il);
|
|
14103
15751
|
cb(cur, "l_out", il);
|
|
@@ -14117,7 +15765,6 @@ struct llm_build_dots1 : public llm_graph_context {
|
|
|
14117
15765
|
|
|
14118
15766
|
// lm_head
|
|
14119
15767
|
cur = build_lora_mm(model.output, cur);
|
|
14120
|
-
|
|
14121
15768
|
cb(cur, "result_output", -1);
|
|
14122
15769
|
res->t_logits = cur;
|
|
14123
15770
|
|
|
@@ -14125,8 +15772,8 @@ struct llm_build_dots1 : public llm_graph_context {
|
|
|
14125
15772
|
}
|
|
14126
15773
|
};
|
|
14127
15774
|
|
|
14128
|
-
struct
|
|
14129
|
-
|
|
15775
|
+
struct llm_build_smollm3 : public llm_graph_context {
|
|
15776
|
+
llm_build_smollm3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
14130
15777
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
14131
15778
|
|
|
14132
15779
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -14149,6 +15796,8 @@ struct llm_build_arcee : public llm_graph_context {
|
|
|
14149
15796
|
for (int il = 0; il < n_layer; ++il) {
|
|
14150
15797
|
ggml_tensor * inpSA = inpL;
|
|
14151
15798
|
|
|
15799
|
+
const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
|
|
15800
|
+
|
|
14152
15801
|
// norm
|
|
14153
15802
|
cur = build_norm(inpL,
|
|
14154
15803
|
model.layers[il].attn_norm, NULL,
|
|
@@ -14157,9 +15806,6 @@ struct llm_build_arcee : public llm_graph_context {
|
|
|
14157
15806
|
|
|
14158
15807
|
// self-attention
|
|
14159
15808
|
{
|
|
14160
|
-
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
|
14161
|
-
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
|
14162
|
-
|
|
14163
15809
|
// compute Q and K and RoPE them
|
|
14164
15810
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
14165
15811
|
cb(Qcur, "Qcur", il);
|
|
@@ -14186,17 +15832,19 @@ struct llm_build_arcee : public llm_graph_context {
|
|
|
14186
15832
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
14187
15833
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
14188
15834
|
|
|
14189
|
-
|
|
14190
|
-
|
|
14191
|
-
|
|
14192
|
-
|
|
14193
|
-
|
|
15835
|
+
if (use_rope) {
|
|
15836
|
+
Qcur = ggml_rope_ext(
|
|
15837
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
15838
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
15839
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
15840
|
+
);
|
|
14194
15841
|
|
|
14195
|
-
|
|
14196
|
-
|
|
14197
|
-
|
|
14198
|
-
|
|
14199
|
-
|
|
15842
|
+
Kcur = ggml_rope_ext(
|
|
15843
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
15844
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
15845
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
15846
|
+
);
|
|
15847
|
+
}
|
|
14200
15848
|
|
|
14201
15849
|
cb(Qcur, "Qcur", il);
|
|
14202
15850
|
cb(Kcur, "Kcur", il);
|
|
@@ -14217,19 +15865,20 @@ struct llm_build_arcee : public llm_graph_context {
|
|
|
14217
15865
|
cb(ffn_inp, "ffn_inp", il);
|
|
14218
15866
|
|
|
14219
15867
|
// feed-forward network
|
|
14220
|
-
|
|
14221
|
-
|
|
14222
|
-
|
|
14223
|
-
|
|
14224
|
-
|
|
15868
|
+
{
|
|
15869
|
+
cur = build_norm(ffn_inp,
|
|
15870
|
+
model.layers[il].ffn_norm, NULL,
|
|
15871
|
+
LLM_NORM_RMS, il);
|
|
15872
|
+
cb(cur, "ffn_norm", il);
|
|
14225
15873
|
|
|
14226
|
-
|
|
14227
|
-
|
|
14228
|
-
|
|
14229
|
-
|
|
14230
|
-
|
|
14231
|
-
|
|
14232
|
-
|
|
15874
|
+
cur = build_ffn(cur,
|
|
15875
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
|
15876
|
+
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
|
|
15877
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
|
15878
|
+
NULL,
|
|
15879
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
15880
|
+
cb(cur, "ffn_out", il);
|
|
15881
|
+
}
|
|
14233
15882
|
|
|
14234
15883
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
14235
15884
|
cb(cur, "ffn_out", il);
|
|
@@ -14260,6 +15909,163 @@ struct llm_build_arcee : public llm_graph_context {
|
|
|
14260
15909
|
}
|
|
14261
15910
|
};
|
|
14262
15911
|
|
|
15912
|
+
struct llm_build_lfm2 : public llm_graph_context {
|
|
15913
|
+
const llama_model & model;
|
|
15914
|
+
|
|
15915
|
+
llm_build_lfm2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) {
|
|
15916
|
+
|
|
15917
|
+
ggml_tensor * cur = build_inp_embd(model.tok_embd);
|
|
15918
|
+
cb(cur, "model.embed_tokens", -1);
|
|
15919
|
+
|
|
15920
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
15921
|
+
auto * inp_hybrid = build_inp_mem_hybrid();
|
|
15922
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
15923
|
+
|
|
15924
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
15925
|
+
auto * prev_cur = cur;
|
|
15926
|
+
cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
|
15927
|
+
cb(cur, "model.layers.{}.operator_norm", il);
|
|
15928
|
+
|
|
15929
|
+
cur = hparams.is_recurrent(il) ?
|
|
15930
|
+
build_shortconv_block(gf, cur, inp_hybrid->get_recr(), il) :
|
|
15931
|
+
build_attn_block(gf, cur, inp_pos, inp_hybrid->get_attn(), il) ;
|
|
15932
|
+
|
|
15933
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
15934
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
15935
|
+
prev_cur = ggml_get_rows(ctx0, prev_cur, inp_out_ids);
|
|
15936
|
+
}
|
|
15937
|
+
|
|
15938
|
+
cur = ggml_add(ctx0, prev_cur, cur);
|
|
15939
|
+
cur = ggml_add(ctx0, cur, build_feed_forward(cur, il));
|
|
15940
|
+
}
|
|
15941
|
+
|
|
15942
|
+
cur = build_norm(cur, model.tok_norm, NULL, LLM_NORM_RMS, -1);
|
|
15943
|
+
cb(cur, "model.embedding_norm", -1);
|
|
15944
|
+
res->t_embd = cur;
|
|
15945
|
+
|
|
15946
|
+
// lm_head is tied with embeddings
|
|
15947
|
+
cur = build_lora_mm(model.tok_embd, cur);
|
|
15948
|
+
cb(cur, "lm_head", -1);
|
|
15949
|
+
|
|
15950
|
+
res->t_logits = cur;
|
|
15951
|
+
|
|
15952
|
+
ggml_build_forward_expand(gf, cur);
|
|
15953
|
+
}
|
|
15954
|
+
|
|
15955
|
+
ggml_tensor * build_feed_forward(ggml_tensor * cur,
|
|
15956
|
+
int il) const {
|
|
15957
|
+
cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
|
|
15958
|
+
cb(cur, "model.layers.{}.ffn_norm", il);
|
|
15959
|
+
|
|
15960
|
+
GGML_ASSERT(!model.layers[il].ffn_up_b);
|
|
15961
|
+
GGML_ASSERT(!model.layers[il].ffn_gate_b);
|
|
15962
|
+
GGML_ASSERT(!model.layers[il].ffn_down_b);
|
|
15963
|
+
cur = build_ffn(cur,
|
|
15964
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
15965
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
|
15966
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
15967
|
+
NULL,
|
|
15968
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
15969
|
+
cb(cur, "model.layers.{}.feed_forward.w2", il);
|
|
15970
|
+
|
|
15971
|
+
return cur;
|
|
15972
|
+
}
|
|
15973
|
+
|
|
15974
|
+
ggml_tensor * build_attn_block(ggml_cgraph * gf,
|
|
15975
|
+
ggml_tensor * cur,
|
|
15976
|
+
ggml_tensor * inp_pos,
|
|
15977
|
+
llm_graph_input_attn_kv_unified * inp_attn,
|
|
15978
|
+
int il) const {
|
|
15979
|
+
GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
|
|
15980
|
+
auto const n_embd_head = hparams.n_embd_head_v;
|
|
15981
|
+
auto const n_head_kv = hparams.n_head_kv(il);
|
|
15982
|
+
|
|
15983
|
+
auto * q = build_lora_mm(model.layers[il].wq, cur);
|
|
15984
|
+
cb(q, "model.layers.{}.self_attn.q_proj", il);
|
|
15985
|
+
auto * k = build_lora_mm(model.layers[il].wk, cur);
|
|
15986
|
+
cb(k, "model.layers.{}.self_attn.k_proj", il);
|
|
15987
|
+
auto * v = build_lora_mm(model.layers[il].wv, cur);
|
|
15988
|
+
cb(v, "model.layers.{}.self_attn.v_proj", il);
|
|
15989
|
+
|
|
15990
|
+
q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens);
|
|
15991
|
+
k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
|
|
15992
|
+
v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
|
|
15993
|
+
|
|
15994
|
+
// qk norm
|
|
15995
|
+
q = build_norm(q, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
|
15996
|
+
cb(q, "model.layers.{}.self_attn.q_layernorm", il);
|
|
15997
|
+
k = build_norm(k, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
|
15998
|
+
cb(k, "model.layers.{}.self_attn.k_layernorm", il);
|
|
15999
|
+
|
|
16000
|
+
// RoPE
|
|
16001
|
+
q = ggml_rope_ext(
|
|
16002
|
+
ctx0, q, inp_pos, nullptr,
|
|
16003
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
16004
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
16005
|
+
);
|
|
16006
|
+
k = ggml_rope_ext(
|
|
16007
|
+
ctx0, k, inp_pos, nullptr,
|
|
16008
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
16009
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
16010
|
+
);
|
|
16011
|
+
|
|
16012
|
+
cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL,
|
|
16013
|
+
q, k, v, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
16014
|
+
|
|
16015
|
+
cb(cur, "model.layers.{}.self_attn.out_proj", il);
|
|
16016
|
+
|
|
16017
|
+
return cur;
|
|
16018
|
+
}
|
|
16019
|
+
|
|
16020
|
+
ggml_tensor * build_shortconv_block(ggml_cgraph * gf,
|
|
16021
|
+
ggml_tensor * cur,
|
|
16022
|
+
llm_graph_input_rs * inp_recr,
|
|
16023
|
+
int il) {
|
|
16024
|
+
const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx)->get_recr();
|
|
16025
|
+
|
|
16026
|
+
auto * bcx = build_lora_mm(model.layers[il].shortconv.in_proj, cur);
|
|
16027
|
+
cb(bcx, "model.layers.{}.conv.in_proj", il);
|
|
16028
|
+
|
|
16029
|
+
constexpr auto n_chunks = 3;
|
|
16030
|
+
GGML_ASSERT(bcx->ne[0] % n_chunks == 0);
|
|
16031
|
+
auto const chunk_size = bcx->ne[0] / n_chunks;
|
|
16032
|
+
auto * b = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 0 * chunk_size * ggml_element_size(bcx));
|
|
16033
|
+
auto * c = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 1 * chunk_size * ggml_element_size(bcx));
|
|
16034
|
+
auto * x = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 2 * chunk_size * ggml_element_size(bcx));
|
|
16035
|
+
|
|
16036
|
+
auto * bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x));
|
|
16037
|
+
|
|
16038
|
+
// read conv state directly, with build_rs generation is slower
|
|
16039
|
+
ggml_tensor * conv_state = mctx_cur->get_r_l(il);
|
|
16040
|
+
const int64_t n_seqs = ubatch.n_seqs;
|
|
16041
|
+
ggml_tensor * conv = build_rs(inp_recr, gf, conv_state, hparams.n_embd_r(), n_seqs);
|
|
16042
|
+
conv = ggml_reshape_3d(ctx0, conv_state, hparams.n_shortconv_l_cache - 1, hparams.n_embd, n_seqs);
|
|
16043
|
+
|
|
16044
|
+
bx = ggml_concat(ctx0, conv, bx, 0);
|
|
16045
|
+
GGML_ASSERT(bx->ne[0] > conv->ne[0]);
|
|
16046
|
+
|
|
16047
|
+
auto * new_conv = ggml_view_2d(ctx0, bx, conv->ne[0], bx->ne[1], bx->nb[1], (bx->ne[0] - conv->ne[0]) * ggml_element_size(bx));
|
|
16048
|
+
GGML_ASSERT(ggml_are_same_shape(conv, new_conv));
|
|
16049
|
+
|
|
16050
|
+
// write conv state
|
|
16051
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, new_conv, conv_state));
|
|
16052
|
+
|
|
16053
|
+
auto * conv_kernel = model.layers[il].shortconv.conv;
|
|
16054
|
+
GGML_ASSERT(hparams.n_shortconv_l_cache > 0);
|
|
16055
|
+
|
|
16056
|
+
// construct ssm_conv op
|
|
16057
|
+
ggml_tensor * conv_out = ggml_ssm_conv(ctx0, bx, conv_kernel);
|
|
16058
|
+
cb(conv_out, "model.layers.{}.conv.conv", il);
|
|
16059
|
+
|
|
16060
|
+
auto * y = ggml_mul(ctx0, c, conv_out);
|
|
16061
|
+
|
|
16062
|
+
y = build_lora_mm(model.layers[il].shortconv.out_proj, y);
|
|
16063
|
+
cb(y, "model.layers.{}.conv.out_proj", il);
|
|
16064
|
+
|
|
16065
|
+
return y;
|
|
16066
|
+
}
|
|
16067
|
+
};
|
|
16068
|
+
|
|
14263
16069
|
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
|
|
14264
16070
|
llama_memory_i * res;
|
|
14265
16071
|
|
|
@@ -14306,7 +16112,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|
|
14306
16112
|
/* recurrent_type_v */ GGML_TYPE_F32,
|
|
14307
16113
|
/* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
|
|
14308
16114
|
/* n_seq_max */ cparams.n_seq_max,
|
|
14309
|
-
/* offload */ cparams.offload_kqv
|
|
16115
|
+
/* offload */ cparams.offload_kqv,
|
|
16116
|
+
/* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
|
|
16117
|
+
/* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
|
|
14310
16118
|
} else {
|
|
14311
16119
|
const auto padding = llama_kv_cache_unified::get_padding(cparams);
|
|
14312
16120
|
|
|
@@ -14495,9 +16303,14 @@ llm_graph_result_ptr llama_model::build_graph(
|
|
|
14495
16303
|
llm = std::make_unique<llm_build_starcoder2>(*this, params, gf);
|
|
14496
16304
|
} break;
|
|
14497
16305
|
case LLM_ARCH_MAMBA:
|
|
16306
|
+
case LLM_ARCH_MAMBA2:
|
|
14498
16307
|
{
|
|
14499
16308
|
llm = std::make_unique<llm_build_mamba>(*this, params, gf);
|
|
14500
16309
|
} break;
|
|
16310
|
+
case LLM_ARCH_JAMBA:
|
|
16311
|
+
{
|
|
16312
|
+
llm = std::make_unique<llm_build_jamba>(*this, params, gf);
|
|
16313
|
+
} break;
|
|
14501
16314
|
case LLM_ARCH_XVERSE:
|
|
14502
16315
|
{
|
|
14503
16316
|
llm = std::make_unique<llm_build_xverse>(*this, params, gf);
|
|
@@ -14611,6 +16424,10 @@ llm_graph_result_ptr llama_model::build_graph(
|
|
|
14611
16424
|
{
|
|
14612
16425
|
llm = std::make_unique<llm_build_granite>(*this, params, gf);
|
|
14613
16426
|
} break;
|
|
16427
|
+
case LLM_ARCH_GRANITE_HYBRID:
|
|
16428
|
+
{
|
|
16429
|
+
llm = std::make_unique<llm_build_granite_hybrid>(*this, params, gf);
|
|
16430
|
+
} break;
|
|
14614
16431
|
case LLM_ARCH_CHAMELEON:
|
|
14615
16432
|
{
|
|
14616
16433
|
llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
|
|
@@ -14635,6 +16452,26 @@ llm_graph_result_ptr llama_model::build_graph(
|
|
|
14635
16452
|
{
|
|
14636
16453
|
llm = std::make_unique<llm_build_arcee>(*this, params, gf);
|
|
14637
16454
|
} break;
|
|
16455
|
+
case LLM_ARCH_ERNIE4_5:
|
|
16456
|
+
{
|
|
16457
|
+
llm = std::make_unique<llm_build_ernie4_5>(*this, params, gf);
|
|
16458
|
+
} break;
|
|
16459
|
+
case LLM_ARCH_HUNYUAN_MOE:
|
|
16460
|
+
{
|
|
16461
|
+
llm = std::make_unique<llm_build_hunyuan_moe>(*this, params, gf);
|
|
16462
|
+
} break;
|
|
16463
|
+
case LLM_ARCH_SMOLLM3:
|
|
16464
|
+
{
|
|
16465
|
+
llm = std::make_unique<llm_build_smollm3>(*this, params, gf);
|
|
16466
|
+
} break;
|
|
16467
|
+
case LLM_ARCH_FALCON_H1:
|
|
16468
|
+
{
|
|
16469
|
+
llm = std::make_unique<llm_build_falcon_h1>(*this, params, gf);
|
|
16470
|
+
} break;
|
|
16471
|
+
case LLM_ARCH_LFM2:
|
|
16472
|
+
{
|
|
16473
|
+
llm = std::make_unique<llm_build_lfm2>(*this, params, gf);
|
|
16474
|
+
} break;
|
|
14638
16475
|
default:
|
|
14639
16476
|
GGML_ABORT("fatal error");
|
|
14640
16477
|
}
|
|
@@ -14751,6 +16588,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
|
14751
16588
|
case LLM_ARCH_REFACT:
|
|
14752
16589
|
case LLM_ARCH_BLOOM:
|
|
14753
16590
|
case LLM_ARCH_MAMBA:
|
|
16591
|
+
case LLM_ARCH_MAMBA2:
|
|
16592
|
+
case LLM_ARCH_JAMBA:
|
|
14754
16593
|
case LLM_ARCH_JINA_BERT_V2:
|
|
14755
16594
|
case LLM_ARCH_T5:
|
|
14756
16595
|
case LLM_ARCH_T5ENCODER:
|
|
@@ -14782,14 +16621,18 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
|
14782
16621
|
case LLM_ARCH_GLM4:
|
|
14783
16622
|
case LLM_ARCH_GRANITE:
|
|
14784
16623
|
case LLM_ARCH_GRANITE_MOE:
|
|
16624
|
+
case LLM_ARCH_GRANITE_HYBRID:
|
|
14785
16625
|
case LLM_ARCH_CHAMELEON:
|
|
14786
16626
|
case LLM_ARCH_BAILINGMOE:
|
|
14787
16627
|
case LLM_ARCH_NEO_BERT:
|
|
16628
|
+
case LLM_ARCH_SMOLLM3:
|
|
14788
16629
|
case LLM_ARCH_ARCEE:
|
|
16630
|
+
case LLM_ARCH_ERNIE4_5:
|
|
14789
16631
|
return LLAMA_ROPE_TYPE_NORM;
|
|
14790
16632
|
|
|
14791
16633
|
// the pairs of head values are offset by n_rot/2
|
|
14792
16634
|
case LLM_ARCH_FALCON:
|
|
16635
|
+
case LLM_ARCH_FALCON_H1:
|
|
14793
16636
|
case LLM_ARCH_GROK:
|
|
14794
16637
|
case LLM_ARCH_DBRX:
|
|
14795
16638
|
case LLM_ARCH_BERT:
|
|
@@ -14821,6 +16664,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
|
14821
16664
|
case LLM_ARCH_EXAONE:
|
|
14822
16665
|
case LLM_ARCH_MINICPM3:
|
|
14823
16666
|
case LLM_ARCH_DOTS1:
|
|
16667
|
+
case LLM_ARCH_HUNYUAN_MOE:
|
|
16668
|
+
case LLM_ARCH_LFM2:
|
|
14824
16669
|
return LLAMA_ROPE_TYPE_NEOX;
|
|
14825
16670
|
|
|
14826
16671
|
case LLM_ARCH_QWEN2VL:
|