@novastera-oss/llamarn 0.2.6 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +141 -38
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +58 -24
- package/cpp/LlamaCppModel.h +3 -3
- package/cpp/PureCppImpl.cpp +1 -1
- package/cpp/PureCppImpl.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +15 -4
- package/cpp/llama.cpp/Makefile +2 -2
- package/cpp/llama.cpp/README.md +32 -13
- package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
- package/cpp/llama.cpp/common/arg.cpp +37 -6
- package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
- package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
- package/cpp/llama.cpp/common/chat-parser.h +2 -0
- package/cpp/llama.cpp/common/chat.cpp +12 -9
- package/cpp/llama.cpp/common/chat.h +1 -1
- package/cpp/llama.cpp/common/common.cpp +53 -40
- package/cpp/llama.cpp/common/common.h +6 -2
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/common/speculative.cpp +6 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +215 -76
- package/cpp/llama.cpp/ggml/CMakeLists.txt +48 -2
- package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +64 -13
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +124 -26
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +4 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +93 -104
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +194 -69
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1158 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1571 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +213 -37
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +59 -37
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +90 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +260 -49
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +497 -282
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1078 -468
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +20 -48
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +117 -165
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +192 -53
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +8 -105
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +209 -92
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +158 -203
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +36 -28
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +487 -247
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +69 -19
- package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +133 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +25 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +78 -3
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +141 -38
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/src/llama-arch.cpp +150 -3
- package/cpp/llama.cpp/src/llama-arch.h +25 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +736 -274
- package/cpp/llama.cpp/src/llama-batch.h +110 -57
- package/cpp/llama.cpp/src/llama-chat.cpp +30 -8
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +360 -266
- package/cpp/llama.cpp/src/llama-context.h +27 -23
- package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
- package/cpp/llama.cpp/src/llama-cparams.h +1 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +411 -344
- package/cpp/llama.cpp/src/llama-graph.h +126 -58
- package/cpp/llama.cpp/src/llama-hparams.cpp +10 -2
- package/cpp/llama.cpp/src/llama-hparams.h +16 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +103 -73
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +34 -42
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +345 -221
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +75 -50
- package/cpp/llama.cpp/src/llama-kv-cells.h +51 -22
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +138 -0
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +302 -317
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +60 -68
- package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
- package/cpp/llama.cpp/src/llama-memory.h +73 -36
- package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
- package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1630 -511
- package/cpp/llama.cpp/src/llama-model.h +26 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +89 -6
- package/cpp/llama.cpp/src/llama-vocab.cpp +58 -26
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/llama.cpp/src/llama.cpp +11 -7
- package/cpp/llama.cpp/src/unicode.cpp +5 -0
- package/cpp/rn-completion.cpp +2 -2
- package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
- package/cpp/{rn-utils.hpp → rn-utils.h} +3 -0
- package/ios/include/chat.h +1 -1
- package/ios/include/common.h +6 -2
- package/ios/include/llama.h +141 -38
- package/ios/libs/llama.xcframework/Info.plist +15 -15
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3624
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4725
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4746
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3652
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -2
- package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
- package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
|
@@ -8,7 +8,8 @@
|
|
|
8
8
|
|
|
9
9
|
#include "llama-kv-cache-unified.h"
|
|
10
10
|
#include "llama-kv-cache-unified-iswa.h"
|
|
11
|
-
#include "llama-kv-cache-recurrent.h"
|
|
11
|
+
#include "llama-memory-hybrid.h"
|
|
12
|
+
#include "llama-memory-recurrent.h"
|
|
12
13
|
|
|
13
14
|
#include "ggml-cpp.h"
|
|
14
15
|
|
|
@@ -80,6 +81,7 @@ const char * llm_type_name(llm_type type) {
|
|
|
80
81
|
case LLM_TYPE_40B: return "40B";
|
|
81
82
|
case LLM_TYPE_65B: return "65B";
|
|
82
83
|
case LLM_TYPE_70B: return "70B";
|
|
84
|
+
case LLM_TYPE_142B: return "142B";
|
|
83
85
|
case LLM_TYPE_236B: return "236B";
|
|
84
86
|
case LLM_TYPE_290B: return "290B";
|
|
85
87
|
case LLM_TYPE_314B: return "314B";
|
|
@@ -101,6 +103,8 @@ const char * llm_type_name(llm_type type) {
|
|
|
101
103
|
case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
|
|
102
104
|
case LLM_TYPE_30B_A3B: return "30B.A3B";
|
|
103
105
|
case LLM_TYPE_235B_A22B: return "235B.A22B";
|
|
106
|
+
case LLM_TYPE_E2B: return "E2B";
|
|
107
|
+
case LLM_TYPE_E4B: return "E4B";
|
|
104
108
|
default: return "?B";
|
|
105
109
|
}
|
|
106
110
|
}
|
|
@@ -469,6 +473,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
469
473
|
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
|
|
470
474
|
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
|
|
471
475
|
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
|
|
476
|
+
std::fill(
|
|
477
|
+
hparams.recurrent_layer_arr.begin(),
|
|
478
|
+
hparams.recurrent_layer_arr.end(),
|
|
479
|
+
llm_arch_is_recurrent(ml.get_arch()));
|
|
472
480
|
|
|
473
481
|
std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
|
|
474
482
|
|
|
@@ -543,6 +551,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
543
551
|
uint32_t n_vocab = 0;
|
|
544
552
|
ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
|
|
545
553
|
|
|
554
|
+
// for classifier models
|
|
555
|
+
ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
|
|
556
|
+
if (!classifier_labels.empty()) {
|
|
557
|
+
hparams.n_cls_out = classifier_labels.size();
|
|
558
|
+
}
|
|
559
|
+
|
|
546
560
|
// arch-specific KVs
|
|
547
561
|
switch (arch) {
|
|
548
562
|
case LLM_ARCH_LLAMA:
|
|
@@ -592,6 +606,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
592
606
|
hparams.use_kq_norm = false;
|
|
593
607
|
}
|
|
594
608
|
} break;
|
|
609
|
+
case LLM_ARCH_ARCEE:
|
|
610
|
+
{
|
|
611
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
612
|
+
|
|
613
|
+
// Arcee uses the same structure as Llama
|
|
614
|
+
switch (hparams.n_layer) {
|
|
615
|
+
case 36: type = LLM_TYPE_4B; break;
|
|
616
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
617
|
+
}
|
|
618
|
+
} break;
|
|
595
619
|
case LLM_ARCH_DECI:
|
|
596
620
|
{
|
|
597
621
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
@@ -686,7 +710,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
686
710
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
687
711
|
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
|
688
712
|
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
|
689
|
-
ml.get_arr_n(LLM_KV_CLASSIFIER_OUTPUT_LABELS, hparams.n_cls_out, false);
|
|
690
713
|
|
|
691
714
|
switch (hparams.n_layer) {
|
|
692
715
|
case 3:
|
|
@@ -733,6 +756,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
733
756
|
}
|
|
734
757
|
}
|
|
735
758
|
} break;
|
|
759
|
+
case LLM_ARCH_NEO_BERT:
|
|
760
|
+
{
|
|
761
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
762
|
+
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
|
763
|
+
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
|
764
|
+
|
|
765
|
+
if (hparams.n_layer == 28) {
|
|
766
|
+
type = LLM_TYPE_250M;
|
|
767
|
+
}
|
|
768
|
+
} break;
|
|
736
769
|
case LLM_ARCH_BLOOM:
|
|
737
770
|
{
|
|
738
771
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
@@ -956,6 +989,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
956
989
|
case 46: type = LLM_TYPE_27B; break;
|
|
957
990
|
default: type = LLM_TYPE_UNKNOWN;
|
|
958
991
|
}
|
|
992
|
+
|
|
993
|
+
// ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
|
|
994
|
+
hparams.f_attention_scale = type == LLM_TYPE_27B
|
|
995
|
+
? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
|
|
996
|
+
: 1.0f / std::sqrt(float(hparams.n_embd_head_k));
|
|
959
997
|
} break;
|
|
960
998
|
case LLM_ARCH_GEMMA3:
|
|
961
999
|
{
|
|
@@ -976,10 +1014,29 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
976
1014
|
default: type = LLM_TYPE_UNKNOWN;
|
|
977
1015
|
}
|
|
978
1016
|
|
|
1017
|
+
// ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
|
|
979
1018
|
hparams.f_attention_scale = type == LLM_TYPE_27B
|
|
980
1019
|
? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
|
|
981
1020
|
: 1.0f / std::sqrt(float(hparams.n_embd_head_k));
|
|
982
1021
|
} break;
|
|
1022
|
+
case LLM_ARCH_GEMMA3N:
|
|
1023
|
+
{
|
|
1024
|
+
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
|
1025
|
+
hparams.set_swa_pattern(5);
|
|
1026
|
+
|
|
1027
|
+
hparams.rope_freq_base_train_swa = 10000.0f;
|
|
1028
|
+
hparams.rope_freq_scale_train_swa = 1.0f;
|
|
1029
|
+
hparams.f_attention_scale = 1.0f;
|
|
1030
|
+
|
|
1031
|
+
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
|
1032
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1033
|
+
|
|
1034
|
+
switch (hparams.n_layer) {
|
|
1035
|
+
case 30: type = LLM_TYPE_E2B; break;
|
|
1036
|
+
case 35: type = LLM_TYPE_E4B; break;
|
|
1037
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1038
|
+
}
|
|
1039
|
+
} break;
|
|
983
1040
|
case LLM_ARCH_STARCODER2:
|
|
984
1041
|
{
|
|
985
1042
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
@@ -1433,6 +1490,20 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1433
1490
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1434
1491
|
}
|
|
1435
1492
|
} break;
|
|
1493
|
+
case LLM_ARCH_DOTS1:
|
|
1494
|
+
{
|
|
1495
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1496
|
+
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
|
1497
|
+
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
1498
|
+
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
|
1499
|
+
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
|
1500
|
+
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
|
1501
|
+
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
|
|
1502
|
+
switch (hparams.n_layer) {
|
|
1503
|
+
case 62: type = LLM_TYPE_142B; break;
|
|
1504
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1505
|
+
}
|
|
1506
|
+
} break;
|
|
1436
1507
|
default: throw std::runtime_error("unsupported model architecture");
|
|
1437
1508
|
}
|
|
1438
1509
|
|
|
@@ -2176,6 +2247,32 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2176
2247
|
layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
|
|
2177
2248
|
}
|
|
2178
2249
|
} break;
|
|
2250
|
+
case LLM_ARCH_NEO_BERT:
|
|
2251
|
+
{
|
|
2252
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2253
|
+
|
|
2254
|
+
cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
|
|
2255
|
+
cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
2256
|
+
|
|
2257
|
+
cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
|
|
2258
|
+
cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
|
|
2259
|
+
|
|
2260
|
+
output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2261
|
+
|
|
2262
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2263
|
+
auto & layer = layers[i];
|
|
2264
|
+
|
|
2265
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2266
|
+
|
|
2267
|
+
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
|
2268
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
2269
|
+
|
|
2270
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
2271
|
+
|
|
2272
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff*2}, 0);
|
|
2273
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
2274
|
+
}
|
|
2275
|
+
} break;
|
|
2179
2276
|
case LLM_ARCH_JINA_BERT_V2:
|
|
2180
2277
|
{
|
|
2181
2278
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
|
|
@@ -2213,8 +2310,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2213
2310
|
layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
2214
2311
|
layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
2215
2312
|
|
|
2216
|
-
layer.
|
|
2217
|
-
layer.
|
|
2313
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
|
|
2314
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
|
|
2218
2315
|
|
|
2219
2316
|
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
2220
2317
|
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
|
|
@@ -2873,6 +2970,62 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2873
2970
|
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
|
|
2874
2971
|
}
|
|
2875
2972
|
} break;
|
|
2973
|
+
case LLM_ARCH_GEMMA3N:
|
|
2974
|
+
{
|
|
2975
|
+
const int64_t n_altup = hparams.n_altup;
|
|
2976
|
+
const int64_t laurel_rank = hparams.laurel_rank;
|
|
2977
|
+
const int64_t n_embd_altup = hparams.n_embd_altup;
|
|
2978
|
+
|
|
2979
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
2980
|
+
// if output is NULL, init from the input tok embed
|
|
2981
|
+
if (output == NULL) {
|
|
2982
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
2983
|
+
}
|
|
2984
|
+
|
|
2985
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2986
|
+
tok_embd_per_layer = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_altup * n_layer, n_vocab}, 0);
|
|
2987
|
+
|
|
2988
|
+
altup_proj = create_tensor(tn(LLM_TENSOR_ALTUP_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
|
|
2989
|
+
altup_unembd_proj = create_tensor(tn(LLM_TENSOR_ALTUP_UNEMBD_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
|
|
2990
|
+
per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight"), {n_embd, n_embd_altup * n_layer}, 0);
|
|
2991
|
+
per_layer_proj_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM, "weight"), {n_embd_altup}, 0);
|
|
2992
|
+
|
|
2993
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2994
|
+
|
|
2995
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2996
|
+
auto & layer = layers[i];
|
|
2997
|
+
|
|
2998
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2999
|
+
|
|
3000
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
3001
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
3002
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
|
3003
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
|
3004
|
+
|
|
3005
|
+
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
3006
|
+
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
3007
|
+
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
|
|
3008
|
+
|
|
3009
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
3010
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
3011
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
3012
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
3013
|
+
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
|
|
3014
|
+
|
|
3015
|
+
// altup & laurel
|
|
3016
|
+
layer.per_layer_inp_gate = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE, "weight", i), {n_embd, n_embd_altup}, 0);
|
|
3017
|
+
layer.per_layer_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ, "weight", i), {n_embd_altup, n_embd}, 0);
|
|
3018
|
+
layer.per_layer_post_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
|
|
3019
|
+
layer.altup_correct_coef = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_COEF, "weight", i), {n_altup, n_altup}, 0);
|
|
3020
|
+
layer.altup_correct_scale = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_SCALE, "weight", i), {n_embd}, 0);
|
|
3021
|
+
layer.altup_predict_coef = create_tensor(tn(LLM_TENSOR_ALTUP_PREDICT_COEF, "weight", i), {n_altup, n_altup * n_altup}, 0);
|
|
3022
|
+
layer.altup_router = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER, "weight", i), {n_embd, n_altup}, 0);
|
|
3023
|
+
layer.altup_router_norm = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER_NORM, "weight", i), {n_embd}, 0);
|
|
3024
|
+
layer.laurel_l = create_tensor(tn(LLM_TENSOR_LAUREL_L, "weight", i), {n_embd, laurel_rank}, 0);
|
|
3025
|
+
layer.laurel_r = create_tensor(tn(LLM_TENSOR_LAUREL_R, "weight", i), {laurel_rank, n_embd}, 0);
|
|
3026
|
+
layer.laurel_post_norm = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM, "weight", i), {n_embd}, 0);
|
|
3027
|
+
}
|
|
3028
|
+
} break;
|
|
2876
3029
|
case LLM_ARCH_STARCODER2:
|
|
2877
3030
|
{
|
|
2878
3031
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
@@ -4112,6 +4265,89 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
4112
4265
|
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
|
4113
4266
|
}
|
|
4114
4267
|
} break;
|
|
4268
|
+
case LLM_ARCH_DOTS1:
|
|
4269
|
+
{
|
|
4270
|
+
const int64_t n_ff_exp = hparams.n_ff_exp;
|
|
4271
|
+
const int64_t n_expert_shared = hparams.n_expert_shared;
|
|
4272
|
+
|
|
4273
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
4274
|
+
|
|
4275
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
4276
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
4277
|
+
|
|
4278
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
4279
|
+
auto & layer = layers[i];
|
|
4280
|
+
|
|
4281
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
4282
|
+
|
|
4283
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
4284
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
4285
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
4286
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
|
4287
|
+
|
|
4288
|
+
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
4289
|
+
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
4290
|
+
|
|
4291
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
4292
|
+
|
|
4293
|
+
if (i < (int) hparams.n_layer_dense_lead) {
|
|
4294
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
4295
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
4296
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
4297
|
+
} else {
|
|
4298
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
|
4299
|
+
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
|
|
4300
|
+
|
|
4301
|
+
if (n_expert == 0) {
|
|
4302
|
+
throw std::runtime_error("n_expert must be > 0");
|
|
4303
|
+
}
|
|
4304
|
+
if (n_expert_used == 0) {
|
|
4305
|
+
throw std::runtime_error("n_expert_used must be > 0");
|
|
4306
|
+
}
|
|
4307
|
+
|
|
4308
|
+
// MoE branch
|
|
4309
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
|
4310
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
|
4311
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
|
4312
|
+
|
|
4313
|
+
// Shared expert branch
|
|
4314
|
+
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
|
4315
|
+
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
|
|
4316
|
+
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
|
4317
|
+
}
|
|
4318
|
+
}
|
|
4319
|
+
} break;
|
|
4320
|
+
case LLM_ARCH_ARCEE:
|
|
4321
|
+
{
|
|
4322
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
4323
|
+
|
|
4324
|
+
// output
|
|
4325
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
4326
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
4327
|
+
|
|
4328
|
+
// if output is NULL, init from the input tok embed
|
|
4329
|
+
if (output == NULL) {
|
|
4330
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
4331
|
+
}
|
|
4332
|
+
|
|
4333
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
4334
|
+
auto & layer = layers[i];
|
|
4335
|
+
|
|
4336
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
4337
|
+
|
|
4338
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
4339
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
4340
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
|
4341
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
|
4342
|
+
|
|
4343
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
4344
|
+
|
|
4345
|
+
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
|
4346
|
+
|
|
4347
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
4348
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
4349
|
+
}
|
|
4350
|
+
} break;
|
|
4115
4351
|
default:
|
|
4116
4352
|
throw std::runtime_error("unknown architecture");
|
|
4117
4353
|
}
|
|
@@ -4356,6 +4592,15 @@ void llama_model::print_info() const {
|
|
|
4356
4592
|
LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
|
|
4357
4593
|
LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
|
|
4358
4594
|
LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
|
|
4595
|
+
|
|
4596
|
+
if (!classifier_labels.empty()) {
|
|
4597
|
+
LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
|
|
4598
|
+
|
|
4599
|
+
size_t i = 0;
|
|
4600
|
+
for (auto label : classifier_labels) {
|
|
4601
|
+
LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
|
|
4602
|
+
}
|
|
4603
|
+
}
|
|
4359
4604
|
}
|
|
4360
4605
|
|
|
4361
4606
|
LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
|
|
@@ -4538,6 +4783,8 @@ struct llm_build_llama : public llm_graph_context {
|
|
|
4538
4783
|
|
|
4539
4784
|
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
4540
4785
|
|
|
4786
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
4787
|
+
|
|
4541
4788
|
for (int il = 0; il < n_layer; ++il) {
|
|
4542
4789
|
ggml_tensor * inpSA = inpL;
|
|
4543
4790
|
|
|
@@ -4600,9 +4847,7 @@ struct llm_build_llama : public llm_graph_context {
|
|
|
4600
4847
|
cb(cur, "attn_out", il);
|
|
4601
4848
|
}
|
|
4602
4849
|
|
|
4603
|
-
if (il == n_layer - 1) {
|
|
4604
|
-
// skip computing output for unused tokens
|
|
4605
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
4850
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
4606
4851
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
4607
4852
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
4608
4853
|
}
|
|
@@ -4698,6 +4943,8 @@ struct llm_build_llama_iswa : public llm_graph_context {
|
|
|
4698
4943
|
|
|
4699
4944
|
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
4700
4945
|
|
|
4946
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
4947
|
+
|
|
4701
4948
|
for (int il = 0; il < n_layer; ++il) {
|
|
4702
4949
|
ggml_tensor * inpSA = inpL;
|
|
4703
4950
|
|
|
@@ -4774,9 +5021,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
|
|
|
4774
5021
|
cb(cur, "attn_out", il);
|
|
4775
5022
|
}
|
|
4776
5023
|
|
|
4777
|
-
if (il == n_layer - 1) {
|
|
4778
|
-
// skip computing output for unused tokens
|
|
4779
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5024
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
4780
5025
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
4781
5026
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
4782
5027
|
}
|
|
@@ -4876,6 +5121,9 @@ struct llm_build_deci : public llm_graph_context {
|
|
|
4876
5121
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
4877
5122
|
|
|
4878
5123
|
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
5124
|
+
|
|
5125
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5126
|
+
|
|
4879
5127
|
for (int il = 0; il < n_layer; ++il) {
|
|
4880
5128
|
ggml_tensor * inpSA = inpL;
|
|
4881
5129
|
const int64_t n_head_kv = hparams.n_head_kv(il);
|
|
@@ -4949,9 +5197,7 @@ struct llm_build_deci : public llm_graph_context {
|
|
|
4949
5197
|
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
4950
5198
|
}
|
|
4951
5199
|
|
|
4952
|
-
if (il == n_layer - 1) {
|
|
4953
|
-
// skip computing output for unused tokens
|
|
4954
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5200
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
4955
5201
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
4956
5202
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
4957
5203
|
}
|
|
@@ -5030,6 +5276,8 @@ struct llm_build_baichuan : public llm_graph_context {
|
|
|
5030
5276
|
|
|
5031
5277
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
5032
5278
|
|
|
5279
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5280
|
+
|
|
5033
5281
|
for (int il = 0; il < n_layer; ++il) {
|
|
5034
5282
|
ggml_tensor * inpSA = inpL;
|
|
5035
5283
|
|
|
@@ -5081,9 +5329,7 @@ struct llm_build_baichuan : public llm_graph_context {
|
|
|
5081
5329
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5082
5330
|
}
|
|
5083
5331
|
|
|
5084
|
-
if (il == n_layer - 1) {
|
|
5085
|
-
// skip computing output for unused tokens
|
|
5086
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5332
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
5087
5333
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
5088
5334
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
5089
5335
|
}
|
|
@@ -5152,6 +5398,8 @@ struct llm_build_xverse : public llm_graph_context {
|
|
|
5152
5398
|
|
|
5153
5399
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
5154
5400
|
|
|
5401
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5402
|
+
|
|
5155
5403
|
for (int il = 0; il < n_layer; ++il) {
|
|
5156
5404
|
ggml_tensor * inpSA = inpL;
|
|
5157
5405
|
|
|
@@ -5196,9 +5444,7 @@ struct llm_build_xverse : public llm_graph_context {
|
|
|
5196
5444
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5197
5445
|
}
|
|
5198
5446
|
|
|
5199
|
-
if (il == n_layer - 1) {
|
|
5200
|
-
// skip computing output for unused tokens
|
|
5201
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5447
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
5202
5448
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
5203
5449
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
5204
5450
|
}
|
|
@@ -5266,6 +5512,8 @@ struct llm_build_falcon : public llm_graph_context {
|
|
|
5266
5512
|
|
|
5267
5513
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
5268
5514
|
|
|
5515
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5516
|
+
|
|
5269
5517
|
for (int il = 0; il < n_layer; ++il) {
|
|
5270
5518
|
ggml_tensor * attn_norm;
|
|
5271
5519
|
|
|
@@ -5321,9 +5569,7 @@ struct llm_build_falcon : public llm_graph_context {
|
|
|
5321
5569
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5322
5570
|
}
|
|
5323
5571
|
|
|
5324
|
-
if (il == n_layer - 1) {
|
|
5325
|
-
// skip computing output for unused tokens
|
|
5326
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5572
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
5327
5573
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
5328
5574
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
5329
5575
|
attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
|
|
@@ -5392,6 +5638,8 @@ struct llm_build_grok : public llm_graph_context {
|
|
|
5392
5638
|
|
|
5393
5639
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
5394
5640
|
|
|
5641
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5642
|
+
|
|
5395
5643
|
for (int il = 0; il < n_layer; ++il) {
|
|
5396
5644
|
ggml_tensor * inpSA = inpL;
|
|
5397
5645
|
|
|
@@ -5451,9 +5699,7 @@ struct llm_build_grok : public llm_graph_context {
|
|
|
5451
5699
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
5452
5700
|
}
|
|
5453
5701
|
|
|
5454
|
-
if (il == n_layer - 1) {
|
|
5455
|
-
// skip computing output for unused tokens
|
|
5456
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5702
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
5457
5703
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
5458
5704
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
5459
5705
|
}
|
|
@@ -5552,6 +5798,8 @@ struct llm_build_dbrx : public llm_graph_context {
|
|
|
5552
5798
|
|
|
5553
5799
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
5554
5800
|
|
|
5801
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5802
|
+
|
|
5555
5803
|
for (int il = 0; il < n_layer; ++il) {
|
|
5556
5804
|
ggml_tensor * inpSA = inpL;
|
|
5557
5805
|
|
|
@@ -5602,9 +5850,7 @@ struct llm_build_dbrx : public llm_graph_context {
|
|
|
5602
5850
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5603
5851
|
}
|
|
5604
5852
|
|
|
5605
|
-
if (il == n_layer - 1) {
|
|
5606
|
-
// skip computing output for unused tokens
|
|
5607
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5853
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
5608
5854
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
5609
5855
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
5610
5856
|
}
|
|
@@ -5684,6 +5930,8 @@ struct llm_build_starcoder : public llm_graph_context {
|
|
|
5684
5930
|
inpL = ggml_add(ctx0, inpL, pos);
|
|
5685
5931
|
cb(inpL, "inpL", -1);
|
|
5686
5932
|
|
|
5933
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5934
|
+
|
|
5687
5935
|
for (int il = 0; il < n_layer; ++il) {
|
|
5688
5936
|
cur = build_norm(inpL,
|
|
5689
5937
|
model.layers[il].attn_norm,
|
|
@@ -5716,9 +5964,7 @@ struct llm_build_starcoder : public llm_graph_context {
|
|
|
5716
5964
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5717
5965
|
}
|
|
5718
5966
|
|
|
5719
|
-
if (il == n_layer - 1) {
|
|
5720
|
-
// skip computing output for unused tokens
|
|
5721
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5967
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
5722
5968
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
5723
5969
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
5724
5970
|
}
|
|
@@ -5783,6 +6029,8 @@ struct llm_build_refact : public llm_graph_context {
|
|
|
5783
6029
|
|
|
5784
6030
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
5785
6031
|
|
|
6032
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6033
|
+
|
|
5786
6034
|
for (int il = 0; il < n_layer; ++il) {
|
|
5787
6035
|
ggml_tensor * inpSA = inpL;
|
|
5788
6036
|
|
|
@@ -5815,9 +6063,7 @@ struct llm_build_refact : public llm_graph_context {
|
|
|
5815
6063
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5816
6064
|
}
|
|
5817
6065
|
|
|
5818
|
-
if (il == n_layer - 1) {
|
|
5819
|
-
// skip computing output for unused tokens
|
|
5820
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6066
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
5821
6067
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
5822
6068
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
5823
6069
|
}
|
|
@@ -5903,78 +6149,79 @@ struct llm_build_bert : public llm_graph_context {
|
|
|
5903
6149
|
|
|
5904
6150
|
auto * inp_attn = build_attn_inp_no_cache();
|
|
5905
6151
|
|
|
5906
|
-
|
|
6152
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6153
|
+
|
|
5907
6154
|
for (int il = 0; il < n_layer; ++il) {
|
|
5908
6155
|
ggml_tensor * cur = inpL;
|
|
5909
6156
|
|
|
5910
|
-
|
|
5911
|
-
|
|
5912
|
-
|
|
6157
|
+
{
|
|
6158
|
+
ggml_tensor * Qcur;
|
|
6159
|
+
ggml_tensor * Kcur;
|
|
6160
|
+
ggml_tensor * Vcur;
|
|
5913
6161
|
|
|
5914
|
-
|
|
5915
|
-
|
|
5916
|
-
|
|
5917
|
-
|
|
6162
|
+
// self-attention
|
|
6163
|
+
if (model.layers[il].wqkv) {
|
|
6164
|
+
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
|
6165
|
+
cb(cur, "wqkv", il);
|
|
5918
6166
|
|
|
5919
|
-
|
|
5920
|
-
|
|
5921
|
-
|
|
6167
|
+
if (model.layers[il].bqkv) {
|
|
6168
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
6169
|
+
cb(cur, "bqkv", il);
|
|
6170
|
+
}
|
|
6171
|
+
|
|
6172
|
+
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
|
6173
|
+
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
|
6174
|
+
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
6175
|
+
} else {
|
|
6176
|
+
Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
|
|
6177
|
+
Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
|
|
6178
|
+
Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
|
|
5922
6179
|
}
|
|
5923
6180
|
|
|
5924
|
-
|
|
5925
|
-
|
|
5926
|
-
|
|
5927
|
-
|
|
5928
|
-
|
|
5929
|
-
|
|
5930
|
-
Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
|
|
5931
|
-
}
|
|
6181
|
+
if (model.layers[il].attn_q_norm) {
|
|
6182
|
+
Qcur = build_norm(Qcur,
|
|
6183
|
+
model.layers[il].attn_q_norm,
|
|
6184
|
+
model.layers[il].attn_q_norm_b,
|
|
6185
|
+
LLM_NORM, il);
|
|
6186
|
+
}
|
|
5932
6187
|
|
|
5933
|
-
|
|
5934
|
-
|
|
5935
|
-
|
|
5936
|
-
|
|
5937
|
-
|
|
5938
|
-
|
|
6188
|
+
if (model.layers[il].attn_k_norm) {
|
|
6189
|
+
Kcur = build_norm(Kcur,
|
|
6190
|
+
model.layers[il].attn_k_norm,
|
|
6191
|
+
model.layers[il].attn_k_norm_b,
|
|
6192
|
+
LLM_NORM, il);
|
|
6193
|
+
}
|
|
5939
6194
|
|
|
5940
|
-
|
|
5941
|
-
Kcur =
|
|
5942
|
-
|
|
5943
|
-
model.layers[il].attn_k_norm_b,
|
|
5944
|
-
LLM_NORM, il);
|
|
5945
|
-
}
|
|
6195
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
6196
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
6197
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
5946
6198
|
|
|
5947
|
-
|
|
5948
|
-
|
|
5949
|
-
|
|
6199
|
+
// RoPE
|
|
6200
|
+
if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
|
|
6201
|
+
Qcur = ggml_rope_ext(
|
|
6202
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
6203
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
6204
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6205
|
+
);
|
|
5950
6206
|
|
|
5951
|
-
|
|
5952
|
-
|
|
5953
|
-
|
|
5954
|
-
|
|
5955
|
-
|
|
5956
|
-
|
|
5957
|
-
);
|
|
6207
|
+
Kcur = ggml_rope_ext(
|
|
6208
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
6209
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
6210
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6211
|
+
);
|
|
6212
|
+
}
|
|
5958
6213
|
|
|
5959
|
-
|
|
5960
|
-
|
|
5961
|
-
|
|
5962
|
-
ext_factor, attn_factor, beta_fast, beta_slow
|
|
5963
|
-
);
|
|
5964
|
-
}
|
|
5965
|
-
|
|
5966
|
-
cb(Qcur, "Qcur", il);
|
|
5967
|
-
cb(Kcur, "Kcur", il);
|
|
5968
|
-
cb(Vcur, "Vcur", il);
|
|
6214
|
+
cb(Qcur, "Qcur", il);
|
|
6215
|
+
cb(Kcur, "Kcur", il);
|
|
6216
|
+
cb(Vcur, "Vcur", il);
|
|
5969
6217
|
|
|
5970
|
-
|
|
5971
|
-
|
|
5972
|
-
|
|
5973
|
-
|
|
6218
|
+
cur = build_attn(inp_attn, gf,
|
|
6219
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
6220
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6221
|
+
cb(cur, "kqv_out", il);
|
|
6222
|
+
}
|
|
5974
6223
|
|
|
5975
|
-
if (il == n_layer - 1 &&
|
|
5976
|
-
// skip computing output for unused tokens
|
|
5977
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6224
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
5978
6225
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
5979
6226
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
5980
6227
|
}
|
|
@@ -6023,7 +6270,7 @@ struct llm_build_bert : public llm_graph_context {
|
|
|
6023
6270
|
model.layers[il].ffn_gate, NULL, NULL,
|
|
6024
6271
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
|
6025
6272
|
NULL,
|
|
6026
|
-
LLM_FFN_GELU, LLM_FFN_PAR, il);
|
|
6273
|
+
model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
|
|
6027
6274
|
cb(cur, "ffn_out", il);
|
|
6028
6275
|
} else {
|
|
6029
6276
|
cur = build_ffn(cur,
|
|
@@ -6054,6 +6301,118 @@ struct llm_build_bert : public llm_graph_context {
|
|
|
6054
6301
|
}
|
|
6055
6302
|
};
|
|
6056
6303
|
|
|
6304
|
+
struct llm_build_neo_bert : public llm_graph_context {
|
|
6305
|
+
llm_build_neo_bert(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
6306
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
6307
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
6308
|
+
|
|
6309
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
6310
|
+
|
|
6311
|
+
ggml_tensor * cur;
|
|
6312
|
+
ggml_tensor * inpL;
|
|
6313
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
6314
|
+
|
|
6315
|
+
// construct input embeddings (token, type, position)
|
|
6316
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
6317
|
+
cb(inpL, "inp_embd", -1);
|
|
6318
|
+
|
|
6319
|
+
auto * inp_attn = build_attn_inp_no_cache();
|
|
6320
|
+
|
|
6321
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6322
|
+
|
|
6323
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
6324
|
+
ggml_tensor * cur = inpL;
|
|
6325
|
+
|
|
6326
|
+
// pre-norm
|
|
6327
|
+
cur = build_norm(inpL,
|
|
6328
|
+
model.layers[il].attn_norm, NULL,
|
|
6329
|
+
LLM_NORM_RMS, il);
|
|
6330
|
+
|
|
6331
|
+
{
|
|
6332
|
+
ggml_tensor * Qcur;
|
|
6333
|
+
ggml_tensor * Kcur;
|
|
6334
|
+
ggml_tensor * Vcur;
|
|
6335
|
+
|
|
6336
|
+
// self-attention
|
|
6337
|
+
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
|
6338
|
+
cb(cur, "wqkv", il);
|
|
6339
|
+
|
|
6340
|
+
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
|
6341
|
+
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
|
6342
|
+
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
6343
|
+
|
|
6344
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
6345
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
6346
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
6347
|
+
|
|
6348
|
+
// RoPE
|
|
6349
|
+
Qcur = ggml_rope_ext(
|
|
6350
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
6351
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
6352
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6353
|
+
);
|
|
6354
|
+
|
|
6355
|
+
Kcur = ggml_rope_ext(
|
|
6356
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
6357
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
6358
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6359
|
+
);
|
|
6360
|
+
|
|
6361
|
+
cb(Qcur, "Qcur", il);
|
|
6362
|
+
cb(Kcur, "Kcur", il);
|
|
6363
|
+
cb(Vcur, "Vcur", il);
|
|
6364
|
+
|
|
6365
|
+
cur = build_attn(inp_attn, gf,
|
|
6366
|
+
model.layers[il].wo, nullptr,
|
|
6367
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6368
|
+
cb(cur, "kqv_out", il);
|
|
6369
|
+
}
|
|
6370
|
+
|
|
6371
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
6372
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
6373
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
6374
|
+
}
|
|
6375
|
+
|
|
6376
|
+
// re-add the layer input
|
|
6377
|
+
cur = ggml_add(ctx0, cur, inpL);
|
|
6378
|
+
|
|
6379
|
+
ggml_tensor * ffn_inp = cur;
|
|
6380
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
6381
|
+
|
|
6382
|
+
// pre-norm
|
|
6383
|
+
cur = build_norm(ffn_inp,
|
|
6384
|
+
model.layers[il].ffn_norm, NULL,
|
|
6385
|
+
LLM_NORM_RMS, il);
|
|
6386
|
+
cb(cur, "ffn_norm", il);
|
|
6387
|
+
|
|
6388
|
+
// feed-forward network
|
|
6389
|
+
cur = build_ffn(cur,
|
|
6390
|
+
model.layers[il].ffn_up,
|
|
6391
|
+
NULL, NULL, NULL, NULL, NULL,
|
|
6392
|
+
model.layers[il].ffn_down,
|
|
6393
|
+
NULL, NULL, NULL,
|
|
6394
|
+
LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
|
|
6395
|
+
|
|
6396
|
+
// attentions bypass the intermediate layer
|
|
6397
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
6398
|
+
|
|
6399
|
+
// input for next layer
|
|
6400
|
+
inpL = cur;
|
|
6401
|
+
}
|
|
6402
|
+
|
|
6403
|
+
cur = inpL;
|
|
6404
|
+
|
|
6405
|
+
cur = build_norm(cur,
|
|
6406
|
+
model.output_norm_enc, NULL,
|
|
6407
|
+
LLM_NORM_RMS, -1);
|
|
6408
|
+
|
|
6409
|
+
cb(cur, "result_embd", -1);
|
|
6410
|
+
res->t_embd = cur;
|
|
6411
|
+
|
|
6412
|
+
ggml_build_forward_expand(gf, cur);
|
|
6413
|
+
}
|
|
6414
|
+
};
|
|
6415
|
+
|
|
6057
6416
|
struct llm_build_bloom : public llm_graph_context {
|
|
6058
6417
|
llm_build_bloom(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
6059
6418
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
@@ -6074,6 +6433,8 @@ struct llm_build_bloom : public llm_graph_context {
|
|
|
6074
6433
|
LLM_NORM, -1);
|
|
6075
6434
|
cb(inpL, "inp_norm", -1);
|
|
6076
6435
|
|
|
6436
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6437
|
+
|
|
6077
6438
|
for (int il = 0; il < n_layer; ++il) {
|
|
6078
6439
|
cur = build_norm(inpL,
|
|
6079
6440
|
model.layers[il].attn_norm,
|
|
@@ -6106,9 +6467,7 @@ struct llm_build_bloom : public llm_graph_context {
|
|
|
6106
6467
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6107
6468
|
}
|
|
6108
6469
|
|
|
6109
|
-
if (il == n_layer - 1) {
|
|
6110
|
-
// skip computing output for unused tokens
|
|
6111
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6470
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
6112
6471
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
6113
6472
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
6114
6473
|
}
|
|
@@ -6185,6 +6544,8 @@ struct llm_build_mpt : public llm_graph_context {
|
|
|
6185
6544
|
cb(inpL, "inpL", -1);
|
|
6186
6545
|
}
|
|
6187
6546
|
|
|
6547
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6548
|
+
|
|
6188
6549
|
for (int il = 0; il < n_layer; ++il) {
|
|
6189
6550
|
ggml_tensor * attn_norm;
|
|
6190
6551
|
|
|
@@ -6247,9 +6608,7 @@ struct llm_build_mpt : public llm_graph_context {
|
|
|
6247
6608
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6248
6609
|
}
|
|
6249
6610
|
|
|
6250
|
-
if (il == n_layer - 1) {
|
|
6251
|
-
// skip computing output for unused tokens
|
|
6252
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6611
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
6253
6612
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
6254
6613
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
6255
6614
|
}
|
|
@@ -6318,6 +6677,8 @@ struct llm_build_stablelm : public llm_graph_context {
|
|
|
6318
6677
|
|
|
6319
6678
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
6320
6679
|
|
|
6680
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6681
|
+
|
|
6321
6682
|
for (int il = 0; il < n_layer; ++il) {
|
|
6322
6683
|
// norm
|
|
6323
6684
|
cur = build_norm(inpL,
|
|
@@ -6393,9 +6754,7 @@ struct llm_build_stablelm : public llm_graph_context {
|
|
|
6393
6754
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6394
6755
|
}
|
|
6395
6756
|
|
|
6396
|
-
if (il == n_layer - 1) {
|
|
6397
|
-
// skip computing output for unused tokens
|
|
6398
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6757
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
6399
6758
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
6400
6759
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
6401
6760
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
@@ -6470,6 +6829,8 @@ struct llm_build_qwen : public llm_graph_context {
|
|
|
6470
6829
|
|
|
6471
6830
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
6472
6831
|
|
|
6832
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6833
|
+
|
|
6473
6834
|
for (int il = 0; il < n_layer; ++il) {
|
|
6474
6835
|
ggml_tensor * inpSA = inpL;
|
|
6475
6836
|
|
|
@@ -6516,9 +6877,7 @@ struct llm_build_qwen : public llm_graph_context {
|
|
|
6516
6877
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6517
6878
|
}
|
|
6518
6879
|
|
|
6519
|
-
if (il == n_layer - 1) {
|
|
6520
|
-
// skip computing output for unused tokens
|
|
6521
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6880
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
6522
6881
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
6523
6882
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
6524
6883
|
}
|
|
@@ -6587,6 +6946,8 @@ struct llm_build_qwen2 : public llm_graph_context {
|
|
|
6587
6946
|
|
|
6588
6947
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
6589
6948
|
|
|
6949
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6950
|
+
|
|
6590
6951
|
for (int il = 0; il < n_layer; ++il) {
|
|
6591
6952
|
ggml_tensor * inpSA = inpL;
|
|
6592
6953
|
|
|
@@ -6636,9 +6997,7 @@ struct llm_build_qwen2 : public llm_graph_context {
|
|
|
6636
6997
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6637
6998
|
}
|
|
6638
6999
|
|
|
6639
|
-
if (il == n_layer - 1) {
|
|
6640
|
-
// skip computing output for unused tokens
|
|
6641
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7000
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
6642
7001
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
6643
7002
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
6644
7003
|
}
|
|
@@ -6708,6 +7067,8 @@ struct llm_build_qwen2vl : public llm_graph_context {
|
|
|
6708
7067
|
int sections[4];
|
|
6709
7068
|
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
|
6710
7069
|
|
|
7070
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7071
|
+
|
|
6711
7072
|
for (int il = 0; il < n_layer; ++il) {
|
|
6712
7073
|
ggml_tensor * inpSA = inpL;
|
|
6713
7074
|
|
|
@@ -6757,9 +7118,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
|
|
|
6757
7118
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6758
7119
|
}
|
|
6759
7120
|
|
|
6760
|
-
if (il == n_layer - 1) {
|
|
6761
|
-
// skip computing output for unused tokens
|
|
6762
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7121
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
6763
7122
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
6764
7123
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
6765
7124
|
}
|
|
@@ -6826,6 +7185,8 @@ struct llm_build_qwen2moe : public llm_graph_context {
|
|
|
6826
7185
|
|
|
6827
7186
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
6828
7187
|
|
|
7188
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7189
|
+
|
|
6829
7190
|
for (int il = 0; il < n_layer; ++il) {
|
|
6830
7191
|
ggml_tensor * inpSA = inpL;
|
|
6831
7192
|
|
|
@@ -6884,9 +7245,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
|
|
|
6884
7245
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6885
7246
|
}
|
|
6886
7247
|
|
|
6887
|
-
if (il == n_layer - 1) {
|
|
6888
|
-
// skip computing output for unused tokens
|
|
6889
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7248
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
6890
7249
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
6891
7250
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
6892
7251
|
}
|
|
@@ -6985,6 +7344,8 @@ struct llm_build_qwen3 : public llm_graph_context {
|
|
|
6985
7344
|
|
|
6986
7345
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
6987
7346
|
|
|
7347
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7348
|
+
|
|
6988
7349
|
for (int il = 0; il < n_layer; ++il) {
|
|
6989
7350
|
ggml_tensor * inpSA = inpL;
|
|
6990
7351
|
|
|
@@ -7037,9 +7398,7 @@ struct llm_build_qwen3 : public llm_graph_context {
|
|
|
7037
7398
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7038
7399
|
}
|
|
7039
7400
|
|
|
7040
|
-
if (il == n_layer - 1) {
|
|
7041
|
-
// skip computing output for unused tokens
|
|
7042
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7401
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
7043
7402
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
7044
7403
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
7045
7404
|
}
|
|
@@ -7106,6 +7465,8 @@ struct llm_build_qwen3moe : public llm_graph_context {
|
|
|
7106
7465
|
|
|
7107
7466
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
7108
7467
|
|
|
7468
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7469
|
+
|
|
7109
7470
|
for (int il = 0; il < n_layer; ++il) {
|
|
7110
7471
|
ggml_tensor * inpSA = inpL;
|
|
7111
7472
|
|
|
@@ -7158,9 +7519,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
|
|
|
7158
7519
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7159
7520
|
}
|
|
7160
7521
|
|
|
7161
|
-
if (il == n_layer - 1) {
|
|
7162
|
-
// skip computing output for unused tokens
|
|
7163
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7522
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
7164
7523
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
7165
7524
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
7166
7525
|
}
|
|
@@ -7236,6 +7595,8 @@ struct llm_build_phi2 : public llm_graph_context {
|
|
|
7236
7595
|
|
|
7237
7596
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
7238
7597
|
|
|
7598
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7599
|
+
|
|
7239
7600
|
for (int il = 0; il < n_layer; ++il) {
|
|
7240
7601
|
attn_norm_output = build_norm(inpL,
|
|
7241
7602
|
model.layers[il].attn_norm,
|
|
@@ -7298,9 +7659,7 @@ struct llm_build_phi2 : public llm_graph_context {
|
|
|
7298
7659
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
7299
7660
|
}
|
|
7300
7661
|
|
|
7301
|
-
if (il == n_layer - 1) {
|
|
7302
|
-
// skip computing output for unused tokens
|
|
7303
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7662
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
7304
7663
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
7305
7664
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
7306
7665
|
attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
|
|
@@ -7372,6 +7731,8 @@ struct llm_build_phi3 : public llm_graph_context {
|
|
|
7372
7731
|
inp_attn = build_attn_inp_kv_unified();
|
|
7373
7732
|
}
|
|
7374
7733
|
|
|
7734
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7735
|
+
|
|
7375
7736
|
for (int il = 0; il < n_layer; ++il) {
|
|
7376
7737
|
auto * residual = inpL;
|
|
7377
7738
|
|
|
@@ -7435,9 +7796,7 @@ struct llm_build_phi3 : public llm_graph_context {
|
|
|
7435
7796
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
7436
7797
|
}
|
|
7437
7798
|
|
|
7438
|
-
if (il == n_layer - 1) {
|
|
7439
|
-
// skip computing output for unused tokens
|
|
7440
|
-
ggml_tensor* inp_out_ids = build_inp_out_ids();
|
|
7799
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
7441
7800
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
7442
7801
|
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
|
|
7443
7802
|
}
|
|
@@ -7523,15 +7882,16 @@ struct llm_build_plamo : public llm_graph_context {
|
|
|
7523
7882
|
|
|
7524
7883
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
7525
7884
|
|
|
7526
|
-
|
|
7885
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7527
7886
|
|
|
7887
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
7528
7888
|
// norm
|
|
7529
7889
|
cur = build_norm(inpL,
|
|
7530
7890
|
model.layers[il].attn_norm, NULL,
|
|
7531
7891
|
LLM_NORM_RMS, il);
|
|
7532
7892
|
cb(cur, "attn_norm", il);
|
|
7533
7893
|
|
|
7534
|
-
ggml_tensor *
|
|
7894
|
+
ggml_tensor * sa_inp = cur;
|
|
7535
7895
|
|
|
7536
7896
|
// self-attention
|
|
7537
7897
|
{
|
|
@@ -7569,18 +7929,17 @@ struct llm_build_plamo : public llm_graph_context {
|
|
|
7569
7929
|
model.layers[il].wo, NULL,
|
|
7570
7930
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7571
7931
|
}
|
|
7572
|
-
ggml_tensor * sa_out = cur;
|
|
7573
|
-
|
|
7574
|
-
cur = attention_norm;
|
|
7575
7932
|
|
|
7576
|
-
if (il == n_layer - 1) {
|
|
7577
|
-
// skip computing output for unused tokens
|
|
7578
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7933
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
7579
7934
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
7580
|
-
|
|
7935
|
+
sa_inp = ggml_get_rows(ctx0, sa_inp, inp_out_ids);
|
|
7581
7936
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
7582
7937
|
}
|
|
7583
7938
|
|
|
7939
|
+
ggml_tensor * sa_out = cur;
|
|
7940
|
+
|
|
7941
|
+
cur = sa_inp;
|
|
7942
|
+
|
|
7584
7943
|
// feed-forward network
|
|
7585
7944
|
{
|
|
7586
7945
|
cur = build_ffn(cur,
|
|
@@ -7645,6 +8004,8 @@ struct llm_build_gpt2 : public llm_graph_context {
|
|
|
7645
8004
|
inpL = ggml_add(ctx0, inpL, pos);
|
|
7646
8005
|
cb(inpL, "inpL", -1);
|
|
7647
8006
|
|
|
8007
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8008
|
+
|
|
7648
8009
|
for (int il = 0; il < n_layer; ++il) {
|
|
7649
8010
|
cur = build_norm(inpL,
|
|
7650
8011
|
model.layers[il].attn_norm,
|
|
@@ -7677,9 +8038,7 @@ struct llm_build_gpt2 : public llm_graph_context {
|
|
|
7677
8038
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7678
8039
|
}
|
|
7679
8040
|
|
|
7680
|
-
if (il == n_layer - 1) {
|
|
7681
|
-
// skip computing output for unused tokens
|
|
7682
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8041
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
7683
8042
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
7684
8043
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
7685
8044
|
}
|
|
@@ -7749,6 +8108,8 @@ struct llm_build_codeshell : public llm_graph_context {
|
|
|
7749
8108
|
|
|
7750
8109
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
7751
8110
|
|
|
8111
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8112
|
+
|
|
7752
8113
|
for (int il = 0; il < n_layer; ++il) {
|
|
7753
8114
|
cur = build_norm(inpL,
|
|
7754
8115
|
model.layers[il].attn_norm,
|
|
@@ -7793,9 +8154,7 @@ struct llm_build_codeshell : public llm_graph_context {
|
|
|
7793
8154
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7794
8155
|
}
|
|
7795
8156
|
|
|
7796
|
-
if (il == n_layer - 1) {
|
|
7797
|
-
// skip computing output for unused tokens
|
|
7798
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8157
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
7799
8158
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
7800
8159
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
7801
8160
|
}
|
|
@@ -7849,128 +8208,128 @@ struct llm_build_codeshell : public llm_graph_context {
|
|
|
7849
8208
|
|
|
7850
8209
|
struct llm_build_orion : public llm_graph_context {
|
|
7851
8210
|
llm_build_orion(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
7852
|
-
|
|
8211
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
7853
8212
|
|
|
7854
|
-
|
|
7855
|
-
|
|
8213
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
8214
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
7856
8215
|
|
|
7857
|
-
|
|
7858
|
-
|
|
8216
|
+
ggml_tensor * cur;
|
|
8217
|
+
ggml_tensor * inpL;
|
|
7859
8218
|
|
|
7860
|
-
|
|
8219
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
7861
8220
|
|
|
7862
|
-
|
|
7863
|
-
|
|
8221
|
+
// inp_pos - contains the positions
|
|
8222
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
7864
8223
|
|
|
7865
|
-
|
|
8224
|
+
auto * inp_attn = build_attn_inp_kv_unified();
|
|
7866
8225
|
|
|
7867
|
-
|
|
7868
|
-
ggml_tensor * inpSA = inpL;
|
|
8226
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7869
8227
|
|
|
7870
|
-
|
|
7871
|
-
|
|
7872
|
-
model.layers[il].attn_norm, model.layers[il].attn_norm_b,
|
|
7873
|
-
LLM_NORM, il);
|
|
7874
|
-
cb(cur, "attn_norm", il);
|
|
8228
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
8229
|
+
ggml_tensor * inpSA = inpL;
|
|
7875
8230
|
|
|
7876
|
-
|
|
7877
|
-
|
|
7878
|
-
|
|
7879
|
-
|
|
7880
|
-
cb(
|
|
7881
|
-
// if (model.layers[il].bq) {
|
|
7882
|
-
// Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
7883
|
-
// cb(Qcur, "Qcur", il);
|
|
7884
|
-
// }
|
|
7885
|
-
|
|
7886
|
-
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
7887
|
-
cb(Kcur, "Kcur", il);
|
|
7888
|
-
// if (model.layers[il].bk) {
|
|
7889
|
-
// Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
7890
|
-
// cb(Kcur, "Kcur", il);
|
|
7891
|
-
// }
|
|
7892
|
-
|
|
7893
|
-
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
7894
|
-
cb(Vcur, "Vcur", il);
|
|
7895
|
-
// if (model.layers[il].bv) {
|
|
7896
|
-
// Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
7897
|
-
// cb(Vcur, "Vcur", il);
|
|
7898
|
-
// }
|
|
7899
|
-
|
|
7900
|
-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
7901
|
-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
7902
|
-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
7903
|
-
|
|
7904
|
-
Qcur = ggml_rope_ext(
|
|
7905
|
-
ctx0, Qcur, inp_pos, nullptr,
|
|
7906
|
-
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7907
|
-
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7908
|
-
);
|
|
8231
|
+
// norm
|
|
8232
|
+
cur = build_norm(inpL,
|
|
8233
|
+
model.layers[il].attn_norm, model.layers[il].attn_norm_b,
|
|
8234
|
+
LLM_NORM, il);
|
|
8235
|
+
cb(cur, "attn_norm", il);
|
|
7909
8236
|
|
|
7910
|
-
|
|
7911
|
-
|
|
7912
|
-
|
|
7913
|
-
|
|
7914
|
-
|
|
8237
|
+
// self-attention
|
|
8238
|
+
{
|
|
8239
|
+
// compute Q and K and RoPE them
|
|
8240
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
8241
|
+
cb(Qcur, "Qcur", il);
|
|
8242
|
+
// if (model.layers[il].bq) {
|
|
8243
|
+
// Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
8244
|
+
// cb(Qcur, "Qcur", il);
|
|
8245
|
+
// }
|
|
7915
8246
|
|
|
7916
|
-
|
|
7917
|
-
|
|
7918
|
-
|
|
8247
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
8248
|
+
cb(Kcur, "Kcur", il);
|
|
8249
|
+
// if (model.layers[il].bk) {
|
|
8250
|
+
// Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
8251
|
+
// cb(Kcur, "Kcur", il);
|
|
8252
|
+
// }
|
|
7919
8253
|
|
|
7920
|
-
|
|
7921
|
-
|
|
7922
|
-
|
|
7923
|
-
|
|
8254
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
8255
|
+
cb(Vcur, "Vcur", il);
|
|
8256
|
+
// if (model.layers[il].bv) {
|
|
8257
|
+
// Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
8258
|
+
// cb(Vcur, "Vcur", il);
|
|
8259
|
+
// }
|
|
7924
8260
|
|
|
7925
|
-
|
|
7926
|
-
|
|
7927
|
-
|
|
7928
|
-
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
7929
|
-
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
7930
|
-
}
|
|
8261
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
8262
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
8263
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
7931
8264
|
|
|
7932
|
-
|
|
7933
|
-
|
|
8265
|
+
Qcur = ggml_rope_ext(
|
|
8266
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
8267
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8268
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8269
|
+
);
|
|
7934
8270
|
|
|
7935
|
-
|
|
7936
|
-
|
|
7937
|
-
|
|
7938
|
-
|
|
7939
|
-
|
|
8271
|
+
Kcur = ggml_rope_ext(
|
|
8272
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
8273
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8274
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8275
|
+
);
|
|
7940
8276
|
|
|
7941
|
-
|
|
7942
|
-
|
|
7943
|
-
|
|
7944
|
-
model.layers[il].ffn_down, NULL, NULL,
|
|
7945
|
-
NULL,
|
|
7946
|
-
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
7947
|
-
cb(cur, "ffn_out", il);
|
|
8277
|
+
cb(Qcur, "Qcur", il);
|
|
8278
|
+
cb(Kcur, "Kcur", il);
|
|
8279
|
+
cb(Vcur, "Vcur", il);
|
|
7948
8280
|
|
|
7949
|
-
|
|
8281
|
+
cur = build_attn(inp_attn, gf,
|
|
8282
|
+
model.layers[il].wo, NULL,
|
|
8283
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8284
|
+
}
|
|
7950
8285
|
|
|
7951
|
-
|
|
7952
|
-
|
|
8286
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
8287
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
8288
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
8289
|
+
}
|
|
7953
8290
|
|
|
7954
|
-
|
|
7955
|
-
|
|
7956
|
-
|
|
8291
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
8292
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
8293
|
+
|
|
8294
|
+
// feed-forward network
|
|
8295
|
+
cur = build_norm(ffn_inp,
|
|
8296
|
+
model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
|
|
8297
|
+
LLM_NORM, il);
|
|
8298
|
+
cb(cur, "ffn_norm", il);
|
|
8299
|
+
|
|
8300
|
+
cur = build_ffn(cur,
|
|
8301
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
8302
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
|
8303
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
8304
|
+
NULL,
|
|
8305
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
8306
|
+
cb(cur, "ffn_out", il);
|
|
8307
|
+
|
|
8308
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
8309
|
+
|
|
8310
|
+
cur = build_cvec(cur, il);
|
|
8311
|
+
cb(cur, "l_out", il);
|
|
8312
|
+
|
|
8313
|
+
// input for next layer
|
|
8314
|
+
inpL = cur;
|
|
8315
|
+
}
|
|
7957
8316
|
|
|
7958
|
-
|
|
8317
|
+
cur = inpL;
|
|
7959
8318
|
|
|
7960
|
-
|
|
7961
|
-
|
|
7962
|
-
|
|
8319
|
+
cur = build_norm(cur,
|
|
8320
|
+
model.output_norm, model.output_norm_b,
|
|
8321
|
+
LLM_NORM, -1);
|
|
7963
8322
|
|
|
7964
|
-
|
|
7965
|
-
|
|
8323
|
+
cb(cur, "result_norm", -1);
|
|
8324
|
+
res->t_embd = cur;
|
|
7966
8325
|
|
|
7967
|
-
|
|
7968
|
-
|
|
8326
|
+
// lm_head
|
|
8327
|
+
cur = build_lora_mm(model.output, cur);
|
|
7969
8328
|
|
|
7970
|
-
|
|
7971
|
-
|
|
8329
|
+
cb(cur, "result_output", -1);
|
|
8330
|
+
res->t_logits = cur;
|
|
7972
8331
|
|
|
7973
|
-
|
|
8332
|
+
ggml_build_forward_expand(gf, cur);
|
|
7974
8333
|
}
|
|
7975
8334
|
};
|
|
7976
8335
|
|
|
@@ -7991,6 +8350,8 @@ struct llm_build_internlm2 : public llm_graph_context {
|
|
|
7991
8350
|
|
|
7992
8351
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
7993
8352
|
|
|
8353
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8354
|
+
|
|
7994
8355
|
for (int il = 0; il < n_layer; ++il) {
|
|
7995
8356
|
ggml_tensor * inpSA = inpL;
|
|
7996
8357
|
|
|
@@ -8049,9 +8410,7 @@ struct llm_build_internlm2 : public llm_graph_context {
|
|
|
8049
8410
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8050
8411
|
}
|
|
8051
8412
|
|
|
8052
|
-
if (il == n_layer - 1) {
|
|
8053
|
-
// skip computing output for unused tokens
|
|
8054
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8413
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
8055
8414
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
8056
8415
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
8057
8416
|
}
|
|
@@ -8127,6 +8486,8 @@ struct llm_build_minicpm3 : public llm_graph_context {
|
|
|
8127
8486
|
|
|
8128
8487
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
8129
8488
|
|
|
8489
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8490
|
+
|
|
8130
8491
|
for (int il = 0; il < n_layer; ++il) {
|
|
8131
8492
|
ggml_tensor * inpSA = inpL;
|
|
8132
8493
|
|
|
@@ -8246,15 +8607,13 @@ struct llm_build_minicpm3 : public llm_graph_context {
|
|
|
8246
8607
|
q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
|
|
8247
8608
|
}
|
|
8248
8609
|
|
|
8249
|
-
if (il == n_layer - 1) {
|
|
8250
|
-
// skip computing output for unused tokens
|
|
8251
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8610
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
8252
8611
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
8253
8612
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
8254
8613
|
}
|
|
8255
8614
|
|
|
8256
8615
|
// scale_res - scale the hidden states for residual connection
|
|
8257
|
-
const float scale_res = scale_depth/sqrtf(float(n_layer));
|
|
8616
|
+
const float scale_res = scale_depth/sqrtf(float(n_layer)); // TODO: is this correct?
|
|
8258
8617
|
cur = ggml_scale(ctx0, cur, scale_res);
|
|
8259
8618
|
cb(cur, "hidden_scaled", il);
|
|
8260
8619
|
|
|
@@ -8331,6 +8690,8 @@ struct llm_build_gemma : public llm_graph_context {
|
|
|
8331
8690
|
|
|
8332
8691
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
8333
8692
|
|
|
8693
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8694
|
+
|
|
8334
8695
|
for (int il = 0; il < n_layer; ++il) {
|
|
8335
8696
|
// norm
|
|
8336
8697
|
cur = build_norm(inpL,
|
|
@@ -8376,9 +8737,7 @@ struct llm_build_gemma : public llm_graph_context {
|
|
|
8376
8737
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
8377
8738
|
}
|
|
8378
8739
|
|
|
8379
|
-
if (il == n_layer - 1) {
|
|
8380
|
-
// skip computing output for unused tokens
|
|
8381
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8740
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
8382
8741
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
8383
8742
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
8384
8743
|
}
|
|
@@ -8447,6 +8806,8 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
|
|
|
8447
8806
|
|
|
8448
8807
|
auto * inp_attn = build_attn_inp_kv_unified_iswa();
|
|
8449
8808
|
|
|
8809
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8810
|
+
|
|
8450
8811
|
for (int il = 0; il < n_layer; ++il) {
|
|
8451
8812
|
// norm
|
|
8452
8813
|
cur = build_norm(inpL,
|
|
@@ -8484,32 +8845,23 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
|
|
|
8484
8845
|
cb(Kcur, "Kcur", il);
|
|
8485
8846
|
cb(Vcur, "Vcur", il);
|
|
8486
8847
|
|
|
8487
|
-
|
|
8488
|
-
switch (model.type) {
|
|
8489
|
-
case LLM_TYPE_2B:
|
|
8490
|
-
case LLM_TYPE_9B:
|
|
8491
|
-
case LLM_TYPE_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); break;
|
|
8492
|
-
default: GGML_ABORT("fatal error");
|
|
8493
|
-
};
|
|
8494
|
-
cb(Qcur, "Qcur_scaled", il);
|
|
8848
|
+
Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
|
|
8495
8849
|
|
|
8496
8850
|
cur = build_attn(inp_attn, gf,
|
|
8497
8851
|
model.layers[il].wo, NULL,
|
|
8498
8852
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
8499
8853
|
}
|
|
8500
8854
|
|
|
8855
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
8856
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
8857
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
8858
|
+
}
|
|
8859
|
+
|
|
8501
8860
|
cur = build_norm(cur,
|
|
8502
8861
|
model.layers[il].attn_post_norm, NULL,
|
|
8503
8862
|
LLM_NORM_RMS, il);
|
|
8504
8863
|
cb(cur, "attn_post_norm", il);
|
|
8505
8864
|
|
|
8506
|
-
if (il == n_layer - 1) {
|
|
8507
|
-
// skip computing output for unused tokens
|
|
8508
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8509
|
-
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
8510
|
-
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
8511
|
-
}
|
|
8512
|
-
|
|
8513
8865
|
ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
|
|
8514
8866
|
cb(sa_out, "sa_out", il);
|
|
8515
8867
|
|
|
@@ -8588,6 +8940,8 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
|
|
|
8588
8940
|
// TODO: is causal == true correct? might need some changes
|
|
8589
8941
|
auto * inp_attn = build_attn_inp_kv_unified_iswa();
|
|
8590
8942
|
|
|
8943
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8944
|
+
|
|
8591
8945
|
for (int il = 0; il < n_layer; ++il) {
|
|
8592
8946
|
const float freq_base_l = model.get_rope_freq_base (cparams, il);
|
|
8593
8947
|
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
|
|
@@ -8632,9 +8986,17 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
|
|
|
8632
8986
|
cb(Kcur, "Kcur", il);
|
|
8633
8987
|
cb(Vcur, "Vcur", il);
|
|
8634
8988
|
|
|
8989
|
+
// ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
|
|
8990
|
+
Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
|
|
8991
|
+
|
|
8635
8992
|
cur = build_attn(inp_attn, gf,
|
|
8636
8993
|
model.layers[il].wo, NULL,
|
|
8637
|
-
Qcur, Kcur, Vcur, nullptr, nullptr,
|
|
8994
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
8995
|
+
}
|
|
8996
|
+
|
|
8997
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
8998
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
8999
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
8638
9000
|
}
|
|
8639
9001
|
|
|
8640
9002
|
cur = build_norm(cur,
|
|
@@ -8642,13 +9004,6 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
|
|
|
8642
9004
|
LLM_NORM_RMS, il);
|
|
8643
9005
|
cb(cur, "attn_post_norm", il);
|
|
8644
9006
|
|
|
8645
|
-
if (il == n_layer - 1) {
|
|
8646
|
-
// skip computing output for unused tokens
|
|
8647
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8648
|
-
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
8649
|
-
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
8650
|
-
}
|
|
8651
|
-
|
|
8652
9007
|
ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
|
|
8653
9008
|
cb(sa_out, "sa_out", il);
|
|
8654
9009
|
|
|
@@ -8701,8 +9056,444 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
|
|
|
8701
9056
|
}
|
|
8702
9057
|
};
|
|
8703
9058
|
|
|
8704
|
-
|
|
8705
|
-
|
|
9059
|
+
struct llm_build_gemma3n_iswa : public llm_graph_context {
|
|
9060
|
+
const llama_model & model;
|
|
9061
|
+
ggml_cgraph * gf;
|
|
9062
|
+
|
|
9063
|
+
const int64_t n_embd_head;
|
|
9064
|
+
const int64_t n_embd_altup;
|
|
9065
|
+
const int64_t n_altup;
|
|
9066
|
+
const int i_altup_act;
|
|
9067
|
+
const int n_layer_kv = 20; // number of layers having KV [KV_REUSE]
|
|
9068
|
+
const int n_layer_sparsity = 10; // number of layers using activation sparsity
|
|
9069
|
+
const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
|
|
9070
|
+
|
|
9071
|
+
ggml_tensor * one; // containing single element 1.0f
|
|
9072
|
+
|
|
9073
|
+
llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf)
|
|
9074
|
+
: llm_graph_context(params),
|
|
9075
|
+
model(model),
|
|
9076
|
+
gf(gf),
|
|
9077
|
+
n_embd_head(model.hparams.n_embd_head_k),
|
|
9078
|
+
n_embd_altup(model.hparams.n_embd_altup),
|
|
9079
|
+
n_altup(model.hparams.n_altup),
|
|
9080
|
+
i_altup_act(model.hparams.i_altup_act) {
|
|
9081
|
+
ggml_tensor * cur;
|
|
9082
|
+
ggml_tensor * inpL;
|
|
9083
|
+
|
|
9084
|
+
// TODO: remove this when ggml_scale_add is implemented
|
|
9085
|
+
one = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
|
9086
|
+
{
|
|
9087
|
+
auto inp = std::make_unique<llm_graph_input_one>();
|
|
9088
|
+
inp->one = one;
|
|
9089
|
+
res->add_input(std::move(inp));
|
|
9090
|
+
}
|
|
9091
|
+
|
|
9092
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
9093
|
+
|
|
9094
|
+
// important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
|
|
9095
|
+
if (ubatch.token) {
|
|
9096
|
+
inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
|
|
9097
|
+
cb(inpL, "inp_scaled", -1);
|
|
9098
|
+
}
|
|
9099
|
+
|
|
9100
|
+
// inp_pos - contains the positions
|
|
9101
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
9102
|
+
|
|
9103
|
+
// TODO: is causal == true correct? might need some changes
|
|
9104
|
+
auto * inp_attn = build_attn_inp_kv_unified_iswa();
|
|
9105
|
+
|
|
9106
|
+
// inp_per_layer shape: [n_embd_altup, n_tokens, n_layer]
|
|
9107
|
+
ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs());
|
|
9108
|
+
|
|
9109
|
+
// inpL now has only 1 altup, project it to the rest of the altups
|
|
9110
|
+
// these "added" altups will be concat to the last dim of inpL
|
|
9111
|
+
{
|
|
9112
|
+
ggml_tensor * target_magnitude = calc_magnitude(inpL);
|
|
9113
|
+
ggml_tensor * inp_repeated = ggml_repeat_4d(ctx0, inpL, n_embd, n_tokens, n_altup - 1, 1);
|
|
9114
|
+
ggml_tensor * altup_added = ggml_mul_mat(ctx0, model.altup_proj, inp_repeated); // shape: [n_embd, n_tokens, n_altup - 1]
|
|
9115
|
+
ggml_tensor * new_magnitude = calc_magnitude(altup_added);
|
|
9116
|
+
altup_added = ggml_div(ctx0,
|
|
9117
|
+
ggml_mul(ctx0, altup_added, target_magnitude),
|
|
9118
|
+
new_magnitude);
|
|
9119
|
+
inpL = ggml_concat(ctx0, inpL, altup_added, 2); // shape: [n_embd, n_tokens, n_altup]
|
|
9120
|
+
cb(inpL, "inp_stacked", -1);
|
|
9121
|
+
}
|
|
9122
|
+
|
|
9123
|
+
// inpL now has shape: [n_embd, n_tokens, n_altup]
|
|
9124
|
+
// inp_per_layer now has shape: [n_embd_altup, n_tokens, n_layer]
|
|
9125
|
+
|
|
9126
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
9127
|
+
// this block is made to be closely resemble Gemma3p5DecoderLayer on python code
|
|
9128
|
+
const bool has_kv = (il < n_layer_kv);
|
|
9129
|
+
|
|
9130
|
+
const float freq_base_l = model.get_rope_freq_base (cparams, il);
|
|
9131
|
+
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
|
|
9132
|
+
|
|
9133
|
+
ggml_tensor * cur = inpL; // [n_embd, n_tokens, n_altup]
|
|
9134
|
+
ggml_tensor * predictions = altup_predict(cur, il); // [n_embd, n_tokens, n_altup]
|
|
9135
|
+
|
|
9136
|
+
// predicted value will go through self-attention and laurel
|
|
9137
|
+
ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act); // [n_embd, n_tokens]
|
|
9138
|
+
cur = active_prediction;
|
|
9139
|
+
cb(cur, "active_prediction", il);
|
|
9140
|
+
|
|
9141
|
+
// norm
|
|
9142
|
+
cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
|
9143
|
+
cb(cur, "attn_norm", il);
|
|
9144
|
+
|
|
9145
|
+
// laurel
|
|
9146
|
+
ggml_tensor * laurel_out = laurel(cur, il); // [n_embd, n_tokens]
|
|
9147
|
+
|
|
9148
|
+
// self-attention
|
|
9149
|
+
if (has_kv) {
|
|
9150
|
+
// compute Q and K and RoPE them
|
|
9151
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
9152
|
+
cb(Qcur, "Qcur", il);
|
|
9153
|
+
|
|
9154
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
9155
|
+
cb(Kcur, "Kcur", il);
|
|
9156
|
+
|
|
9157
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
9158
|
+
cb(Vcur, "Vcur", il);
|
|
9159
|
+
|
|
9160
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
9161
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
9162
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
9163
|
+
|
|
9164
|
+
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
|
9165
|
+
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
|
9166
|
+
Vcur = ggml_rms_norm(ctx0, Vcur, hparams.f_norm_rms_eps);
|
|
9167
|
+
|
|
9168
|
+
cb(Qcur, "Qcur_normed", il);
|
|
9169
|
+
cb(Kcur, "Kcur_normed", il);
|
|
9170
|
+
cb(Vcur, "Vcur_normed", il);
|
|
9171
|
+
|
|
9172
|
+
Qcur = ggml_rope_ext(
|
|
9173
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
9174
|
+
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
|
|
9175
|
+
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
9176
|
+
|
|
9177
|
+
Kcur = ggml_rope_ext(
|
|
9178
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
9179
|
+
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
|
|
9180
|
+
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
9181
|
+
|
|
9182
|
+
cb(Qcur, "Qcur_pos", il);
|
|
9183
|
+
cb(Kcur, "Kcur_pos", il);
|
|
9184
|
+
|
|
9185
|
+
cur = build_attn(inp_attn, gf,
|
|
9186
|
+
model.layers[il].wo, NULL,
|
|
9187
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
|
|
9188
|
+
} else {
|
|
9189
|
+
// no KV layers
|
|
9190
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
9191
|
+
cb(Qcur, "Qcur", il);
|
|
9192
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
9193
|
+
|
|
9194
|
+
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
|
9195
|
+
cb(Qcur, "Qcur_normed", il);
|
|
9196
|
+
|
|
9197
|
+
Qcur = ggml_rope_ext(
|
|
9198
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
9199
|
+
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
|
|
9200
|
+
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
9201
|
+
cb(Qcur, "Qcur_pos", il);
|
|
9202
|
+
|
|
9203
|
+
cur = build_attn(inp_attn, gf,
|
|
9204
|
+
model.layers[il].wo, NULL,
|
|
9205
|
+
Qcur, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
|
|
9206
|
+
}
|
|
9207
|
+
|
|
9208
|
+
cur = build_norm(cur,
|
|
9209
|
+
model.layers[il].attn_post_norm, NULL,
|
|
9210
|
+
LLM_NORM_RMS, il);
|
|
9211
|
+
cb(cur, "attn_post_norm", il);
|
|
9212
|
+
|
|
9213
|
+
cur = ggml_add(ctx0, cur, active_prediction); // [n_embd, n_tokens]
|
|
9214
|
+
cb(cur, "attn_gated", il);
|
|
9215
|
+
|
|
9216
|
+
ggml_tensor * attn_laurel = ggml_scale(ctx0,
|
|
9217
|
+
ggml_add(ctx0, cur, laurel_out),
|
|
9218
|
+
1.0f / sqrtf(2.0f)); // [n_embd, n_tokens]
|
|
9219
|
+
cb(attn_laurel, "attn_laurel", il);
|
|
9220
|
+
|
|
9221
|
+
cur = build_norm(attn_laurel,
|
|
9222
|
+
model.layers[il].ffn_norm, NULL,
|
|
9223
|
+
LLM_NORM_RMS, il);
|
|
9224
|
+
cb(cur, "ffn_norm", il);
|
|
9225
|
+
|
|
9226
|
+
// feed-forward network
|
|
9227
|
+
{
|
|
9228
|
+
ggml_tensor * up_proj = build_lora_mm(model.layers[il].ffn_up, cur);
|
|
9229
|
+
ggml_tensor * gate_proj = build_lora_mm(model.layers[il].ffn_gate, cur);
|
|
9230
|
+
|
|
9231
|
+
if (il < n_layer_sparsity) {
|
|
9232
|
+
// apply activation sparsity
|
|
9233
|
+
gate_proj = gaussian_topk(gate_proj);
|
|
9234
|
+
}
|
|
9235
|
+
gate_proj = ggml_gelu(ctx0, gate_proj);
|
|
9236
|
+
|
|
9237
|
+
cur = ggml_mul(ctx0, up_proj, gate_proj);
|
|
9238
|
+
cur = build_lora_mm(model.layers[il].ffn_down, cur);
|
|
9239
|
+
cb(cur, "ffn_out", il);
|
|
9240
|
+
}
|
|
9241
|
+
|
|
9242
|
+
cur = build_norm(cur,
|
|
9243
|
+
model.layers[il].ffn_post_norm, NULL,
|
|
9244
|
+
LLM_NORM_RMS, -1);
|
|
9245
|
+
cb(cur, "ffn_post_norm", il);
|
|
9246
|
+
|
|
9247
|
+
ggml_tensor * attn_ffw_laurel_gated = ggml_add(ctx0, cur, attn_laurel); // [n_embd, n_tokens]
|
|
9248
|
+
cb(attn_ffw_laurel_gated, "attn_ffw_laurel_gated", il);
|
|
9249
|
+
|
|
9250
|
+
ggml_tensor * corrected = altup_correct(predictions, attn_ffw_laurel_gated, il); // [n_embd, n_tokens, n_altup]
|
|
9251
|
+
|
|
9252
|
+
ggml_tensor * first_prediction; // [n_embd, n_tokens]
|
|
9253
|
+
{
|
|
9254
|
+
first_prediction = view_2d_slice(corrected, i_altup_act); // [n_embd, n_tokens]
|
|
9255
|
+
first_prediction = ggml_mul(ctx0, first_prediction, model.layers[il].altup_correct_scale);
|
|
9256
|
+
first_prediction = build_lora_mm(model.layers[il].per_layer_inp_gate, first_prediction);
|
|
9257
|
+
first_prediction = ggml_gelu(ctx0, first_prediction); // [n_embd_altup, n_tokens]
|
|
9258
|
+
cb(first_prediction, "first_prediction_gated", il);
|
|
9259
|
+
ggml_tensor * inp_this_layer = view_2d_slice(inp_per_layer, il); // [n_embd_altup, n_tokens]
|
|
9260
|
+
first_prediction = ggml_mul(ctx0, first_prediction, inp_this_layer); // [n_embd_altup, n_tokens]
|
|
9261
|
+
cb(first_prediction, "first_prediction_scaled", il);
|
|
9262
|
+
|
|
9263
|
+
first_prediction = build_lora_mm(model.layers[il].per_layer_proj, first_prediction); // [n_embd, n_tokens]
|
|
9264
|
+
first_prediction = build_norm(first_prediction,
|
|
9265
|
+
model.layers[il].per_layer_post_norm, NULL,
|
|
9266
|
+
LLM_NORM_RMS, il);
|
|
9267
|
+
cb(first_prediction, "first_prediction_out", il);
|
|
9268
|
+
}
|
|
9269
|
+
|
|
9270
|
+
// equivalent to python code: corrected_predictions[1:] += first_prediction
|
|
9271
|
+
{
|
|
9272
|
+
ggml_tensor * slice_first = view_2d_slice(corrected, 0);
|
|
9273
|
+
ggml_tensor * slice_rest = ggml_view_3d(ctx0, corrected, n_embd, n_tokens, n_altup - 1,
|
|
9274
|
+
ggml_row_size(corrected->type, n_embd),
|
|
9275
|
+
ggml_row_size(corrected->type, n_embd*n_tokens),
|
|
9276
|
+
n_embd*n_tokens*ggml_element_size(corrected));
|
|
9277
|
+
ggml_tensor * tmp = ggml_add(ctx0, slice_rest, first_prediction); // [n_embd, n_tokens, n_altup - 1]
|
|
9278
|
+
corrected = ggml_concat(ctx0, slice_first, tmp, 2); // [n_embd, n_tokens, n_altup]
|
|
9279
|
+
}
|
|
9280
|
+
|
|
9281
|
+
cur = corrected; // [n_embd, n_tokens, n_altup]
|
|
9282
|
+
cur = build_cvec(cur, il);
|
|
9283
|
+
cb(cur, "l_out", il);
|
|
9284
|
+
|
|
9285
|
+
// input for next layer
|
|
9286
|
+
inpL = cur;
|
|
9287
|
+
}
|
|
9288
|
+
|
|
9289
|
+
cur = inpL; // [n_embd, n_tokens, n_altup]
|
|
9290
|
+
|
|
9291
|
+
// cur now has multiple altup(s), we want to merge them back to 1 altup
|
|
9292
|
+
{
|
|
9293
|
+
ggml_tensor * target_magnitude = calc_magnitude(view_2d_slice(cur, i_altup_act)); // [n_embd, n_tokens]
|
|
9294
|
+
// do a view to skip the first slice (active altup)
|
|
9295
|
+
ggml_tensor * alt_slice = ggml_view_3d(ctx0, cur, n_embd, n_tokens, n_altup - 1,
|
|
9296
|
+
ggml_row_size(cur->type, n_embd),
|
|
9297
|
+
ggml_row_size(cur->type, n_embd*n_tokens),
|
|
9298
|
+
n_embd*n_tokens*ggml_element_size(cur));
|
|
9299
|
+
ggml_tensor * altup_unembd = ggml_mul_mat(ctx0, model.altup_unembd_proj, alt_slice); // shape: [n_embd, n_tokens, n_altup - 1]
|
|
9300
|
+
ggml_tensor * new_magnitude = calc_magnitude(altup_unembd);
|
|
9301
|
+
altup_unembd = ggml_div(ctx0,
|
|
9302
|
+
ggml_mul(ctx0, altup_unembd, target_magnitude),
|
|
9303
|
+
new_magnitude);
|
|
9304
|
+
cb(altup_unembd, "altup_unembd", -1);
|
|
9305
|
+
|
|
9306
|
+
// equivalent to torch.mean(hidden_states, dim=0)
|
|
9307
|
+
cur = view_2d_slice(cur, 0); // [n_embd, n_tokens]
|
|
9308
|
+
for (int i = 0; i < n_altup - 1; ++i) {
|
|
9309
|
+
cur = ggml_add(ctx0, cur, view_2d_slice(altup_unembd, i));
|
|
9310
|
+
}
|
|
9311
|
+
cur = ggml_scale(ctx0, cur, 1.0f / float(n_altup)); // [n_embd, n_tokens]
|
|
9312
|
+
cb(cur, "unembd_merged", -1);
|
|
9313
|
+
}
|
|
9314
|
+
|
|
9315
|
+
// cur now has shape: [n_embd, n_tokens]
|
|
9316
|
+
|
|
9317
|
+
// TODO: move this to right after the last KV layer
|
|
9318
|
+
{
|
|
9319
|
+
// skip computing output for unused tokens
|
|
9320
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
9321
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
9322
|
+
}
|
|
9323
|
+
|
|
9324
|
+
cur = build_norm(cur,
|
|
9325
|
+
model.output_norm, NULL,
|
|
9326
|
+
LLM_NORM_RMS, -1);
|
|
9327
|
+
|
|
9328
|
+
cb(cur, "result_norm", -1);
|
|
9329
|
+
res->t_embd = cur;
|
|
9330
|
+
|
|
9331
|
+
cur = build_lora_mm(model.output, cur);
|
|
9332
|
+
|
|
9333
|
+
{
|
|
9334
|
+
// final logit soft-capping
|
|
9335
|
+
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
|
|
9336
|
+
cur = ggml_tanh(ctx0, cur);
|
|
9337
|
+
cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
|
|
9338
|
+
}
|
|
9339
|
+
|
|
9340
|
+
cb(cur, "result_output", -1);
|
|
9341
|
+
res->t_logits = cur;
|
|
9342
|
+
|
|
9343
|
+
ggml_build_forward_expand(gf, cur);
|
|
9344
|
+
}
|
|
9345
|
+
|
|
9346
|
+
// Magnitude of x along its first dimension: square every element, reduce
// with ggml_sum_rows, then take the square root of the reduced values.
ggml_tensor * calc_magnitude(ggml_tensor * x) {
    ggml_tensor * squared    = ggml_sqr(ctx0, x);
    ggml_tensor * sum_of_sqr = ggml_sum_rows(ctx0, squared);
    return ggml_sqrt(ctx0, sum_of_sqr);
}
|
|
9349
|
+
|
|
9350
|
+
// get 2D slice view from a 3D tensor, the idx corresponds to the 3rd dim
|
|
9351
|
+
// Returns a 2D view of shape [x->ne[0], x->ne[1]] onto the idx-th slice of
// the 3rd dimension of x. No data is copied.
//
// idx must lie in [0, x->ne[2]). The lower bound is checked as well as the
// upper one: a negative idx would otherwise be converted into a huge unsigned
// byte offset and the view would point far outside the tensor's data.
//
// The byte offset is derived from ne[0] * ne[1] * element size (not from
// x->nb[2]), so the slices of x are expected to be densely packed.
ggml_tensor * view_2d_slice(ggml_tensor * x, int idx) {
    GGML_ASSERT(idx >= 0 && idx < (int)x->ne[2]);

    const size_t row_stride   = ggml_row_size(x->type, x->ne[0]);
    const size_t slice_offset = idx * x->ne[0] * x->ne[1] * ggml_element_size(x);

    return ggml_view_2d(ctx0, x, x->ne[0], x->ne[1], row_stride, slice_offset);
}
|
|
9357
|
+
|
|
9358
|
+
// equivalent to get_per_layer_inputs() in python code
|
|
9359
|
+
// output shape: [n_embd_altup, n_layer, n_tokens]
|
|
9360
|
+
// Builds the per-layer token embedding input.
//
// Registers an I32 token-id input tensor (also published as res->t_tokens),
// gathers the matching rows of model.tok_embd_per_layer, reshapes them to
// [n_embd_altup, n_layer, n_tokens] and scales by sqrt(n_embd_altup).
// Only token input is supported; a batch without token ids aborts.
ggml_tensor * get_per_layer_inputs() {
    auto inp = std::make_unique<llm_graph_input_embd>();

    if (!ubatch.token) {
        GGML_ABORT("TODO: support embd input");
    }

    // token ids are supplied by the caller at evaluation time
    inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
    ggml_set_input(inp->tokens);
    res->t_tokens = inp->tokens;

    ggml_tensor * per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens);
    per_layer = ggml_reshape_3d(ctx0, per_layer, n_embd_altup, n_layer, n_tokens);
    per_layer = ggml_scale(ctx0, per_layer, sqrtf((float)n_embd_altup));
    cb(per_layer, "inp_per_layer_selected", -1);

    // hand ownership of the input descriptor over to the graph result
    res->add_input(std::move(inp));
    return per_layer;
}
|
|
9377
|
+
|
|
9378
|
+
// equivalent to project_per_layer_inputs() in python code
|
|
9379
|
+
// this calculates the per-layer inputs, so the final tensor shape will have n_layer as the last dim
|
|
9380
|
+
// output shape: [n_embd_altup, n_tokens, n_layer]
|
|
9381
|
+
// Combines the projected token embeddings with the per-layer embeddings.
//
// inputs_embeds is projected through model.per_layer_model_proj, scaled by
// 1/sqrt(n_embd), reshaped to [n_embd_altup, n_layer, n_tokens] and
// RMS-normalized. The result is added to inp_per_layer, the sum is scaled by
// 1/sqrt(2), and the last two dims are swapped so that the returned
// (contiguous) tensor is [n_embd_altup, n_tokens, n_layer].
ggml_tensor * project_per_layer_inputs(ggml_tensor * inputs_embeds, ggml_tensor * inp_per_layer) {
    const float proj_scale  = 1.0f / sqrtf((float)n_embd);
    const float input_scale = 1.0f / sqrtf(2.0f);

    ggml_tensor * projected = ggml_mul_mat(ctx0, model.per_layer_model_proj, inputs_embeds);
    projected = ggml_scale(ctx0, projected, proj_scale);
    projected = ggml_reshape_3d(ctx0, projected, n_embd_altup, n_layer, n_tokens);
    projected = build_norm(projected, model.per_layer_proj_norm, NULL, LLM_NORM_RMS, -1);
    cb(projected, "per_layer_proj", -1);

    ggml_tensor * combined = ggml_add(ctx0, inp_per_layer, projected);
    combined = ggml_scale(ctx0, combined, input_scale);
    cb(combined, "inp_per_layer", -1);

    // swap dims 1 and 2 so the layer index becomes the outermost dimension
    ggml_tensor * layer_last = ggml_permute(ctx0, combined, 0, 2, 1, 3);
    return ggml_cont(ctx0, layer_last);
}
|
|
9401
|
+
|
|
9402
|
+
// input cur shape: [n_embd, n_tokens]
|
|
9403
|
+
// output shape: [n_embd, n_tokens]
|
|
9404
|
+
// LAuReL branch for layer il: cur is passed through the layer's laurel_l and
// laurel_r projections, RMS-normalized with laurel_post_norm, and the result
// is added back onto cur.
ggml_tensor * laurel(ggml_tensor * cur, int il) {
    const auto & layer = model.layers[il];

    ggml_tensor * branch = build_lora_mm(layer.laurel_l, cur);
    branch = build_lora_mm(layer.laurel_r, branch);
    branch = build_norm(branch, layer.laurel_post_norm, NULL, LLM_NORM_RMS, il);

    ggml_tensor * out = ggml_add(ctx0, branch, cur);
    cb(out, "laurel_out", il);
    return out;
}
|
|
9413
|
+
|
|
9414
|
+
// input x shape: [n_embd, n_tokens]
|
|
9415
|
+
// output shape: [n_embd, n_tokens]
|
|
9416
|
+
// Activation sparsity: keeps only the part of x that exceeds a statistical
// cutoff. The cutoff is mean + f_sparsity_std_mul * std, where std uses the
// (ne[0] - 1) divisor; the result is relu(x - cutoff).
ggml_tensor * gaussian_topk(ggml_tensor * x) {
    ggml_tensor * mu = ggml_mean(ctx0, x);

    // sample standard deviation: sqrt(sum((x - mu)^2) / (n - 1))
    ggml_tensor * centered   = ggml_sub(ctx0, x, mu);
    ggml_tensor * sum_sq_dev = ggml_sum_rows(ctx0, ggml_sqr(ctx0, centered));
    const float   inv_dof    = 1.0f / (float)(x->ne[0] - 1);
    ggml_tensor * sigma      = ggml_sqrt(ctx0, ggml_scale(ctx0, sum_sq_dev, inv_dof));

    ggml_tensor * threshold = ggml_add(ctx0, mu, ggml_scale(ctx0, sigma, f_sparsity_std_mul));
    return ggml_relu(ctx0, ggml_sub(ctx0, x, threshold));
}
|
|
9425
|
+
|
|
9426
|
+
//
|
|
9427
|
+
// altup functions
|
|
9428
|
+
//
|
|
9429
|
+
|
|
9430
|
+
// equivalent to compute_router_modalities() in python code
|
|
9431
|
+
// input x shape: [n_embd, n_tokens]
|
|
9432
|
+
// output shape: [n_altup, n_tokens]
|
|
9433
|
+
// AltUp router for layer il: RMS-normalizes x with altup_router_norm, scales
// it by 1/n_embd, multiplies by the layer's altup_router weights and squashes
// the result with tanh.
ggml_tensor * altup_compute_router_modalities(ggml_tensor * x, int il) {
    const auto & layer = model.layers[il];
    const float router_input_scale = 1.0f / (float)n_embd;

    ggml_tensor * normed = build_norm(x, layer.altup_router_norm, NULL, LLM_NORM_RMS, il);
    ggml_tensor * scaled = ggml_scale(ctx0, normed, router_input_scale);
    ggml_tensor * routed = ggml_mul_mat(ctx0, layer.altup_router, scaled);

    return ggml_tanh(ctx0, routed); // [n_altup, n_tokens]
}
|
|
9444
|
+
|
|
9445
|
+
// input cur shape: [n_embd, n_tokens, n_altup]
|
|
9446
|
+
// output shape: [n_embd, n_tokens, n_altup]
|
|
9447
|
+
// AltUp prediction step for layer il.
//
// The active slice of cur (index i_altup_act) drives the router; the router
// output is expanded by altup_predict_coef into an n_altup x n_altup
// coefficient matrix per token, which mixes the altup slices of cur. The
// mixed result is added to cur, so the output has the same layout as cur.
ggml_tensor * altup_predict(ggml_tensor * cur, int il) {
    ggml_tensor * active     = view_2d_slice(cur, i_altup_act);
    ggml_tensor * router_out = altup_compute_router_modalities(active, il);
    cb(router_out, "modalities", il);

    ggml_tensor * coefs = build_lora_mm(model.layers[il].altup_predict_coef, router_out);
    cb(coefs, "all_coefs", il);
    // the leading dim holds n_altup^2 values per token; split it in two
    coefs = ggml_reshape_3d(ctx0, coefs, n_altup, n_altup, n_tokens);

    // bring the altup dim to the front so mul_mat contracts over it
    ggml_tensor * cur_altup_first = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
    ggml_tensor * mixed           = ggml_mul_mat(ctx0, cur_altup_first, coefs);

    // swap dims 1 and 2 to get back to the layout of cur: [n_embd, n_tokens, n_altup]
    mixed = ggml_cont(ctx0, ggml_permute(ctx0, mixed, 0, 2, 1, 3));

    ggml_tensor * predictions = ggml_add(ctx0, mixed, cur);
    cb(predictions, "predictions", il);
    return predictions;
}
|
|
9468
|
+
|
|
9469
|
+
// input predictions shape: [n_embd, n_tokens, n_altup]
|
|
9470
|
+
// input activated shape: [n_embd, n_tokens]
|
|
9471
|
+
// output shape: [n_embd, n_tokens, n_altup]
|
|
9472
|
+
// AltUp correction step for layer il.
//
// innovation = activated - (active slice of predictions). Per-token, per-altup
// coefficients come from the router output through altup_correct_coef, plus
// one. Each altup slice of predictions is then corrected by
// innovation * coefficient.
ggml_tensor * altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il) {
    ggml_tensor * router_out = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens]
    cb(router_out, "modalities", il);

    ggml_tensor * predicted_active = view_2d_slice(predictions, i_altup_act);
    ggml_tensor * innovation       = ggml_sub(ctx0, activated, predicted_active); // [n_embd, n_tokens]
    cb(innovation, "innovation", il);

    ggml_tensor * coefs = build_lora_mm(model.layers[il].altup_correct_coef, router_out); // [n_altup, n_tokens]
    coefs = ggml_add(ctx0, coefs, one);
    cb(coefs, "all_coefs", il);
    // rearrange to [1, n_tokens, n_altup] so it broadcasts over the embedding dim
    coefs = ggml_cont(ctx0, ggml_transpose(ctx0, coefs));
    coefs = ggml_reshape_3d(ctx0, coefs, 1, n_tokens, n_altup);

    // replicate the innovation once per altup slice
    ggml_tensor * innovation_rep = ggml_repeat_4d(ctx0, innovation, n_embd, n_tokens, n_altup, 1);

    ggml_tensor * corrected = ggml_mul(ctx0, innovation_rep, coefs); // [n_embd, n_tokens, n_altup]
    corrected = ggml_add(ctx0, corrected, predictions);
    cb(corrected, "corrected", il);
    return corrected;
}
|
|
9493
|
+
};
|
|
9494
|
+
|
|
9495
|
+
// TODO: move up next to build_starcoder
|
|
9496
|
+
struct llm_build_starcoder2 : public llm_graph_context {
|
|
8706
9497
|
llm_build_starcoder2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
8707
9498
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
8708
9499
|
|
|
@@ -8719,6 +9510,8 @@ struct llm_build_starcoder2 : public llm_graph_context {
|
|
|
8719
9510
|
|
|
8720
9511
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
8721
9512
|
|
|
9513
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
9514
|
+
|
|
8722
9515
|
for (int il = 0; il < n_layer; ++il) {
|
|
8723
9516
|
ggml_tensor * inpSA = inpL;
|
|
8724
9517
|
|
|
@@ -8777,9 +9570,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
|
|
|
8777
9570
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8778
9571
|
}
|
|
8779
9572
|
|
|
8780
|
-
if (il == n_layer - 1) {
|
|
8781
|
-
// skip computing output for unused tokens
|
|
8782
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
9573
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
8783
9574
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
8784
9575
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
8785
9576
|
}
|
|
@@ -8840,8 +9631,9 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
8840
9631
|
// {n_embd, n_tokens}
|
|
8841
9632
|
inpL = build_inp_embd(model.tok_embd);
|
|
8842
9633
|
|
|
8843
|
-
|
|
8844
|
-
|
|
9634
|
+
auto * rs_inp = build_rs_inp();
|
|
9635
|
+
|
|
9636
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8845
9637
|
|
|
8846
9638
|
for (int il = 0; il < n_layer; ++il) {
|
|
8847
9639
|
// norm
|
|
@@ -8850,12 +9642,9 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
8850
9642
|
LLM_NORM_RMS, il);
|
|
8851
9643
|
cb(cur, "attn_norm", il);
|
|
8852
9644
|
|
|
8853
|
-
|
|
8854
|
-
cur = build_mamba_layer(gf, cur, state_copy, state_mask, ubatch, il);
|
|
9645
|
+
cur = build_mamba_layer(rs_inp, gf, cur, ubatch, il);
|
|
8855
9646
|
|
|
8856
|
-
if (il == n_layer - 1) {
|
|
8857
|
-
// skip computing output for unused tokens
|
|
8858
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
9647
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
8859
9648
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
8860
9649
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
8861
9650
|
}
|
|
@@ -8889,15 +9678,14 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
8889
9678
|
|
|
8890
9679
|
// TODO: split
|
|
8891
9680
|
ggml_tensor * build_mamba_layer(
|
|
8892
|
-
|
|
8893
|
-
|
|
8894
|
-
|
|
8895
|
-
|
|
8896
|
-
|
|
8897
|
-
|
|
8898
|
-
const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
|
|
9681
|
+
llm_graph_input_rs * inp,
|
|
9682
|
+
ggml_cgraph * gf,
|
|
9683
|
+
ggml_tensor * cur,
|
|
9684
|
+
const llama_ubatch & ubatch,
|
|
9685
|
+
int il) const {
|
|
9686
|
+
const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
|
|
8899
9687
|
|
|
8900
|
-
const auto kv_head =
|
|
9688
|
+
const auto kv_head = mctx_cur->get_head();
|
|
8901
9689
|
|
|
8902
9690
|
const int64_t d_conv = hparams.ssm_d_conv;
|
|
8903
9691
|
const int64_t d_inner = hparams.ssm_d_inner;
|
|
@@ -8915,17 +9703,17 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
8915
9703
|
GGML_ASSERT(ubatch.equal_seqs);
|
|
8916
9704
|
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
|
|
8917
9705
|
|
|
8918
|
-
ggml_tensor * conv_states_all =
|
|
8919
|
-
ggml_tensor * ssm_states_all =
|
|
9706
|
+
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
|
|
9707
|
+
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
|
|
8920
9708
|
|
|
8921
9709
|
// (ab)using the KV cache to store the states
|
|
8922
|
-
ggml_tensor * conv =
|
|
8923
|
-
gf, conv_states_all,
|
|
8924
|
-
hparams.
|
|
9710
|
+
ggml_tensor * conv = build_rs(
|
|
9711
|
+
inp, gf, conv_states_all,
|
|
9712
|
+
hparams.n_embd_r(), n_seqs);
|
|
8925
9713
|
conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
|
|
8926
|
-
ggml_tensor * ssm =
|
|
8927
|
-
gf, ssm_states_all,
|
|
8928
|
-
hparams.
|
|
9714
|
+
ggml_tensor * ssm = build_rs(
|
|
9715
|
+
inp, gf, ssm_states_all,
|
|
9716
|
+
hparams.n_embd_s(), n_seqs);
|
|
8929
9717
|
ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs);
|
|
8930
9718
|
|
|
8931
9719
|
// {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
|
|
@@ -9038,13 +9826,15 @@ struct llm_build_command_r : public llm_graph_context {
|
|
|
9038
9826
|
|
|
9039
9827
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
9040
9828
|
|
|
9041
|
-
|
|
9829
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
9042
9830
|
|
|
9831
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
9043
9832
|
// norm
|
|
9044
9833
|
cur = build_norm(inpL,
|
|
9045
9834
|
model.layers[il].attn_norm, NULL,
|
|
9046
9835
|
LLM_NORM, il);
|
|
9047
9836
|
cb(cur, "attn_norm", il);
|
|
9837
|
+
|
|
9048
9838
|
ggml_tensor * ffn_inp = cur;
|
|
9049
9839
|
|
|
9050
9840
|
// self-attention
|
|
@@ -9112,9 +9902,7 @@ struct llm_build_command_r : public llm_graph_context {
|
|
|
9112
9902
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9113
9903
|
}
|
|
9114
9904
|
|
|
9115
|
-
if (il == n_layer - 1) {
|
|
9116
|
-
// skip computing output for unused tokens
|
|
9117
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
9905
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
9118
9906
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
9119
9907
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
9120
9908
|
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
|
|
@@ -9185,6 +9973,8 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
|
|
|
9185
9973
|
|
|
9186
9974
|
auto * inp_attn = build_attn_inp_kv_unified_iswa();
|
|
9187
9975
|
|
|
9976
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
9977
|
+
|
|
9188
9978
|
for (int il = 0; il < n_layer; ++il) {
|
|
9189
9979
|
const bool is_swa = hparams.is_swa(il);
|
|
9190
9980
|
|
|
@@ -9247,9 +10037,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
|
|
|
9247
10037
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9248
10038
|
}
|
|
9249
10039
|
|
|
9250
|
-
if (il == n_layer - 1) {
|
|
9251
|
-
// skip computing output for unused tokens
|
|
9252
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10040
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
9253
10041
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
9254
10042
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
9255
10043
|
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
|
|
@@ -9320,6 +10108,8 @@ struct llm_build_olmo : public llm_graph_context {
|
|
|
9320
10108
|
|
|
9321
10109
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
9322
10110
|
|
|
10111
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10112
|
+
|
|
9323
10113
|
for (int il = 0; il < n_layer; ++il) {
|
|
9324
10114
|
ggml_tensor * inpSA = inpL;
|
|
9325
10115
|
|
|
@@ -9378,9 +10168,7 @@ struct llm_build_olmo : public llm_graph_context {
|
|
|
9378
10168
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9379
10169
|
}
|
|
9380
10170
|
|
|
9381
|
-
if (il == n_layer - 1) {
|
|
9382
|
-
// skip computing output for unused tokens
|
|
9383
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10171
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
9384
10172
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
9385
10173
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
9386
10174
|
}
|
|
@@ -9448,6 +10236,8 @@ struct llm_build_olmo2 : public llm_graph_context {
|
|
|
9448
10236
|
|
|
9449
10237
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
9450
10238
|
|
|
10239
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10240
|
+
|
|
9451
10241
|
for (int il = 0; il < n_layer; ++il) {
|
|
9452
10242
|
ggml_tensor * inpSA = inpL;
|
|
9453
10243
|
|
|
@@ -9498,18 +10288,16 @@ struct llm_build_olmo2 : public llm_graph_context {
|
|
|
9498
10288
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9499
10289
|
}
|
|
9500
10290
|
|
|
10291
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
10292
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
10293
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
10294
|
+
}
|
|
10295
|
+
|
|
9501
10296
|
cur = build_norm(cur,
|
|
9502
10297
|
model.layers[il].attn_post_norm, NULL,
|
|
9503
10298
|
LLM_NORM_RMS, il);
|
|
9504
10299
|
cb(cur, "attn_post_norm", il);
|
|
9505
10300
|
|
|
9506
|
-
if (il == n_layer - 1) {
|
|
9507
|
-
// skip computing output for unused tokens
|
|
9508
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
9509
|
-
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
9510
|
-
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
9511
|
-
}
|
|
9512
|
-
|
|
9513
10301
|
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
9514
10302
|
cb(ffn_inp, "ffn_inp", il);
|
|
9515
10303
|
|
|
@@ -9577,6 +10365,8 @@ struct llm_build_olmoe : public llm_graph_context {
|
|
|
9577
10365
|
|
|
9578
10366
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
9579
10367
|
|
|
10368
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10369
|
+
|
|
9580
10370
|
for (int il = 0; il < n_layer; ++il) {
|
|
9581
10371
|
ggml_tensor * inpSA = inpL;
|
|
9582
10372
|
|
|
@@ -9631,9 +10421,7 @@ struct llm_build_olmoe : public llm_graph_context {
|
|
|
9631
10421
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9632
10422
|
}
|
|
9633
10423
|
|
|
9634
|
-
if (il == n_layer - 1) {
|
|
9635
|
-
// skip computing output for unused tokens
|
|
9636
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10424
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
9637
10425
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
9638
10426
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
9639
10427
|
}
|
|
@@ -9703,6 +10491,8 @@ struct llm_build_openelm : public llm_graph_context {
|
|
|
9703
10491
|
|
|
9704
10492
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
9705
10493
|
|
|
10494
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10495
|
+
|
|
9706
10496
|
for (int il = 0; il < n_layer; ++il) {
|
|
9707
10497
|
const int64_t n_head = hparams.n_head(il);
|
|
9708
10498
|
const int64_t n_head_kv = hparams.n_head_kv(il);
|
|
@@ -9764,11 +10554,9 @@ struct llm_build_openelm : public llm_graph_context {
|
|
|
9764
10554
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9765
10555
|
}
|
|
9766
10556
|
|
|
9767
|
-
if (il == n_layer - 1) {
|
|
9768
|
-
// skip computing output for unused tokens
|
|
9769
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10557
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
9770
10558
|
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
|
|
9771
|
-
cur
|
|
10559
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
9772
10560
|
}
|
|
9773
10561
|
|
|
9774
10562
|
ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
|
|
@@ -9834,6 +10622,8 @@ struct llm_build_gptneox : public llm_graph_context {
|
|
|
9834
10622
|
|
|
9835
10623
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
9836
10624
|
|
|
10625
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10626
|
+
|
|
9837
10627
|
for (int il = 0; il < n_layer; ++il) {
|
|
9838
10628
|
cur = build_norm(inpL,
|
|
9839
10629
|
model.layers[il].attn_norm,
|
|
@@ -9878,9 +10668,7 @@ struct llm_build_gptneox : public llm_graph_context {
|
|
|
9878
10668
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9879
10669
|
}
|
|
9880
10670
|
|
|
9881
|
-
if (il == n_layer - 1) {
|
|
9882
|
-
// skip computing output for unused tokens
|
|
9883
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10671
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
9884
10672
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
9885
10673
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
9886
10674
|
}
|
|
@@ -9982,6 +10770,8 @@ struct llm_build_arctic : public llm_graph_context {
|
|
|
9982
10770
|
|
|
9983
10771
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
9984
10772
|
|
|
10773
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10774
|
+
|
|
9985
10775
|
for (int il = 0; il < n_layer; ++il) {
|
|
9986
10776
|
ggml_tensor * inpSA = inpL;
|
|
9987
10777
|
|
|
@@ -10028,9 +10818,7 @@ struct llm_build_arctic : public llm_graph_context {
|
|
|
10028
10818
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
10029
10819
|
}
|
|
10030
10820
|
|
|
10031
|
-
if (il == n_layer - 1) {
|
|
10032
|
-
// skip computing output for unused tokens
|
|
10033
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10821
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
10034
10822
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
10035
10823
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
10036
10824
|
}
|
|
@@ -10122,6 +10910,8 @@ struct llm_build_deepseek : public llm_graph_context {
|
|
|
10122
10910
|
|
|
10123
10911
|
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
10124
10912
|
|
|
10913
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10914
|
+
|
|
10125
10915
|
for (int il = 0; il < n_layer; ++il) {
|
|
10126
10916
|
ggml_tensor * inpSA = inpL;
|
|
10127
10917
|
|
|
@@ -10183,14 +10973,11 @@ struct llm_build_deepseek : public llm_graph_context {
|
|
|
10183
10973
|
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
10184
10974
|
}
|
|
10185
10975
|
|
|
10186
|
-
if (il == n_layer - 1) {
|
|
10187
|
-
// skip computing output for unused tokens
|
|
10188
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10976
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
10189
10977
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
10190
10978
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
10191
10979
|
}
|
|
10192
10980
|
|
|
10193
|
-
|
|
10194
10981
|
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
10195
10982
|
cb(ffn_inp, "ffn_inp", il);
|
|
10196
10983
|
|
|
@@ -10298,6 +11085,8 @@ struct llm_build_deepseek2 : public llm_graph_context {
|
|
|
10298
11085
|
|
|
10299
11086
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
10300
11087
|
|
|
11088
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11089
|
+
|
|
10301
11090
|
for (int il = 0; il < n_layer; ++il) {
|
|
10302
11091
|
ggml_tensor * inpSA = inpL;
|
|
10303
11092
|
|
|
@@ -10447,9 +11236,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
|
|
|
10447
11236
|
}
|
|
10448
11237
|
}
|
|
10449
11238
|
|
|
10450
|
-
if (il == n_layer - 1) {
|
|
10451
|
-
// skip computing output for unused tokens
|
|
10452
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11239
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
10453
11240
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
10454
11241
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
10455
11242
|
}
|
|
@@ -10545,6 +11332,8 @@ struct llm_build_bitnet : public llm_graph_context {
|
|
|
10545
11332
|
|
|
10546
11333
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
10547
11334
|
|
|
11335
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11336
|
+
|
|
10548
11337
|
for (int il = 0; il < n_layer; ++il) {
|
|
10549
11338
|
ggml_tensor * inpSA = inpL;
|
|
10550
11339
|
|
|
@@ -10627,9 +11416,7 @@ struct llm_build_bitnet : public llm_graph_context {
|
|
|
10627
11416
|
cb(cur, "attn_o_out", il);
|
|
10628
11417
|
}
|
|
10629
11418
|
|
|
10630
|
-
if (il == n_layer - 1) {
|
|
10631
|
-
// skip computing output for unused tokens
|
|
10632
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11419
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
10633
11420
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
10634
11421
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
10635
11422
|
}
|
|
@@ -10704,6 +11491,8 @@ struct llm_build_t5_enc : public llm_graph_context {
|
|
|
10704
11491
|
|
|
10705
11492
|
auto * inp_attn = build_attn_inp_no_cache();
|
|
10706
11493
|
|
|
11494
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11495
|
+
|
|
10707
11496
|
for (int il = 0; il < n_layer; ++il) {
|
|
10708
11497
|
ggml_tensor * inpSA = inpL;
|
|
10709
11498
|
|
|
@@ -10737,9 +11526,7 @@ struct llm_build_t5_enc : public llm_graph_context {
|
|
|
10737
11526
|
cb(cur, "kqv_out", il);
|
|
10738
11527
|
}
|
|
10739
11528
|
|
|
10740
|
-
if (il == n_layer - 1) {
|
|
10741
|
-
// skip computing output for unused tokens
|
|
10742
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11529
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
10743
11530
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
10744
11531
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
10745
11532
|
}
|
|
@@ -10810,6 +11597,8 @@ struct llm_build_t5_dec : public llm_graph_context {
|
|
|
10810
11597
|
auto * inp_attn_self = build_attn_inp_kv_unified();
|
|
10811
11598
|
auto * inp_attn_cross = build_attn_inp_cross();
|
|
10812
11599
|
|
|
11600
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11601
|
+
|
|
10813
11602
|
for (int il = 0; il < n_layer; ++il) {
|
|
10814
11603
|
ggml_tensor * inpSA = inpL;
|
|
10815
11604
|
|
|
@@ -10901,11 +11690,8 @@ struct llm_build_t5_dec : public llm_graph_context {
|
|
|
10901
11690
|
//cb(cur, "kqv_out", il);
|
|
10902
11691
|
}
|
|
10903
11692
|
|
|
10904
|
-
if (il == n_layer - 1) {
|
|
10905
|
-
// skip computing output for unused tokens
|
|
10906
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11693
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
10907
11694
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
10908
|
-
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
10909
11695
|
inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
|
|
10910
11696
|
}
|
|
10911
11697
|
|
|
@@ -10975,6 +11761,8 @@ struct llm_build_jais : public llm_graph_context {
|
|
|
10975
11761
|
|
|
10976
11762
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
10977
11763
|
|
|
11764
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11765
|
+
|
|
10978
11766
|
for (int il = 0; il < n_layer; ++il) {
|
|
10979
11767
|
cur = build_norm(inpL,
|
|
10980
11768
|
model.layers[il].attn_norm,
|
|
@@ -11007,9 +11795,7 @@ struct llm_build_jais : public llm_graph_context {
|
|
|
11007
11795
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
|
|
11008
11796
|
}
|
|
11009
11797
|
|
|
11010
|
-
if (il == n_layer - 1) {
|
|
11011
|
-
// skip computing output for unused tokens
|
|
11012
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11798
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
11013
11799
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
11014
11800
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
11015
11801
|
}
|
|
@@ -11073,6 +11859,8 @@ struct llm_build_chatglm : public llm_graph_context {
|
|
|
11073
11859
|
|
|
11074
11860
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
11075
11861
|
|
|
11862
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11863
|
+
|
|
11076
11864
|
for (int il = 0; il < n_layer; ++il) {
|
|
11077
11865
|
ggml_tensor * inpSA = inpL;
|
|
11078
11866
|
|
|
@@ -11139,9 +11927,7 @@ struct llm_build_chatglm : public llm_graph_context {
|
|
|
11139
11927
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11140
11928
|
}
|
|
11141
11929
|
|
|
11142
|
-
if (il == n_layer - 1) {
|
|
11143
|
-
// skip computing output for unused tokens
|
|
11144
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11930
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
11145
11931
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
11146
11932
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
11147
11933
|
}
|
|
@@ -11206,6 +11992,8 @@ struct llm_build_glm4 : public llm_graph_context {
|
|
|
11206
11992
|
|
|
11207
11993
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
11208
11994
|
|
|
11995
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11996
|
+
|
|
11209
11997
|
for (int il = 0; il < n_layer; ++il) {
|
|
11210
11998
|
ggml_tensor * inpSA = inpL;
|
|
11211
11999
|
|
|
@@ -11272,9 +12060,7 @@ struct llm_build_glm4 : public llm_graph_context {
|
|
|
11272
12060
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11273
12061
|
}
|
|
11274
12062
|
|
|
11275
|
-
if (il == n_layer - 1) {
|
|
11276
|
-
// skip computing output for unused tokens
|
|
11277
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
12063
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
11278
12064
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
11279
12065
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
11280
12066
|
}
|
|
@@ -11357,6 +12143,8 @@ struct llm_build_nemotron : public llm_graph_context {
|
|
|
11357
12143
|
|
|
11358
12144
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
11359
12145
|
|
|
12146
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
12147
|
+
|
|
11360
12148
|
for (int il = 0; il < n_layer; ++il) {
|
|
11361
12149
|
ggml_tensor * inpSA = inpL;
|
|
11362
12150
|
|
|
@@ -11416,9 +12204,7 @@ struct llm_build_nemotron : public llm_graph_context {
|
|
|
11416
12204
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11417
12205
|
}
|
|
11418
12206
|
|
|
11419
|
-
if (il == n_layer - 1) {
|
|
11420
|
-
// skip computing output for unused tokens
|
|
11421
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
12207
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
11422
12208
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
11423
12209
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
11424
12210
|
}
|
|
@@ -11486,6 +12272,8 @@ struct llm_build_exaone : public llm_graph_context {
|
|
|
11486
12272
|
|
|
11487
12273
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
11488
12274
|
|
|
12275
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
12276
|
+
|
|
11489
12277
|
for (int il = 0; il < n_layer; ++il) {
|
|
11490
12278
|
ggml_tensor * inpSA = inpL;
|
|
11491
12279
|
|
|
@@ -11547,9 +12335,7 @@ struct llm_build_exaone : public llm_graph_context {
|
|
|
11547
12335
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11548
12336
|
}
|
|
11549
12337
|
|
|
11550
|
-
if (il == n_layer - 1) {
|
|
11551
|
-
// skip computing output for unused tokens
|
|
11552
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
12338
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
11553
12339
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
11554
12340
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
11555
12341
|
}
|
|
@@ -11636,14 +12422,13 @@ struct llm_build_rwkv6_base : public llm_graph_context {
|
|
|
11636
12422
|
}
|
|
11637
12423
|
|
|
11638
12424
|
ggml_tensor * build_rwkv6_time_mix(
|
|
12425
|
+
llm_graph_input_rs * inp,
|
|
11639
12426
|
ggml_cgraph * gf,
|
|
11640
12427
|
ggml_tensor * cur,
|
|
11641
12428
|
ggml_tensor * x_prev,
|
|
11642
|
-
ggml_tensor * state_copy,
|
|
11643
|
-
ggml_tensor * state_mask,
|
|
11644
12429
|
const llama_ubatch & ubatch,
|
|
11645
12430
|
int il) const {
|
|
11646
|
-
const auto *
|
|
12431
|
+
const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
|
|
11647
12432
|
|
|
11648
12433
|
const auto n_tokens = ubatch.n_tokens;
|
|
11649
12434
|
const auto n_seqs = ubatch.n_seqs;
|
|
@@ -11653,7 +12438,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
|
|
|
11653
12438
|
const auto n_head = n_embd / head_size;
|
|
11654
12439
|
const auto n_head_kv = hparams.n_head_kv(il);
|
|
11655
12440
|
|
|
11656
|
-
const auto kv_head =
|
|
12441
|
+
const auto kv_head = mctx_cur->get_head();
|
|
11657
12442
|
|
|
11658
12443
|
const auto & layer = model.layers[il];
|
|
11659
12444
|
|
|
@@ -11764,9 +12549,9 @@ struct llm_build_rwkv6_base : public llm_graph_context {
|
|
|
11764
12549
|
k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w));
|
|
11765
12550
|
}
|
|
11766
12551
|
|
|
11767
|
-
ggml_tensor * wkv_state =
|
|
11768
|
-
gf,
|
|
11769
|
-
hparams.
|
|
12552
|
+
ggml_tensor * wkv_state = build_rs(
|
|
12553
|
+
inp, gf, mctx_cur->get_s_l(il),
|
|
12554
|
+
hparams.n_embd_s(), n_seqs);
|
|
11770
12555
|
|
|
11771
12556
|
ggml_tensor * wkv_output;
|
|
11772
12557
|
if (is_qrwkv) {
|
|
@@ -11784,9 +12569,9 @@ struct llm_build_rwkv6_base : public llm_graph_context {
|
|
|
11784
12569
|
wkv_state,
|
|
11785
12570
|
ggml_view_1d(
|
|
11786
12571
|
ctx0,
|
|
11787
|
-
|
|
11788
|
-
hparams.
|
|
11789
|
-
hparams.
|
|
12572
|
+
mctx_cur->get_s_l(il),
|
|
12573
|
+
hparams.n_embd_s() * n_seqs,
|
|
12574
|
+
hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il))
|
|
11790
12575
|
)
|
|
11791
12576
|
)
|
|
11792
12577
|
);
|
|
@@ -11820,20 +12605,19 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
|
|
|
11820
12605
|
inpL = build_inp_embd(model.tok_embd);
|
|
11821
12606
|
inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
|
|
11822
12607
|
|
|
11823
|
-
|
|
11824
|
-
ggml_tensor * state_mask = build_inp_s_mask();
|
|
12608
|
+
auto * rs_inp = build_rs_inp();
|
|
11825
12609
|
|
|
11826
12610
|
const auto n_embd = hparams.n_embd;
|
|
11827
12611
|
const auto n_seq_tokens = ubatch.n_seq_tokens;
|
|
11828
12612
|
const auto n_seqs = ubatch.n_seqs;
|
|
11829
12613
|
|
|
12614
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
12615
|
+
|
|
11830
12616
|
for (int il = 0; il < n_layer; ++il) {
|
|
11831
12617
|
const llama_layer * layer = &model.layers[il];
|
|
11832
12618
|
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
|
|
11833
12619
|
|
|
11834
|
-
ggml_tensor * token_shift = build_rwkv_token_shift_load(
|
|
11835
|
-
gf, state_copy, state_mask, ubatch, il
|
|
11836
|
-
);
|
|
12620
|
+
ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
|
|
11837
12621
|
|
|
11838
12622
|
ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
|
|
11839
12623
|
ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
|
|
@@ -11848,7 +12632,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
|
|
|
11848
12632
|
1
|
|
11849
12633
|
);
|
|
11850
12634
|
|
|
11851
|
-
cur = build_rwkv6_time_mix(gf, att_norm, x_prev,
|
|
12635
|
+
cur = build_rwkv6_time_mix(rs_inp, gf, att_norm, x_prev, ubatch, il);
|
|
11852
12636
|
|
|
11853
12637
|
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
|
11854
12638
|
cb(ffn_inp, "ffn_inp", il);
|
|
@@ -11870,13 +12654,16 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
|
|
|
11870
12654
|
);
|
|
11871
12655
|
ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
|
|
11872
12656
|
|
|
11873
|
-
|
|
11874
|
-
|
|
11875
|
-
|
|
11876
|
-
|
|
11877
|
-
|
|
11878
|
-
|
|
11879
|
-
|
|
12657
|
+
ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
|
|
12658
|
+
ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
|
|
12659
|
+
x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
|
|
12660
|
+
cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
|
|
12661
|
+
|
|
12662
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
12663
|
+
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
|
|
12664
|
+
ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
|
|
12665
|
+
x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
|
|
12666
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
11880
12667
|
}
|
|
11881
12668
|
|
|
11882
12669
|
cur = build_rwkv6_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6);
|
|
@@ -11911,27 +12698,26 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
|
|
|
11911
12698
|
// ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py
|
|
11912
12699
|
struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
|
|
11913
12700
|
llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv6_base(model, params) {
|
|
11914
|
-
GGML_ASSERT(n_embd == hparams.
|
|
12701
|
+
GGML_ASSERT(n_embd == hparams.n_embd_r());
|
|
11915
12702
|
|
|
11916
12703
|
ggml_tensor * cur;
|
|
11917
12704
|
ggml_tensor * inpL;
|
|
11918
12705
|
|
|
11919
12706
|
inpL = build_inp_embd(model.tok_embd);
|
|
11920
12707
|
|
|
11921
|
-
|
|
11922
|
-
ggml_tensor * state_mask = build_inp_s_mask();
|
|
12708
|
+
auto * rs_inp = build_rs_inp();
|
|
11923
12709
|
|
|
11924
12710
|
const auto n_embd = hparams.n_embd;
|
|
11925
12711
|
const auto n_seq_tokens = ubatch.n_seq_tokens;
|
|
11926
12712
|
const auto n_seqs = ubatch.n_seqs;
|
|
11927
12713
|
|
|
12714
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
12715
|
+
|
|
11928
12716
|
for (int il = 0; il < n_layer; ++il) {
|
|
11929
12717
|
const llama_layer * layer = &model.layers[il];
|
|
11930
12718
|
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
|
|
11931
12719
|
|
|
11932
|
-
ggml_tensor * token_shift = build_rwkv_token_shift_load(
|
|
11933
|
-
gf, state_copy, state_mask, ubatch, il
|
|
11934
|
-
);
|
|
12720
|
+
ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
|
|
11935
12721
|
|
|
11936
12722
|
ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
|
|
11937
12723
|
cb(att_norm, "attn_norm", il);
|
|
@@ -11943,7 +12729,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
|
|
|
11943
12729
|
1
|
|
11944
12730
|
);
|
|
11945
12731
|
|
|
11946
|
-
cur = build_rwkv6_time_mix(gf, att_norm, x_prev,
|
|
12732
|
+
cur = build_rwkv6_time_mix(rs_inp, gf, att_norm, x_prev, ubatch, il);
|
|
11947
12733
|
|
|
11948
12734
|
token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
|
|
11949
12735
|
ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
|
|
@@ -11951,11 +12737,12 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
|
|
|
11951
12737
|
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
|
11952
12738
|
cb(ffn_inp, "ffn_inp", il);
|
|
11953
12739
|
|
|
11954
|
-
|
|
11955
|
-
|
|
11956
|
-
|
|
11957
|
-
|
|
11958
|
-
|
|
12740
|
+
cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
|
|
12741
|
+
ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
|
|
12742
|
+
|
|
12743
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
12744
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
12745
|
+
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
|
|
11959
12746
|
}
|
|
11960
12747
|
|
|
11961
12748
|
// feed-forward network
|
|
@@ -12031,15 +12818,14 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
|
|
12031
12818
|
}
|
|
12032
12819
|
|
|
12033
12820
|
ggml_tensor * build_rwkv7_time_mix(
|
|
12821
|
+
llm_graph_input_rs * inp,
|
|
12034
12822
|
ggml_cgraph * gf,
|
|
12035
12823
|
ggml_tensor * cur,
|
|
12036
12824
|
ggml_tensor * x_prev,
|
|
12037
|
-
ggml_tensor * state_copy,
|
|
12038
|
-
ggml_tensor * state_mask,
|
|
12039
12825
|
ggml_tensor *& first_layer_value,
|
|
12040
12826
|
const llama_ubatch & ubatch,
|
|
12041
12827
|
int il) const {
|
|
12042
|
-
const auto *
|
|
12828
|
+
const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
|
|
12043
12829
|
|
|
12044
12830
|
const auto n_tokens = ubatch.n_tokens;
|
|
12045
12831
|
const auto n_seqs = ubatch.n_seqs;
|
|
@@ -12048,7 +12834,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
|
|
12048
12834
|
const auto head_count = n_embd / head_size;
|
|
12049
12835
|
const auto n_seq_tokens = ubatch.n_seq_tokens;
|
|
12050
12836
|
|
|
12051
|
-
const auto kv_head =
|
|
12837
|
+
const auto kv_head = mctx_cur->get_head();
|
|
12052
12838
|
|
|
12053
12839
|
const auto & layer = model.layers[il];
|
|
12054
12840
|
|
|
@@ -12118,9 +12904,9 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
|
|
12118
12904
|
v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens);
|
|
12119
12905
|
a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
|
|
12120
12906
|
|
|
12121
|
-
ggml_tensor * wkv_state =
|
|
12122
|
-
gf,
|
|
12123
|
-
hparams.
|
|
12907
|
+
ggml_tensor * wkv_state = build_rs(
|
|
12908
|
+
inp, gf, mctx_cur->get_s_l(il),
|
|
12909
|
+
hparams.n_embd_s(), n_seqs);
|
|
12124
12910
|
|
|
12125
12911
|
ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
|
|
12126
12912
|
cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0);
|
|
@@ -12133,9 +12919,9 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
|
|
12133
12919
|
wkv_state,
|
|
12134
12920
|
ggml_view_1d(
|
|
12135
12921
|
ctx0,
|
|
12136
|
-
|
|
12137
|
-
hparams.
|
|
12138
|
-
hparams.
|
|
12922
|
+
mctx_cur->get_s_l(il),
|
|
12923
|
+
hparams.n_embd_s() * n_seqs,
|
|
12924
|
+
hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il))
|
|
12139
12925
|
)
|
|
12140
12926
|
)
|
|
12141
12927
|
);
|
|
@@ -12176,20 +12962,19 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
|
|
|
12176
12962
|
inpL = build_inp_embd(model.tok_embd);
|
|
12177
12963
|
inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
|
|
12178
12964
|
|
|
12179
|
-
|
|
12180
|
-
ggml_tensor * state_mask = build_inp_s_mask();
|
|
12965
|
+
auto * rs_inp = build_rs_inp();
|
|
12181
12966
|
|
|
12182
12967
|
const auto n_embd = hparams.n_embd;
|
|
12183
12968
|
const auto n_seq_tokens = ubatch.n_seq_tokens;
|
|
12184
12969
|
const auto n_seqs = ubatch.n_seqs;
|
|
12185
12970
|
|
|
12971
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
12972
|
+
|
|
12186
12973
|
for (int il = 0; il < n_layer; ++il) {
|
|
12187
12974
|
const llama_layer * layer = &model.layers[il];
|
|
12188
12975
|
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
|
|
12189
12976
|
|
|
12190
|
-
ggml_tensor * token_shift = build_rwkv_token_shift_load(
|
|
12191
|
-
gf, state_copy, state_mask, ubatch, il
|
|
12192
|
-
);
|
|
12977
|
+
ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
|
|
12193
12978
|
|
|
12194
12979
|
ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
|
|
12195
12980
|
ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
|
|
@@ -12204,7 +12989,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
|
|
|
12204
12989
|
1
|
|
12205
12990
|
);
|
|
12206
12991
|
|
|
12207
|
-
cur = build_rwkv7_time_mix(gf, att_norm, x_prev,
|
|
12992
|
+
cur = build_rwkv7_time_mix(rs_inp, gf, att_norm, x_prev, v_first, ubatch, il);
|
|
12208
12993
|
|
|
12209
12994
|
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
|
12210
12995
|
cb(ffn_inp, "ffn_inp", il);
|
|
@@ -12226,12 +13011,14 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
|
|
|
12226
13011
|
);
|
|
12227
13012
|
ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
|
|
12228
13013
|
|
|
12229
|
-
|
|
12230
|
-
|
|
12231
|
-
|
|
12232
|
-
|
|
12233
|
-
|
|
12234
|
-
|
|
13014
|
+
ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
|
|
13015
|
+
ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
|
|
13016
|
+
x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
|
|
13017
|
+
|
|
13018
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
13019
|
+
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
|
|
13020
|
+
ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
|
|
13021
|
+
x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
|
|
12235
13022
|
}
|
|
12236
13023
|
|
|
12237
13024
|
cur = build_rwkv7_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV7);
|
|
@@ -12262,7 +13049,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
|
|
|
12262
13049
|
|
|
12263
13050
|
struct llm_build_arwkv7 : public llm_build_rwkv7_base {
|
|
12264
13051
|
llm_build_arwkv7(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv7_base(model, params) {
|
|
12265
|
-
GGML_ASSERT(n_embd == hparams.
|
|
13052
|
+
GGML_ASSERT(n_embd == hparams.n_embd_r());
|
|
12266
13053
|
|
|
12267
13054
|
ggml_tensor * cur;
|
|
12268
13055
|
ggml_tensor * inpL;
|
|
@@ -12270,20 +13057,19 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
|
|
|
12270
13057
|
|
|
12271
13058
|
inpL = build_inp_embd(model.tok_embd);
|
|
12272
13059
|
|
|
12273
|
-
|
|
12274
|
-
ggml_tensor * state_mask = build_inp_s_mask();
|
|
13060
|
+
auto * rs_inp = build_rs_inp();
|
|
12275
13061
|
|
|
12276
13062
|
const auto n_embd = hparams.n_embd;
|
|
12277
13063
|
const auto n_seq_tokens = ubatch.n_seq_tokens;
|
|
12278
13064
|
const auto n_seqs = ubatch.n_seqs;
|
|
12279
13065
|
|
|
13066
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13067
|
+
|
|
12280
13068
|
for (int il = 0; il < n_layer; ++il) {
|
|
12281
13069
|
const llama_layer * layer = &model.layers[il];
|
|
12282
13070
|
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
|
|
12283
13071
|
|
|
12284
|
-
ggml_tensor * token_shift = build_rwkv_token_shift_load(
|
|
12285
|
-
gf, state_copy, state_mask, ubatch, il
|
|
12286
|
-
);
|
|
13072
|
+
ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
|
|
12287
13073
|
|
|
12288
13074
|
ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
|
|
12289
13075
|
cb(att_norm, "attn_norm", il);
|
|
@@ -12295,7 +13081,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
|
|
|
12295
13081
|
1
|
|
12296
13082
|
);
|
|
12297
13083
|
|
|
12298
|
-
cur = build_rwkv7_time_mix(gf, att_norm, x_prev,
|
|
13084
|
+
cur = build_rwkv7_time_mix(rs_inp, gf, att_norm, x_prev, v_first, ubatch, il);
|
|
12299
13085
|
|
|
12300
13086
|
token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
|
|
12301
13087
|
ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
|
|
@@ -12303,11 +13089,12 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
|
|
|
12303
13089
|
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
|
12304
13090
|
cb(ffn_inp, "ffn_inp", il);
|
|
12305
13091
|
|
|
12306
|
-
|
|
12307
|
-
|
|
12308
|
-
|
|
12309
|
-
|
|
12310
|
-
|
|
13092
|
+
cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
|
|
13093
|
+
ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
|
|
13094
|
+
|
|
13095
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
13096
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
13097
|
+
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
|
|
12311
13098
|
}
|
|
12312
13099
|
|
|
12313
13100
|
// feed-forward network
|
|
@@ -12376,6 +13163,9 @@ struct llm_build_granite : public llm_graph_context {
|
|
|
12376
13163
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
12377
13164
|
|
|
12378
13165
|
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
13166
|
+
|
|
13167
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13168
|
+
|
|
12379
13169
|
for (int il = 0; il < n_layer; ++il) {
|
|
12380
13170
|
ggml_tensor * inpSA = inpL;
|
|
12381
13171
|
|
|
@@ -12438,9 +13228,7 @@ struct llm_build_granite : public llm_graph_context {
|
|
|
12438
13228
|
cb(cur, "attn_out", il);
|
|
12439
13229
|
}
|
|
12440
13230
|
|
|
12441
|
-
if (il == n_layer - 1) {
|
|
12442
|
-
// skip computing output for unused tokens
|
|
12443
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13231
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
12444
13232
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
12445
13233
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
12446
13234
|
}
|
|
@@ -12559,6 +13347,8 @@ struct llm_build_chameleon : public llm_graph_context {
|
|
|
12559
13347
|
|
|
12560
13348
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
12561
13349
|
|
|
13350
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13351
|
+
|
|
12562
13352
|
for (int il = 0; il < n_layer; ++il) {
|
|
12563
13353
|
ggml_tensor * inpSA = inpL;
|
|
12564
13354
|
|
|
@@ -12635,21 +13425,19 @@ struct llm_build_chameleon : public llm_graph_context {
|
|
|
12635
13425
|
cur = build_attn(inp_attn, gf,
|
|
12636
13426
|
model.layers[il].wo, nullptr,
|
|
12637
13427
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
12638
|
-
|
|
12639
|
-
if (hparams.swin_norm) {
|
|
12640
|
-
cur = build_norm(cur,
|
|
12641
|
-
model.layers[il].attn_norm, NULL,
|
|
12642
|
-
LLM_NORM_RMS, il);
|
|
12643
|
-
}
|
|
12644
13428
|
}
|
|
12645
13429
|
|
|
12646
|
-
if (il == n_layer - 1) {
|
|
12647
|
-
// skip computing output for unused tokens
|
|
12648
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13430
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
12649
13431
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
12650
13432
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
12651
13433
|
}
|
|
12652
13434
|
|
|
13435
|
+
if (hparams.swin_norm) {
|
|
13436
|
+
cur = build_norm(cur,
|
|
13437
|
+
model.layers[il].attn_norm, NULL,
|
|
13438
|
+
LLM_NORM_RMS, il);
|
|
13439
|
+
}
|
|
13440
|
+
|
|
12653
13441
|
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
12654
13442
|
cb(ffn_inp, "ffn_inp", il);
|
|
12655
13443
|
|
|
@@ -12890,6 +13678,8 @@ struct llm_build_plm : public llm_graph_context {
|
|
|
12890
13678
|
|
|
12891
13679
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
12892
13680
|
|
|
13681
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13682
|
+
|
|
12893
13683
|
for (int il = 0; il < n_layer; ++il) {
|
|
12894
13684
|
ggml_tensor * inpSA = inpL;
|
|
12895
13685
|
|
|
@@ -12993,9 +13783,7 @@ struct llm_build_plm : public llm_graph_context {
|
|
|
12993
13783
|
q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
|
|
12994
13784
|
}
|
|
12995
13785
|
|
|
12996
|
-
if (il == n_layer - 1) {
|
|
12997
|
-
// skip computing output for unused tokens
|
|
12998
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13786
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
12999
13787
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
13000
13788
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
13001
13789
|
}
|
|
@@ -13055,6 +13843,8 @@ struct llm_build_bailingmoe : public llm_graph_context {
|
|
|
13055
13843
|
|
|
13056
13844
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
13057
13845
|
|
|
13846
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13847
|
+
|
|
13058
13848
|
for (int il = 0; il < n_layer; ++il) {
|
|
13059
13849
|
ggml_tensor * inpSA = inpL;
|
|
13060
13850
|
|
|
@@ -13116,9 +13906,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
|
|
|
13116
13906
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
|
|
13117
13907
|
}
|
|
13118
13908
|
|
|
13119
|
-
if (il == n_layer - 1) {
|
|
13120
|
-
// skip computing output for unused tokens
|
|
13121
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13909
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
13122
13910
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
13123
13911
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
13124
13912
|
}
|
|
@@ -13187,69 +13975,375 @@ struct llm_build_bailingmoe : public llm_graph_context {
|
|
|
13187
13975
|
}
|
|
13188
13976
|
};
|
|
13189
13977
|
|
|
13190
|
-
|
|
13191
|
-
|
|
13978
|
+
struct llm_build_dots1 : public llm_graph_context {
|
|
13979
|
+
llm_build_dots1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
13980
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
13192
13981
|
|
|
13193
|
-
|
|
13194
|
-
|
|
13195
|
-
|
|
13196
|
-
|
|
13197
|
-
|
|
13198
|
-
|
|
13199
|
-
|
|
13200
|
-
|
|
13201
|
-
|
|
13202
|
-
|
|
13203
|
-
|
|
13204
|
-
|
|
13205
|
-
|
|
13206
|
-
|
|
13982
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
13983
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
13984
|
+
|
|
13985
|
+
ggml_tensor * cur;
|
|
13986
|
+
ggml_tensor * inpL;
|
|
13987
|
+
|
|
13988
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
13989
|
+
|
|
13990
|
+
// inp_pos - contains the positions
|
|
13991
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
13992
|
+
|
|
13993
|
+
auto * inp_attn = build_attn_inp_kv_unified();
|
|
13994
|
+
|
|
13995
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13996
|
+
|
|
13997
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
13998
|
+
ggml_tensor * inpSA = inpL;
|
|
13999
|
+
|
|
14000
|
+
// norm
|
|
14001
|
+
cur = build_norm(inpL,
|
|
14002
|
+
model.layers[il].attn_norm, NULL,
|
|
14003
|
+
LLM_NORM_RMS, il);
|
|
14004
|
+
cb(cur, "attn_norm", il);
|
|
14005
|
+
|
|
14006
|
+
// self_attention
|
|
13207
14007
|
{
|
|
13208
|
-
|
|
13209
|
-
|
|
13210
|
-
|
|
13211
|
-
|
|
13212
|
-
|
|
13213
|
-
|
|
13214
|
-
|
|
13215
|
-
|
|
13216
|
-
|
|
14008
|
+
// compute Q and K and RoPE them
|
|
14009
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
14010
|
+
cb(Qcur, "Qcur", il);
|
|
14011
|
+
|
|
14012
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
14013
|
+
cb(Kcur, "Kcur", il);
|
|
14014
|
+
|
|
14015
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
14016
|
+
cb(Vcur, "Vcur", il);
|
|
14017
|
+
|
|
14018
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
14019
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
14020
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
14021
|
+
|
|
14022
|
+
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
|
14023
|
+
cb(Qcur, "Qcur_normed", il);
|
|
14024
|
+
|
|
14025
|
+
Qcur = ggml_rope_ext(
|
|
14026
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
14027
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
14028
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
14029
|
+
);
|
|
14030
|
+
|
|
14031
|
+
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
|
14032
|
+
cb(Kcur, "Kcur_normed", il);
|
|
14033
|
+
|
|
14034
|
+
Kcur = ggml_rope_ext(
|
|
14035
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
14036
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
14037
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
14038
|
+
);
|
|
14039
|
+
|
|
14040
|
+
cb(Qcur, "Qcur", il);
|
|
14041
|
+
cb(Kcur, "Kcur", il);
|
|
14042
|
+
cb(Vcur, "Vcur", il);
|
|
14043
|
+
|
|
14044
|
+
cur = build_attn(inp_attn, gf,
|
|
14045
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
14046
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
14047
|
+
}
|
|
14048
|
+
|
|
14049
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
14050
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
14051
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
14052
|
+
}
|
|
14053
|
+
|
|
14054
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
14055
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
14056
|
+
|
|
14057
|
+
// MoE branch
|
|
14058
|
+
cur = build_norm(ffn_inp,
|
|
14059
|
+
model.layers[il].ffn_norm, NULL,
|
|
14060
|
+
LLM_NORM_RMS, il);
|
|
14061
|
+
cb(cur, "ffn_norm", il);
|
|
14062
|
+
|
|
14063
|
+
if ((uint32_t) il < hparams.n_layer_dense_lead) {
|
|
14064
|
+
cur = build_ffn(cur,
|
|
14065
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
14066
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
|
14067
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
14068
|
+
NULL,
|
|
14069
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
14070
|
+
cb(cur, "ffn_out", il);
|
|
14071
|
+
} else {
|
|
14072
|
+
ggml_tensor * moe_out =
|
|
14073
|
+
build_moe_ffn(cur,
|
|
14074
|
+
model.layers[il].ffn_gate_inp,
|
|
14075
|
+
model.layers[il].ffn_up_exps,
|
|
14076
|
+
model.layers[il].ffn_gate_exps,
|
|
14077
|
+
model.layers[il].ffn_down_exps,
|
|
14078
|
+
model.layers[il].ffn_exp_probs_b,
|
|
14079
|
+
n_expert, n_expert_used,
|
|
14080
|
+
LLM_FFN_SILU, hparams.expert_weights_norm,
|
|
14081
|
+
true, hparams.expert_weights_scale,
|
|
14082
|
+
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
|
14083
|
+
il);
|
|
14084
|
+
cb(moe_out, "ffn_moe_out", il);
|
|
14085
|
+
|
|
14086
|
+
{
|
|
14087
|
+
ggml_tensor * ffn_shexp = build_ffn(cur,
|
|
14088
|
+
model.layers[il].ffn_up_shexp, NULL, NULL,
|
|
14089
|
+
model.layers[il].ffn_gate_shexp, NULL, NULL,
|
|
14090
|
+
model.layers[il].ffn_down_shexp, NULL, NULL,
|
|
14091
|
+
NULL,
|
|
14092
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
14093
|
+
cb(ffn_shexp, "ffn_shexp", il);
|
|
14094
|
+
|
|
14095
|
+
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
|
14096
|
+
cb(cur, "ffn_out", il);
|
|
14097
|
+
}
|
|
14098
|
+
}
|
|
14099
|
+
|
|
14100
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
14101
|
+
|
|
14102
|
+
cur = build_cvec(cur, il);
|
|
14103
|
+
cb(cur, "l_out", il);
|
|
14104
|
+
|
|
14105
|
+
// input for next layer
|
|
14106
|
+
inpL = cur;
|
|
14107
|
+
}
|
|
14108
|
+
|
|
14109
|
+
cur = inpL;
|
|
14110
|
+
|
|
14111
|
+
cur = build_norm(cur,
|
|
14112
|
+
model.output_norm, NULL,
|
|
14113
|
+
LLM_NORM_RMS, -1);
|
|
14114
|
+
|
|
14115
|
+
cb(cur, "result_norm", -1);
|
|
14116
|
+
res->t_embd = cur;
|
|
14117
|
+
|
|
14118
|
+
// lm_head
|
|
14119
|
+
cur = build_lora_mm(model.output, cur);
|
|
14120
|
+
|
|
14121
|
+
cb(cur, "result_output", -1);
|
|
14122
|
+
res->t_logits = cur;
|
|
14123
|
+
|
|
14124
|
+
ggml_build_forward_expand(gf, cur);
|
|
14125
|
+
}
|
|
14126
|
+
};
|
|
14127
|
+
|
|
14128
|
+
struct llm_build_arcee : public llm_graph_context {
|
|
14129
|
+
llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
14130
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
14131
|
+
|
|
14132
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
14133
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
14134
|
+
|
|
14135
|
+
ggml_tensor * cur;
|
|
14136
|
+
ggml_tensor * inpL;
|
|
14137
|
+
|
|
14138
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
14139
|
+
|
|
14140
|
+
// inp_pos - contains the positions
|
|
14141
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
14142
|
+
|
|
14143
|
+
auto * inp_attn = build_attn_inp_kv_unified();
|
|
14144
|
+
|
|
14145
|
+
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
14146
|
+
|
|
14147
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
14148
|
+
|
|
14149
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
14150
|
+
ggml_tensor * inpSA = inpL;
|
|
14151
|
+
|
|
14152
|
+
// norm
|
|
14153
|
+
cur = build_norm(inpL,
|
|
14154
|
+
model.layers[il].attn_norm, NULL,
|
|
14155
|
+
LLM_NORM_RMS, il);
|
|
14156
|
+
cb(cur, "attn_norm", il);
|
|
14157
|
+
|
|
14158
|
+
// self-attention
|
|
13217
14159
|
{
|
|
13218
|
-
|
|
14160
|
+
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
|
14161
|
+
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
|
13219
14162
|
|
|
13220
|
-
|
|
14163
|
+
// compute Q and K and RoPE them
|
|
14164
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
14165
|
+
cb(Qcur, "Qcur", il);
|
|
14166
|
+
if (model.layers[il].bq) {
|
|
14167
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
14168
|
+
cb(Qcur, "Qcur", il);
|
|
14169
|
+
}
|
|
13221
14170
|
|
|
13222
|
-
|
|
14171
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
14172
|
+
cb(Kcur, "Kcur", il);
|
|
14173
|
+
if (model.layers[il].bk) {
|
|
14174
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
14175
|
+
cb(Kcur, "Kcur", il);
|
|
14176
|
+
}
|
|
13223
14177
|
|
|
13224
|
-
|
|
13225
|
-
|
|
14178
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
14179
|
+
cb(Vcur, "Vcur", il);
|
|
14180
|
+
if (model.layers[il].bv) {
|
|
14181
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
14182
|
+
cb(Vcur, "Vcur", il);
|
|
14183
|
+
}
|
|
13226
14184
|
|
|
13227
|
-
|
|
13228
|
-
|
|
13229
|
-
|
|
13230
|
-
|
|
13231
|
-
|
|
13232
|
-
|
|
13233
|
-
|
|
13234
|
-
|
|
13235
|
-
|
|
13236
|
-
|
|
13237
|
-
|
|
13238
|
-
|
|
13239
|
-
|
|
14185
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
14186
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
14187
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
14188
|
+
|
|
14189
|
+
Qcur = ggml_rope_ext(
|
|
14190
|
+
ctx0, Qcur, inp_pos, rope_factors,
|
|
14191
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
14192
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
14193
|
+
);
|
|
14194
|
+
|
|
14195
|
+
Kcur = ggml_rope_ext(
|
|
14196
|
+
ctx0, Kcur, inp_pos, rope_factors,
|
|
14197
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
14198
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
14199
|
+
);
|
|
14200
|
+
|
|
14201
|
+
cb(Qcur, "Qcur", il);
|
|
14202
|
+
cb(Kcur, "Kcur", il);
|
|
14203
|
+
cb(Vcur, "Vcur", il);
|
|
14204
|
+
|
|
14205
|
+
cur = build_attn(inp_attn, gf,
|
|
14206
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
14207
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
14208
|
+
cb(cur, "attn_out", il);
|
|
14209
|
+
}
|
|
14210
|
+
|
|
14211
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
14212
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
14213
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
14214
|
+
}
|
|
14215
|
+
|
|
14216
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
14217
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
14218
|
+
|
|
14219
|
+
// feed-forward network
|
|
14220
|
+
// ARCEE uses relu^2 instead of silu
|
|
14221
|
+
cur = build_norm(ffn_inp,
|
|
14222
|
+
model.layers[il].ffn_norm, NULL,
|
|
14223
|
+
LLM_NORM_RMS, il);
|
|
14224
|
+
cb(cur, "ffn_norm", il);
|
|
14225
|
+
|
|
14226
|
+
cur = build_ffn(cur,
|
|
14227
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
14228
|
+
NULL, NULL, NULL,
|
|
14229
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
14230
|
+
NULL,
|
|
14231
|
+
LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
|
|
14232
|
+
cb(cur, "ffn_out", il);
|
|
14233
|
+
|
|
14234
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
14235
|
+
cb(cur, "ffn_out", il);
|
|
14236
|
+
|
|
14237
|
+
cur = build_cvec(cur, il);
|
|
14238
|
+
cb(cur, "l_out", il);
|
|
14239
|
+
|
|
14240
|
+
// input for next layer
|
|
14241
|
+
inpL = cur;
|
|
14242
|
+
}
|
|
14243
|
+
|
|
14244
|
+
cur = inpL;
|
|
14245
|
+
|
|
14246
|
+
cur = build_norm(cur,
|
|
14247
|
+
model.output_norm, NULL,
|
|
14248
|
+
LLM_NORM_RMS, -1);
|
|
14249
|
+
|
|
14250
|
+
cb(cur, "result_norm", -1);
|
|
14251
|
+
res->t_embd = cur;
|
|
13240
14252
|
|
|
13241
|
-
|
|
14253
|
+
// lm_head
|
|
14254
|
+
cur = build_lora_mm(model.output, cur);
|
|
14255
|
+
|
|
14256
|
+
cb(cur, "result_output", -1);
|
|
14257
|
+
res->t_logits = cur;
|
|
14258
|
+
|
|
14259
|
+
ggml_build_forward_expand(gf, cur);
|
|
14260
|
+
}
|
|
14261
|
+
};
|
|
14262
|
+
|
|
14263
|
+
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
|
|
14264
|
+
llama_memory_i * res;
|
|
14265
|
+
|
|
14266
|
+
switch (arch) {
|
|
14267
|
+
// Models that need specific instantiation should be handled in the
|
|
14268
|
+
// switch statement
|
|
14269
|
+
case LLM_ARCH_BERT:
|
|
14270
|
+
case LLM_ARCH_JINA_BERT_V2:
|
|
14271
|
+
case LLM_ARCH_NOMIC_BERT:
|
|
14272
|
+
case LLM_ARCH_NOMIC_BERT_MOE:
|
|
14273
|
+
case LLM_ARCH_NEO_BERT:
|
|
14274
|
+
case LLM_ARCH_WAVTOKENIZER_DEC:
|
|
14275
|
+
{
|
|
14276
|
+
res = nullptr;
|
|
14277
|
+
} break;
|
|
14278
|
+
// Models that need standard caching should rely on recurrent/hybrid
|
|
14279
|
+
// checks
|
|
14280
|
+
default:
|
|
14281
|
+
{
|
|
14282
|
+
if (llm_arch_is_recurrent(arch)) {
|
|
14283
|
+
res = new llama_memory_recurrent(
|
|
13242
14284
|
*this,
|
|
13243
14285
|
nullptr,
|
|
13244
|
-
|
|
13245
|
-
|
|
13246
|
-
!cparams.flash_attn,
|
|
14286
|
+
GGML_TYPE_F32,
|
|
14287
|
+
GGML_TYPE_F32,
|
|
13247
14288
|
cparams.offload_kqv,
|
|
13248
|
-
cparams.
|
|
13249
|
-
cparams.n_seq_max
|
|
13250
|
-
|
|
13251
|
-
|
|
13252
|
-
|
|
14289
|
+
std::max((uint32_t) 1, cparams.n_seq_max),
|
|
14290
|
+
cparams.n_seq_max);
|
|
14291
|
+
} else if (llm_arch_is_hybrid(arch)) {
|
|
14292
|
+
const auto padding = llama_kv_cache_unified::get_padding(cparams);
|
|
14293
|
+
|
|
14294
|
+
cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
|
|
14295
|
+
|
|
14296
|
+
res = new llama_memory_hybrid(
|
|
14297
|
+
/* model */ *this,
|
|
14298
|
+
/* attn_type_k */ params.type_k,
|
|
14299
|
+
/* attn_type_v */ params.type_v,
|
|
14300
|
+
/* attn_v_trans */ !cparams.flash_attn,
|
|
14301
|
+
/* attn_kv_size */ cparams.n_ctx,
|
|
14302
|
+
/* attn_n_pad */ padding,
|
|
14303
|
+
/* attn_n_swa */ hparams.n_swa,
|
|
14304
|
+
/* attn_swa_type */ hparams.swa_type,
|
|
14305
|
+
/* recurrent_type_k */ GGML_TYPE_F32,
|
|
14306
|
+
/* recurrent_type_v */ GGML_TYPE_F32,
|
|
14307
|
+
/* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
|
|
14308
|
+
/* n_seq_max */ cparams.n_seq_max,
|
|
14309
|
+
/* offload */ cparams.offload_kqv);
|
|
14310
|
+
} else {
|
|
14311
|
+
const auto padding = llama_kv_cache_unified::get_padding(cparams);
|
|
14312
|
+
|
|
14313
|
+
cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
|
|
14314
|
+
|
|
14315
|
+
LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
|
|
14316
|
+
|
|
14317
|
+
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
|
|
14318
|
+
GGML_ASSERT(hparams.is_swa_any());
|
|
14319
|
+
|
|
14320
|
+
res = new llama_kv_cache_unified_iswa(
|
|
14321
|
+
*this,
|
|
14322
|
+
params.type_k,
|
|
14323
|
+
params.type_v,
|
|
14324
|
+
!cparams.flash_attn,
|
|
14325
|
+
cparams.offload_kqv,
|
|
14326
|
+
params.swa_full,
|
|
14327
|
+
cparams.n_ctx,
|
|
14328
|
+
cparams.n_seq_max,
|
|
14329
|
+
cparams.n_ubatch,
|
|
14330
|
+
padding);
|
|
14331
|
+
} else {
|
|
14332
|
+
GGML_ASSERT(!hparams.is_swa_any());
|
|
14333
|
+
|
|
14334
|
+
res = new llama_kv_cache_unified(
|
|
14335
|
+
*this,
|
|
14336
|
+
nullptr,
|
|
14337
|
+
params.type_k,
|
|
14338
|
+
params.type_v,
|
|
14339
|
+
!cparams.flash_attn,
|
|
14340
|
+
cparams.offload_kqv,
|
|
14341
|
+
cparams.n_ctx,
|
|
14342
|
+
cparams.n_seq_max,
|
|
14343
|
+
padding,
|
|
14344
|
+
hparams.n_swa,
|
|
14345
|
+
hparams.swa_type);
|
|
14346
|
+
}
|
|
13253
14347
|
}
|
|
13254
14348
|
}
|
|
13255
14349
|
}
|
|
@@ -13303,6 +14397,10 @@ llm_graph_result_ptr llama_model::build_graph(
|
|
|
13303
14397
|
{
|
|
13304
14398
|
llm = std::make_unique<llm_build_bert>(*this, params, gf);
|
|
13305
14399
|
} break;
|
|
14400
|
+
case LLM_ARCH_NEO_BERT:
|
|
14401
|
+
{
|
|
14402
|
+
llm = std::make_unique<llm_build_neo_bert>(*this, params, gf);
|
|
14403
|
+
} break;
|
|
13306
14404
|
case LLM_ARCH_BLOOM:
|
|
13307
14405
|
{
|
|
13308
14406
|
llm = std::make_unique<llm_build_bloom>(*this, params, gf);
|
|
@@ -13388,6 +14486,10 @@ llm_graph_result_ptr llama_model::build_graph(
|
|
|
13388
14486
|
{
|
|
13389
14487
|
llm = std::make_unique<llm_build_gemma3_iswa>(*this, params, gf);
|
|
13390
14488
|
} break;
|
|
14489
|
+
case LLM_ARCH_GEMMA3N:
|
|
14490
|
+
{
|
|
14491
|
+
llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params, gf);
|
|
14492
|
+
} break;
|
|
13391
14493
|
case LLM_ARCH_STARCODER2:
|
|
13392
14494
|
{
|
|
13393
14495
|
llm = std::make_unique<llm_build_starcoder2>(*this, params, gf);
|
|
@@ -13525,6 +14627,14 @@ llm_graph_result_ptr llama_model::build_graph(
|
|
|
13525
14627
|
{
|
|
13526
14628
|
llm = std::make_unique<llm_build_bailingmoe>(*this, params, gf);
|
|
13527
14629
|
} break;
|
|
14630
|
+
case LLM_ARCH_DOTS1:
|
|
14631
|
+
{
|
|
14632
|
+
llm = std::make_unique<llm_build_dots1>(*this, params, gf);
|
|
14633
|
+
} break;
|
|
14634
|
+
case LLM_ARCH_ARCEE:
|
|
14635
|
+
{
|
|
14636
|
+
llm = std::make_unique<llm_build_arcee>(*this, params, gf);
|
|
14637
|
+
} break;
|
|
13528
14638
|
default:
|
|
13529
14639
|
GGML_ABORT("fatal error");
|
|
13530
14640
|
}
|
|
@@ -13600,6 +14710,18 @@ int32_t llama_model_n_swa(const llama_model * model) {
|
|
|
13600
14710
|
return model->hparams.n_swa;
|
|
13601
14711
|
}
|
|
13602
14712
|
|
|
14713
|
+
uint32_t llama_model_n_cls_out(const struct llama_model * model) {
|
|
14714
|
+
return model->hparams.n_cls_out;
|
|
14715
|
+
}
|
|
14716
|
+
|
|
14717
|
+
const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) {
|
|
14718
|
+
if (i < model->classifier_labels.size()) {
|
|
14719
|
+
return model->classifier_labels[i].c_str();
|
|
14720
|
+
}
|
|
14721
|
+
|
|
14722
|
+
return nullptr;
|
|
14723
|
+
}
|
|
14724
|
+
|
|
13603
14725
|
// deprecated
|
|
13604
14726
|
int32_t llama_n_ctx_train(const llama_model * model) {
|
|
13605
14727
|
return llama_model_n_ctx_train(model);
|
|
@@ -13662,6 +14784,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
|
13662
14784
|
case LLM_ARCH_GRANITE_MOE:
|
|
13663
14785
|
case LLM_ARCH_CHAMELEON:
|
|
13664
14786
|
case LLM_ARCH_BAILINGMOE:
|
|
14787
|
+
case LLM_ARCH_NEO_BERT:
|
|
14788
|
+
case LLM_ARCH_ARCEE:
|
|
13665
14789
|
return LLAMA_ROPE_TYPE_NORM;
|
|
13666
14790
|
|
|
13667
14791
|
// the pairs of head values are offset by n_rot/2
|
|
@@ -13687,6 +14811,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
|
13687
14811
|
case LLM_ARCH_GEMMA:
|
|
13688
14812
|
case LLM_ARCH_GEMMA2:
|
|
13689
14813
|
case LLM_ARCH_GEMMA3:
|
|
14814
|
+
case LLM_ARCH_GEMMA3N:
|
|
13690
14815
|
case LLM_ARCH_STARCODER2:
|
|
13691
14816
|
case LLM_ARCH_OPENELM:
|
|
13692
14817
|
case LLM_ARCH_GPTNEOX:
|
|
@@ -13695,6 +14820,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
|
13695
14820
|
case LLM_ARCH_NEMOTRON:
|
|
13696
14821
|
case LLM_ARCH_EXAONE:
|
|
13697
14822
|
case LLM_ARCH_MINICPM3:
|
|
14823
|
+
case LLM_ARCH_DOTS1:
|
|
13698
14824
|
return LLAMA_ROPE_TYPE_NEOX;
|
|
13699
14825
|
|
|
13700
14826
|
case LLM_ARCH_QWEN2VL:
|
|
@@ -13760,7 +14886,7 @@ uint64_t llama_model_size(const llama_model * model) {
|
|
|
13760
14886
|
}
|
|
13761
14887
|
|
|
13762
14888
|
const char * llama_model_chat_template(const llama_model * model, const char * name) {
|
|
13763
|
-
const auto key = name ? LLM_KV(model->arch, name)(
|
|
14889
|
+
const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
|
|
13764
14890
|
: LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
|
|
13765
14891
|
const auto & it = model->gguf_kv.find(key);
|
|
13766
14892
|
if (it == model->gguf_kv.end()) {
|
|
@@ -13768,7 +14894,7 @@ const char * llama_model_chat_template(const llama_model * model, const char * n
|
|
|
13768
14894
|
// do not extend this list unless absolutely necessary
|
|
13769
14895
|
// Mistral-Small-2503 does not have built-in chat template
|
|
13770
14896
|
llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
|
|
13771
|
-
if (pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
|
|
14897
|
+
if (!name && pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
|
|
13772
14898
|
return "mistral-v7-tekken";
|
|
13773
14899
|
}
|
|
13774
14900
|
|
|
@@ -13802,14 +14928,7 @@ llama_token llama_model_decoder_start_token(const llama_model * model) {
|
|
|
13802
14928
|
}
|
|
13803
14929
|
|
|
13804
14930
|
bool llama_model_is_recurrent(const llama_model * model) {
|
|
13805
|
-
|
|
13806
|
-
case LLM_ARCH_MAMBA: return true;
|
|
13807
|
-
case LLM_ARCH_RWKV6: return true;
|
|
13808
|
-
case LLM_ARCH_RWKV6QWEN2: return true;
|
|
13809
|
-
case LLM_ARCH_RWKV7: return true;
|
|
13810
|
-
case LLM_ARCH_ARWKV7: return true;
|
|
13811
|
-
default: return false;
|
|
13812
|
-
}
|
|
14931
|
+
return llm_arch_is_recurrent(model->arch);
|
|
13813
14932
|
}
|
|
13814
14933
|
|
|
13815
14934
|
const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
|