@fugood/llama.node 0.3.16 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +6 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +44 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +374 -19
- package/src/LlamaCompletionWorker.h +31 -10
- package/src/LlamaContext.cpp +216 -7
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
- package/src/llama.cpp/.github/workflows/build.yml +89 -767
- package/src/llama.cpp/.github/workflows/docker.yml +9 -6
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +19 -23
- package/src/llama.cpp/CMakeLists.txt +11 -1
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +35 -4
- package/src/llama.cpp/common/arg.cpp +844 -121
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +129 -107
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +64 -518
- package/src/llama.cpp/common/common.h +35 -45
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +31 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
- package/src/llama.cpp/common/minja/minja.hpp +186 -127
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +60 -50
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +2 -32
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +76 -106
- package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
- package/src/llama.cpp/ggml/src/ggml.c +170 -265
- package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
- package/src/llama.cpp/include/llama.h +82 -22
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +5 -3
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +4 -2
- package/src/llama.cpp/src/llama-adapter.cpp +43 -1
- package/src/llama.cpp/src/llama-arch.cpp +163 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +91 -16
- package/src/llama.cpp/src/llama-chat.h +7 -2
- package/src/llama.cpp/src/llama-context.cpp +479 -575
- package/src/llama.cpp/src/llama-context.h +44 -33
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +209 -157
- package/src/llama.cpp/src/llama-graph.h +38 -14
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
- package/src/llama.cpp/src/llama-kv-cache.h +283 -171
- package/src/llama.cpp/src/llama-memory.h +12 -2
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +1803 -330
- package/src/llama.cpp/src/llama-model.h +21 -2
- package/src/llama.cpp/src/llama-quant.cpp +33 -10
- package/src/llama.cpp/src/llama-sampling.cpp +25 -7
- package/src/llama.cpp/src/llama-vocab.cpp +86 -10
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +15 -1
- package/src/llama.cpp/tests/CMakeLists.txt +52 -31
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
- package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
- package/src/llama.cpp/tests/test-chat.cpp +15 -3
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
- package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
- package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
- package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
- package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
- package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
- package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
- package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
- package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
- package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
- package/src/llama.cpp/examples/llava/clip.h +0 -118
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/llava.cpp +0 -574
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
- package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
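
One notable change in the llama-model.cpp diff below is regex-based tensor buffer-type overrides: while loading tensors, each tensor name is matched against a null-terminated list of {pattern, buffer type} overrides, the first match wins, and otherwise the usual buffer-type selection runs. The following is a minimal standalone sketch of that matching logic only; `BuftOverride`, `buft_t`, and `match_buft_override` are illustrative stand-ins, not the actual llama.cpp/ggml types.

```cpp
#include <cstdio>
#include <regex>
#include <string>

// Stand-in for ggml's opaque buffer-type handle; a string keeps the sketch self-contained.
using buft_t = const char *;

// Illustrative override entry: a regex over tensor names plus the buffer type to force.
// Like the list in the diff, it is terminated by an entry whose pattern is nullptr.
struct BuftOverride {
    const char * pattern;
    buft_t       buft;
};

// Mirrors the loop added in llama-model.cpp: first matching pattern wins, nullptr means "no override".
static buft_t match_buft_override(const std::string & tensor_name, const BuftOverride * overrides) {
    if (!overrides) {
        return nullptr;
    }
    for (const BuftOverride * ov = overrides; ov->pattern != nullptr; ++ov) {
        if (std::regex_search(tensor_name, std::regex(ov->pattern))) {
            return ov->buft;
        }
    }
    return nullptr;
}

int main() {
    // Example: keep MoE expert weights on a CPU buffer type, leave everything else to the default path.
    const BuftOverride overrides[] = {
        { "ffn_(up|down|gate)_exps", "CPU" },
        { nullptr, nullptr },
    };

    const char * names[] = { "blk.0.ffn_up_exps.weight", "blk.0.attn_q.weight" };
    for (const char * name : names) {
        buft_t buft = match_buft_override(name, overrides);
        std::printf("%-28s -> %s\n", name, buft ? buft : "(default selection)");
    }
    return 0;
}
```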
|
@@ -17,6 +17,7 @@
|
|
|
17
17
|
#include <cmath>
|
|
18
18
|
#include <functional>
|
|
19
19
|
#include <map>
|
|
20
|
+
#include <regex>
|
|
20
21
|
#include <sstream>
|
|
21
22
|
#include <stdexcept>
|
|
22
23
|
|
|
@@ -39,14 +40,18 @@ const char * llm_type_name(llm_type type) {
|
|
|
39
40
|
case LLM_TYPE_335M: return "335M";
|
|
40
41
|
case LLM_TYPE_410M: return "410M";
|
|
41
42
|
case LLM_TYPE_450M: return "450M";
|
|
43
|
+
case LLM_TYPE_475M: return "475M";
|
|
42
44
|
case LLM_TYPE_770M: return "770M";
|
|
43
45
|
case LLM_TYPE_780M: return "780M";
|
|
44
46
|
case LLM_TYPE_0_5B: return "0.5B";
|
|
47
|
+
case LLM_TYPE_0_6B: return "0.6B";
|
|
45
48
|
case LLM_TYPE_1B: return "1B";
|
|
46
49
|
case LLM_TYPE_1_3B: return "1.3B";
|
|
47
50
|
case LLM_TYPE_1_4B: return "1.4B";
|
|
48
51
|
case LLM_TYPE_1_5B: return "1.5B";
|
|
49
52
|
case LLM_TYPE_1_6B: return "1.6B";
|
|
53
|
+
case LLM_TYPE_1_7B: return "1.7B";
|
|
54
|
+
case LLM_TYPE_1_8B: return "1.8B";
|
|
50
55
|
case LLM_TYPE_2B: return "2B";
|
|
51
56
|
case LLM_TYPE_2_8B: return "2.8B";
|
|
52
57
|
case LLM_TYPE_2_9B: return "2.9B";
|
|
@@ -64,6 +69,7 @@ const char * llm_type_name(llm_type type) {
|
|
|
64
69
|
case LLM_TYPE_15B: return "15B";
|
|
65
70
|
case LLM_TYPE_16B: return "16B";
|
|
66
71
|
case LLM_TYPE_20B: return "20B";
|
|
72
|
+
case LLM_TYPE_27B: return "27B";
|
|
67
73
|
case LLM_TYPE_30B: return "30B";
|
|
68
74
|
case LLM_TYPE_32B: return "32B";
|
|
69
75
|
case LLM_TYPE_34B: return "34B";
|
|
@@ -72,7 +78,9 @@ const char * llm_type_name(llm_type type) {
|
|
|
72
78
|
case LLM_TYPE_65B: return "65B";
|
|
73
79
|
case LLM_TYPE_70B: return "70B";
|
|
74
80
|
case LLM_TYPE_236B: return "236B";
|
|
81
|
+
case LLM_TYPE_290B: return "290B";
|
|
75
82
|
case LLM_TYPE_314B: return "314B";
|
|
83
|
+
case LLM_TYPE_405B: return "405B";
|
|
76
84
|
case LLM_TYPE_671B: return "671B";
|
|
77
85
|
case LLM_TYPE_SMALL: return "0.1B";
|
|
78
86
|
case LLM_TYPE_MEDIUM: return "0.4B";
|
|
@@ -86,7 +94,10 @@ const char * llm_type_name(llm_type type) {
|
|
|
86
94
|
case LLM_TYPE_16x3_8B: return "16x3.8B";
|
|
87
95
|
case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
|
|
88
96
|
case LLM_TYPE_57B_A14B: return "57B.A14B";
|
|
89
|
-
case
|
|
97
|
+
case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
|
|
98
|
+
case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
|
|
99
|
+
case LLM_TYPE_30B_A3B: return "30B.A3B";
|
|
100
|
+
case LLM_TYPE_235B_A22B: return "235B.A22B";
|
|
90
101
|
default: return "?B";
|
|
91
102
|
}
|
|
92
103
|
}
|
|
@@ -106,6 +117,10 @@ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_
|
|
|
106
117
|
{ LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
|
|
107
118
|
};
|
|
108
119
|
|
|
120
|
+
std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
|
|
121
|
+
return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
|
|
122
|
+
}
|
|
123
|
+
|
|
109
124
|
static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
|
|
110
125
|
for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
|
|
111
126
|
if (kv.second == name) {
|
|
@@ -255,7 +270,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
|
|
|
255
270
|
return nullptr;
|
|
256
271
|
}
|
|
257
272
|
|
|
258
|
-
// CPU: ACCEL ->
|
|
273
|
+
// CPU: ACCEL -> GPU host -> CPU extra -> CPU
|
|
259
274
|
static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
|
|
260
275
|
buft_list_t buft_list;
|
|
261
276
|
|
|
@@ -271,32 +286,6 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
|
|
|
271
286
|
}
|
|
272
287
|
}
|
|
273
288
|
|
|
274
|
-
bool has_gpu_device = false;
|
|
275
|
-
for (auto * dev : devices) {
|
|
276
|
-
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
|
|
277
|
-
has_gpu_device = true;
|
|
278
|
-
break;
|
|
279
|
-
}
|
|
280
|
-
}
|
|
281
|
-
|
|
282
|
-
// add extra buffer types, only if no GPU device is present
|
|
283
|
-
// ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
|
|
284
|
-
if (!has_gpu_device) {
|
|
285
|
-
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
|
286
|
-
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
|
|
287
|
-
auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
|
|
288
|
-
ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
|
|
289
|
-
if (ggml_backend_dev_get_extra_bufts_fn) {
|
|
290
|
-
ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
|
|
291
|
-
while (extra_bufts && *extra_bufts) {
|
|
292
|
-
buft_list.emplace_back(cpu_dev, *extra_bufts);
|
|
293
|
-
++extra_bufts;
|
|
294
|
-
}
|
|
295
|
-
}
|
|
296
|
-
} else {
|
|
297
|
-
LLAMA_LOG_WARN("%s: disabling extra buffer types (i.e. repacking) since a GPU device is available\n", __func__);
|
|
298
|
-
}
|
|
299
|
-
|
|
300
289
|
// add a host buffer type
|
|
301
290
|
// storing the tensors in a host buffer is useful when the processing of large batches
|
|
302
291
|
// is offloaded to a GPU device, since it reduces the time spent on data transfers
|
|
@@ -311,6 +300,24 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
|
|
|
311
300
|
}
|
|
312
301
|
}
|
|
313
302
|
|
|
303
|
+
// add extra buffer types, only if no GPU device is present
|
|
304
|
+
// ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
|
|
305
|
+
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
|
306
|
+
if (cpu_dev == nullptr) {
|
|
307
|
+
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
|
308
|
+
}
|
|
309
|
+
|
|
310
|
+
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
|
|
311
|
+
auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
|
|
312
|
+
ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
|
|
313
|
+
if (ggml_backend_dev_get_extra_bufts_fn) {
|
|
314
|
+
ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
|
|
315
|
+
while (extra_bufts && *extra_bufts) {
|
|
316
|
+
buft_list.emplace_back(cpu_dev, *extra_bufts);
|
|
317
|
+
++extra_bufts;
|
|
318
|
+
}
|
|
319
|
+
}
|
|
320
|
+
|
|
314
321
|
// add the CPU buffer type
|
|
315
322
|
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
|
316
323
|
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
|
@@ -388,9 +395,12 @@ struct llama_model::impl {
|
|
|
388
395
|
layer_dev dev_input = {};
|
|
389
396
|
layer_dev dev_output = {};
|
|
390
397
|
std::vector<layer_dev> dev_layer;
|
|
398
|
+
|
|
399
|
+
bool has_tensor_overrides;
|
|
391
400
|
};
|
|
392
401
|
|
|
393
402
|
llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
|
|
403
|
+
pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
|
|
394
404
|
}
|
|
395
405
|
|
|
396
406
|
llama_model::~llama_model() {}
|
|
@@ -556,12 +566,32 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
556
566
|
}
|
|
557
567
|
}
|
|
558
568
|
} break;
|
|
569
|
+
case LLM_ARCH_LLAMA4:
|
|
570
|
+
{
|
|
571
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
572
|
+
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
573
|
+
ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
|
|
574
|
+
hparams.n_swa_pattern = 4; // pattern: 3 chunked - 1 full
|
|
575
|
+
hparams.n_attn_chunk = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
|
|
576
|
+
hparams.n_swa = 1; // TODO @ngxson : this is added to trigger the SWA branch (we store the chunked attn mask in the SWA tensor), will need to clean this up later
|
|
577
|
+
|
|
578
|
+
switch (hparams.n_expert) {
|
|
579
|
+
case 16: type = LLM_TYPE_17B_16E; break;
|
|
580
|
+
case 128: type = LLM_TYPE_17B_128E; break;
|
|
581
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
582
|
+
}
|
|
583
|
+
|
|
584
|
+
if (type == LLM_TYPE_17B_128E) {
|
|
585
|
+
hparams.use_kq_norm = false;
|
|
586
|
+
}
|
|
587
|
+
} break;
|
|
559
588
|
case LLM_ARCH_DECI:
|
|
560
589
|
{
|
|
561
590
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
562
591
|
switch (hparams.n_layer) {
|
|
563
592
|
case 32: type = LLM_TYPE_7B; break;
|
|
564
593
|
case 80: type = LLM_TYPE_70B; break;
|
|
594
|
+
case 162: type = LLM_TYPE_405B; break;
|
|
565
595
|
default: type = LLM_TYPE_UNKNOWN;
|
|
566
596
|
}
|
|
567
597
|
} break;
|
|
@@ -680,13 +710,19 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
680
710
|
}
|
|
681
711
|
} break;
|
|
682
712
|
case LLM_ARCH_NOMIC_BERT:
|
|
713
|
+
case LLM_ARCH_NOMIC_BERT_MOE:
|
|
683
714
|
{
|
|
684
715
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
685
716
|
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
|
686
717
|
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
|
718
|
+
ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
|
|
687
719
|
|
|
688
720
|
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
|
|
689
|
-
|
|
721
|
+
if (arch == LLM_ARCH_NOMIC_BERT) {
|
|
722
|
+
type = LLM_TYPE_137M;
|
|
723
|
+
} else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
|
|
724
|
+
type = LLM_TYPE_475M;
|
|
725
|
+
}
|
|
690
726
|
}
|
|
691
727
|
} break;
|
|
692
728
|
case LLM_ARCH_BLOOM:
|
|
@@ -747,6 +783,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
747
783
|
// fall through
|
|
748
784
|
case LLM_ARCH_QWEN2:
|
|
749
785
|
{
|
|
786
|
+
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
|
750
787
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
751
788
|
switch (hparams.n_layer) {
|
|
752
789
|
case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
|
|
@@ -772,6 +809,28 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
772
809
|
default: type = LLM_TYPE_UNKNOWN;
|
|
773
810
|
}
|
|
774
811
|
} break;
|
|
812
|
+
case LLM_ARCH_QWEN3:
|
|
813
|
+
{
|
|
814
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
815
|
+
switch (hparams.n_layer) {
|
|
816
|
+
case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
|
|
817
|
+
case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
|
|
818
|
+
case 40: type = LLM_TYPE_14B; break;
|
|
819
|
+
case 64: type = LLM_TYPE_32B; break;
|
|
820
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
821
|
+
}
|
|
822
|
+
} break;
|
|
823
|
+
case LLM_ARCH_QWEN3MOE:
|
|
824
|
+
{
|
|
825
|
+
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
|
|
826
|
+
|
|
827
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
828
|
+
switch (hparams.n_layer) {
|
|
829
|
+
case 48: type = LLM_TYPE_30B_A3B; break;
|
|
830
|
+
case 94: type = LLM_TYPE_235B_A22B; break;
|
|
831
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
832
|
+
}
|
|
833
|
+
} break;
|
|
775
834
|
case LLM_ARCH_PHI2:
|
|
776
835
|
{
|
|
777
836
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
@@ -1125,6 +1184,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1125
1184
|
ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
|
|
1126
1185
|
}
|
|
1127
1186
|
ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
|
|
1187
|
+
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false);
|
|
1188
|
+
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
|
|
1128
1189
|
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
1129
1190
|
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
|
1130
1191
|
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
|
@@ -1144,6 +1205,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1144
1205
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1145
1206
|
}
|
|
1146
1207
|
} break;
|
|
1208
|
+
case LLM_ARCH_PLM:
|
|
1209
|
+
{
|
|
1210
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1211
|
+
ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
|
|
1212
|
+
switch (hparams.n_layer) {
|
|
1213
|
+
case 32: type = LLM_TYPE_1_8B; break;
|
|
1214
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1215
|
+
}
|
|
1216
|
+
} break;
|
|
1147
1217
|
case LLM_ARCH_CHATGLM:
|
|
1148
1218
|
{
|
|
1149
1219
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
@@ -1165,6 +1235,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1165
1235
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1166
1236
|
}
|
|
1167
1237
|
} break;
|
|
1238
|
+
case LLM_ARCH_GLM4:
|
|
1239
|
+
{
|
|
1240
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1241
|
+
switch (hparams.n_layer) {
|
|
1242
|
+
case 40: type = LLM_TYPE_9B; break;
|
|
1243
|
+
case 61: type = LLM_TYPE_32B; break;
|
|
1244
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1245
|
+
}
|
|
1246
|
+
} break;
|
|
1168
1247
|
case LLM_ARCH_BITNET:
|
|
1169
1248
|
{
|
|
1170
1249
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
@@ -1310,6 +1389,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1310
1389
|
// Add additional layer/vocab/etc checks here for other model sizes
|
|
1311
1390
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1312
1391
|
}
|
|
1392
|
+
|
|
1393
|
+
// For Granite MoE Shared
|
|
1394
|
+
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
|
|
1313
1395
|
} break;
|
|
1314
1396
|
case LLM_ARCH_CHAMELEON:
|
|
1315
1397
|
{
|
|
@@ -1330,6 +1412,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1330
1412
|
ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
|
|
1331
1413
|
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
|
1332
1414
|
} break;
|
|
1415
|
+
case LLM_ARCH_BAILINGMOE:
|
|
1416
|
+
{
|
|
1417
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1418
|
+
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
|
1419
|
+
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
1420
|
+
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
|
1421
|
+
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
|
1422
|
+
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
|
1423
|
+
|
|
1424
|
+
switch (hparams.n_layer) {
|
|
1425
|
+
case 28: type = LLM_TYPE_16B; break;
|
|
1426
|
+
case 88: type = LLM_TYPE_290B; break;
|
|
1427
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1428
|
+
}
|
|
1429
|
+
} break;
|
|
1333
1430
|
default: throw std::runtime_error("unsupported model architecture");
|
|
1334
1431
|
}
|
|
1335
1432
|
|
|
@@ -1398,6 +1495,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
1398
1495
|
}
|
|
1399
1496
|
|
|
1400
1497
|
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
|
1498
|
+
if (cpu_dev == nullptr) {
|
|
1499
|
+
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
|
1500
|
+
}
|
|
1401
1501
|
const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
|
|
1402
1502
|
const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
|
|
1403
1503
|
auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
|
|
@@ -1557,15 +1657,38 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
1557
1657
|
GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
|
|
1558
1658
|
}
|
|
1559
1659
|
|
|
1560
|
-
ggml_backend_buffer_type_t buft =
|
|
1660
|
+
ggml_backend_buffer_type_t buft = nullptr;
|
|
1661
|
+
|
|
1662
|
+
// check overrides
|
|
1663
|
+
if (ml.tensor_buft_overrides) {
|
|
1664
|
+
std::string tensor_name = tn.str();
|
|
1665
|
+
for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
|
|
1666
|
+
std::regex pattern(overrides->pattern);
|
|
1667
|
+
if (std::regex_search(tensor_name, pattern)) {
|
|
1668
|
+
buft = overrides->buft;
|
|
1669
|
+
LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
|
|
1670
|
+
tensor_name.c_str(),
|
|
1671
|
+
ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
|
|
1672
|
+
ggml_backend_buft_name(buft));
|
|
1673
|
+
break;
|
|
1674
|
+
}
|
|
1675
|
+
}
|
|
1676
|
+
}
|
|
1677
|
+
|
|
1561
1678
|
if (!buft) {
|
|
1562
|
-
|
|
1679
|
+
buft = select_weight_buft(hparams, t_meta, op, *buft_list);
|
|
1680
|
+
if (!buft) {
|
|
1681
|
+
throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
|
|
1682
|
+
}
|
|
1563
1683
|
}
|
|
1564
1684
|
|
|
1565
1685
|
// avoid using a host buffer when using mmap
|
|
1566
1686
|
auto * buft_dev = ggml_backend_buft_get_device(buft);
|
|
1567
1687
|
if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
|
|
1568
1688
|
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
|
1689
|
+
if (!cpu_dev) {
|
|
1690
|
+
throw std::runtime_error("no CPU backend found");
|
|
1691
|
+
}
|
|
1569
1692
|
buft = ggml_backend_dev_buffer_type(cpu_dev);
|
|
1570
1693
|
}
|
|
1571
1694
|
|
|
@@ -1652,6 +1775,63 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
1652
1775
|
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
|
|
1653
1776
|
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
|
|
1654
1777
|
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
|
|
1778
|
+
|
|
1779
|
+
// For Granite MoE Shared
|
|
1780
|
+
if (hparams.n_ff_shexp > 0) {
|
|
1781
|
+
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
|
|
1782
|
+
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
|
|
1783
|
+
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
|
|
1784
|
+
}
|
|
1785
|
+
}
|
|
1786
|
+
}
|
|
1787
|
+
} break;
|
|
1788
|
+
case LLM_ARCH_LLAMA4:
|
|
1789
|
+
{
|
|
1790
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
1791
|
+
|
|
1792
|
+
// output
|
|
1793
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
1794
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
1795
|
+
|
|
1796
|
+
// if output is NULL, init from the input tok embed
|
|
1797
|
+
if (output == NULL) {
|
|
1798
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
1799
|
+
}
|
|
1800
|
+
|
|
1801
|
+
GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Llama 4 requires n_moe_layer_step > 0");
|
|
1802
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
1803
|
+
bool is_moe_layer = (i + 1) % hparams.n_moe_layer_step == 0;
|
|
1804
|
+
|
|
1805
|
+
auto & layer = layers[i];
|
|
1806
|
+
|
|
1807
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
1808
|
+
|
|
1809
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
1810
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
1811
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
|
1812
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
|
1813
|
+
|
|
1814
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
1815
|
+
|
|
1816
|
+
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
|
1817
|
+
|
|
1818
|
+
if (is_moe_layer) {
|
|
1819
|
+
int n_ff_exp = hparams.n_ff_exp;
|
|
1820
|
+
|
|
1821
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
|
1822
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
|
|
1823
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
|
|
1824
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
|
|
1825
|
+
|
|
1826
|
+
// Shared expert
|
|
1827
|
+
const int64_t n_ff_shexp = n_ff_exp;
|
|
1828
|
+
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
|
|
1829
|
+
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd }, 0);
|
|
1830
|
+
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
|
|
1831
|
+
} else {
|
|
1832
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
1833
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
1834
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
1655
1835
|
}
|
|
1656
1836
|
}
|
|
1657
1837
|
} break;
|
|
@@ -1697,7 +1877,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
1697
1877
|
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
1698
1878
|
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
1699
1879
|
|
|
1700
|
-
|
|
1880
|
+
if (n_ff > 0) {
|
|
1881
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
1882
|
+
}
|
|
1701
1883
|
|
|
1702
1884
|
if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
|
|
1703
1885
|
layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
|
@@ -1707,9 +1889,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
1707
1889
|
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
|
1708
1890
|
}
|
|
1709
1891
|
|
|
1710
|
-
|
|
1711
|
-
|
|
1712
|
-
|
|
1892
|
+
if (n_ff > 0) {
|
|
1893
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
1894
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
1895
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
1896
|
+
}
|
|
1713
1897
|
|
|
1714
1898
|
// optional MLP bias
|
|
1715
1899
|
layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
|
|
@@ -1924,6 +2108,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
1924
2108
|
} break;
|
|
1925
2109
|
case LLM_ARCH_BERT:
|
|
1926
2110
|
case LLM_ARCH_NOMIC_BERT:
|
|
2111
|
+
case LLM_ARCH_NOMIC_BERT_MOE:
|
|
1927
2112
|
{
|
|
1928
2113
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
1929
2114
|
type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
|
|
@@ -1957,20 +2142,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
1957
2142
|
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
|
1958
2143
|
}
|
|
1959
2144
|
|
|
2145
|
+
if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
|
|
2146
|
+
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
|
|
2147
|
+
}
|
|
2148
|
+
|
|
1960
2149
|
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
1961
2150
|
|
|
1962
2151
|
layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
|
|
1963
2152
|
layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
|
|
1964
2153
|
|
|
1965
|
-
|
|
1966
|
-
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
1967
|
-
|
|
1968
|
-
if (arch == LLM_ARCH_BERT) {
|
|
2154
|
+
if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
|
|
1969
2155
|
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
|
1970
|
-
layer.
|
|
1971
|
-
layer.
|
|
2156
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
|
|
2157
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
|
|
2158
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
|
1972
2159
|
} else {
|
|
1973
|
-
layer.
|
|
2160
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
2161
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
2162
|
+
|
|
2163
|
+
if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
|
|
2164
|
+
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
|
2165
|
+
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
|
|
2166
|
+
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
|
|
2167
|
+
} else {
|
|
2168
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
2169
|
+
}
|
|
1974
2170
|
}
|
|
1975
2171
|
|
|
1976
2172
|
layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
|
|
@@ -2254,6 +2450,77 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2254
2450
|
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
|
|
2255
2451
|
}
|
|
2256
2452
|
} break;
|
|
2453
|
+
case LLM_ARCH_QWEN3:
|
|
2454
|
+
{
|
|
2455
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2456
|
+
|
|
2457
|
+
// output
|
|
2458
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2459
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
2460
|
+
// if output is NULL, init from the input tok embed
|
|
2461
|
+
if (output == NULL) {
|
|
2462
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
2463
|
+
}
|
|
2464
|
+
|
|
2465
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2466
|
+
auto & layer = layers[i];
|
|
2467
|
+
|
|
2468
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2469
|
+
|
|
2470
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
2471
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2472
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2473
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
|
2474
|
+
|
|
2475
|
+
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
2476
|
+
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
2477
|
+
|
|
2478
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
2479
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
2480
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
2481
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
2482
|
+
}
|
|
2483
|
+
} break;
|
|
2484
|
+
case LLM_ARCH_QWEN3MOE:
|
|
2485
|
+
{
|
|
2486
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2487
|
+
|
|
2488
|
+
// output
|
|
2489
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2490
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
2491
|
+
|
|
2492
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2493
|
+
auto & layer = layers[i];
|
|
2494
|
+
|
|
2495
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2496
|
+
|
|
2497
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
2498
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2499
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2500
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
|
2501
|
+
|
|
2502
|
+
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
2503
|
+
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
2504
|
+
|
|
2505
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
2506
|
+
|
|
2507
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
|
2508
|
+
|
|
2509
|
+
if (n_expert == 0) {
|
|
2510
|
+
throw std::runtime_error("n_expert must be > 0 for QWEN3MOE");
|
|
2511
|
+
}
|
|
2512
|
+
if (n_expert_used == 0) {
|
|
2513
|
+
throw std::runtime_error("n_expert_used must be > 0 for QWEN3MOE");
|
|
2514
|
+
}
|
|
2515
|
+
|
|
2516
|
+
// MoE branch
|
|
2517
|
+
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
|
|
2518
|
+
|
|
2519
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
|
2520
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
|
2521
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
|
2522
|
+
}
|
|
2523
|
+
} break;
|
|
2257
2524
|
case LLM_ARCH_PHI2:
|
|
2258
2525
|
{
|
|
2259
2526
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
@@ -3003,8 +3270,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
3003
3270
|
{
|
|
3004
3271
|
const bool is_lite = (hparams.n_layer == 27);
|
|
3005
3272
|
|
|
3273
|
+
const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
|
|
3274
|
+
|
|
3275
|
+
// note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
|
|
3276
|
+
const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
|
|
3277
|
+
const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
|
|
3278
|
+
|
|
3006
3279
|
const int64_t n_embd_head_qk_rope = hparams.n_rot;
|
|
3007
|
-
const int64_t n_embd_head_qk_nope =
|
|
3280
|
+
const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
|
|
3008
3281
|
|
|
3009
3282
|
const int64_t q_lora_rank = hparams.n_lora_q;
|
|
3010
3283
|
const int64_t kv_lora_rank = hparams.n_lora_kv;
|
|
@@ -3030,14 +3303,22 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
3030
3303
|
|
|
3031
3304
|
if (!is_lite) {
|
|
3032
3305
|
layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
|
|
3033
|
-
layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head *
|
|
3306
|
+
layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
|
|
3034
3307
|
} else {
|
|
3035
|
-
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd,
|
|
3308
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
|
|
3036
3309
|
}
|
|
3037
3310
|
|
|
3038
|
-
layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank +
|
|
3039
|
-
|
|
3040
|
-
|
|
3311
|
+
layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
|
|
3312
|
+
|
|
3313
|
+
// note: only old legacy GGUF files will have the unsplit wkv_b tensor in
|
|
3314
|
+
if (is_mla) {
|
|
3315
|
+
layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
|
|
3316
|
+
layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
|
|
3317
|
+
} else {
|
|
3318
|
+
layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
|
|
3319
|
+
}
|
|
3320
|
+
|
|
3321
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
|
|
3041
3322
|
|
|
3042
3323
|
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
3043
3324
|
|
|
@@ -3068,6 +3349,35 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
3068
3349
|
}
|
|
3069
3350
|
}
|
|
3070
3351
|
} break;
|
|
3352
|
+
case LLM_ARCH_PLM:
|
|
3353
|
+
{
|
|
3354
|
+
const int64_t n_embd_head_qk_rope = hparams.n_rot;
|
|
3355
|
+
const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
|
|
3356
|
+
const int64_t kv_lora_rank = hparams.n_lora_kv;
|
|
3357
|
+
|
|
3358
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
3359
|
+
|
|
3360
|
+
// output
|
|
3361
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
3362
|
+
// output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
3363
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
3364
|
+
|
|
3365
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
3366
|
+
auto & layer = layers[i];
|
|
3367
|
+
|
|
3368
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
3369
|
+
|
|
3370
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
3371
|
+
layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
|
|
3372
|
+
layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
|
|
3373
|
+
layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
|
|
3374
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
|
|
3375
|
+
|
|
3376
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
3377
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
3378
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
3379
|
+
}
|
|
3380
|
+
} break;
|
|
3071
3381
|
case LLM_ARCH_BITNET:
|
|
3072
3382
|
{
|
|
3073
3383
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
@@ -3227,7 +3537,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
3227
3537
|
|
|
3228
3538
|
// output
|
|
3229
3539
|
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
3230
|
-
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
|
3540
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
3541
|
+
// if output is NULL, init from the input tok embed
|
|
3542
|
+
if (output == NULL) {
|
|
3543
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
3544
|
+
}
|
|
3231
3545
|
|
|
3232
3546
|
for (int i = 0; i < n_layer; ++i) {
|
|
3233
3547
|
auto & layer = layers[i];
|
|
@@ -3254,21 +3568,60 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
}
} break;
- case
+ case LLM_ARCH_GLM4:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

// output
- output_norm
-
- output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }

for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];

- layer.attn_norm
- layer.
-
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
+ if (layer.wqkv == nullptr) {
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ }
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
+
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_NEMOTRON:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
@@ -3712,6 +4065,46 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
} break;
+ case LLM_ARCH_BAILINGMOE:
+ {
+ const int64_t n_ff_exp = hparams.n_ff_exp;
+ const int64_t n_expert_shared = hparams.n_expert_shared;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+ if (n_expert == 0) {
+ throw std::runtime_error("n_expert must be > 0");
+ }
+ if (n_expert_used == 0) {
+ throw std::runtime_error("n_expert_used must be > 0");
+ }
+
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+ }
+ } break;
default:
throw std::runtime_error("unknown architecture");
}
@@ -3753,6 +4146,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
if (!dev) {
// FIXME: workaround for CPU backend buft having a NULL device
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (!dev) {
+ throw std::runtime_error(format("%s: no CPU backend found", __func__));
+ }
}
ggml_backend_dev_props props;
ggml_backend_dev_get_props(dev, &props);
@@ -3882,7 +4278,7 @@ uint64_t llama_model::n_elements() const {
}

void llama_model::print_info() const {
- const
+ const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);

auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
bool is_var = false;
@@ -3943,7 +4339,7 @@ void llama_model::print_info() const {
LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
- LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
+ LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
@@ -3980,6 +4376,8 @@ void llama_model::print_info() const {
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
+ LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
+ LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
@@ -3993,10 +4391,25 @@ void llama_model::print_info() const {
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
}

- if (arch ==
+ if (arch == LLM_ARCH_QWEN3MOE) {
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ }
+
+ if (arch == LLM_ARCH_MINICPM ||
+ arch == LLM_ARCH_GRANITE ||
+ arch == LLM_ARCH_GRANITE_MOE) {
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
+ }
+
+ if (arch == LLM_ARCH_BAILINGMOE) {
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+ LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
}

vocab.print_info();
@@ -4060,6 +4473,10 @@ ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
});
}

+ bool llama_model::has_tensor_overrides() const {
+ return pimpl->has_tensor_overrides;
+ }
+
const ggml_tensor * llama_model::get_tensor(const char * name) const {
auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
[name](const std::pair<std::string, ggml_tensor *> & it) {
@@ -4072,6 +4489,19 @@ const ggml_tensor * llama_model::get_tensor(const char * name) const {
return it->second;
}

+ ggml_tensor * llama_model::get_rope_factors(uint32_t n_ctx_per_seq, int il) const {
+ // choose long/short freq factors based on the context size
+ if (layers[il].rope_freqs != nullptr) {
+ return layers[il].rope_freqs;
+ }
+
+ if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
+ return layers[il].rope_long;
+ }
+
+ return layers[il].rope_short;
+ }
+
struct llm_build_llama : public llm_graph_context {
llm_build_llama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
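The new llama_model::get_rope_factors() centralizes what the individual builders previously repeated at each `rope_factors =` call site: prefer the per-layer rope_freqs tensor when present, otherwise choose the long or short frequency factors depending on whether the per-sequence context exceeds the original YaRN training context. A standalone sketch of the same decision, with hypothetical plain fields instead of ggml tensors:

    #include <cstdint>
    #include <cstdio>

    struct LayerRope { const float * rope_freqs; const float * rope_long; const float * rope_short; };

    // Mirrors get_rope_factors(): rope_freqs wins, else long vs. short by context size.
    static const float * pick_rope_factors(const LayerRope & l, uint32_t n_ctx_per_seq, uint32_t n_ctx_orig_yarn) {
        if (l.rope_freqs) return l.rope_freqs;
        return n_ctx_per_seq > n_ctx_orig_yarn ? l.rope_long : l.rope_short;
    }

    int main() {
        float lng = 1.0f, shrt = 2.0f;
        LayerRope l{nullptr, &lng, &shrt};
        std::printf("%s factors\n", pick_rope_factors(l, 131072, 4096) == &lng ? "long" : "short");
    }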
@@ -4087,12 +4517,22 @@ struct llm_build_llama : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

+ // temperature tuning
+ ggml_tensor * inp_attn_scale = nullptr;
+ if (arch == LLM_ARCH_LLAMA4) {
+ inp_attn_scale = build_inp_attn_scale();
+ }
+
auto * inp_attn = build_attn_inp_kv_unified();

const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;

+ bool use_rope = arch == LLM_ARCH_LLAMA4
+ ? (il + 1) % hparams.n_no_rope_layer_step != 0
+ : true;
+
// norm
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
@@ -4102,7 +4542,7 @@ struct llm_build_llama : public llm_graph_context {
// self-attention
{
// rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors =
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);

// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4130,25 +4570,38 @@ struct llm_build_llama : public llm_graph_context {
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

-
-
-
-
-
+ if (use_rope) {
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );

-
-
-
-
-
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ } else if (inp_attn_scale) {
+ Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+ }

cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);

+ if (arch == LLM_ARCH_LLAMA4 && use_rope && hparams.use_kq_norm) {
+ // Llama4TextL2Norm
+ Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
+ Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
+ cb(Qcur, "Qcur_normed", il);
+ cb(Kcur, "Kcur_normed", il);
+ }
+
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
}

if (il == n_layer - 1) {
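For LLM_ARCH_LLAMA4 the builder now skips RoPE on every n_no_rope_layer_step-th layer; on those NoPE layers Q is instead multiplied by the temperature-tuning attention-scale input. A standalone sketch of the layer predicate only (the step value of 4 below is an illustrative example, not taken from the diff):

    #include <cstdio>

    // use_rope is false once every n_no_rope_layer_step layers (1-based), as in the diff.
    static bool layer_uses_rope(int il, int n_no_rope_layer_step) {
        return (il + 1) % n_no_rope_layer_step != 0;
    }

    int main() {
        const int step = 4; // example value only
        for (int il = 0; il < 8; ++il) {
            std::printf("layer %d: %s\n", il, layer_uses_rope(il, step) ? "RoPE" : "NoPE + attn scale");
        }
    }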
@@ -4158,15 +4611,10 @@ struct llm_build_llama : public llm_graph_context {
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}

- // For Granite architecture
- if (hparams.f_residual_scale) {
- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
- }
-
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);

- // feed-forward network
+ // feed-forward network (non-MoE)
if (model.layers[il].ffn_gate_inp == nullptr) {

cur = build_norm(ffn_inp,
@@ -4181,6 +4629,38 @@ struct llm_build_llama : public llm_graph_context {
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
+
+ } else if (arch == LLM_ARCH_LLAMA4) {
+ // llama4 MoE
+ ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * moe_out = build_moe_ffn(ffn_inp_normed,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, false,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
+ il);
+
+ // Shared experts
+ ggml_tensor * shexp_out = build_ffn(ffn_inp_normed,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(shexp_out, "ffn_moe_shexp", il);
+
+ cur = ggml_add(ctx0, moe_out, shexp_out);
+ cb(cur, "ffn_moe_out_merged", il);
+
} else {
// MoE branch
cur = build_norm(ffn_inp,
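In the Llama 4 branch above, the routed experts are gated with a sigmoid (LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID) and their output is summed with an always-on shared expert. A toy scalar version of that combination, which the ggml graph performs per token over tensors:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Toy illustration: sigmoid-gate the top-k routed experts, then add a shared expert.
    static float moe_with_shared(const std::vector<float> & router_logits,
                                 const std::vector<float> & expert_out,
                                 float shared_expert_out, int k) {
        std::vector<int> idx(router_logits.size());
        for (size_t i = 0; i < idx.size(); ++i) idx[i] = (int) i;
        std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                          [&](int a, int b) { return router_logits[a] > router_logits[b]; });
        float routed = 0.0f;
        for (int j = 0; j < k; ++j) {
            const float gate = 1.0f / (1.0f + std::exp(-router_logits[idx[j]])); // sigmoid gating
            routed += gate * expert_out[idx[j]];
        }
        return routed + shared_expert_out; // analogous to ggml_add(moe_out, shexp_out)
    }

    int main() {
        std::printf("%.3f\n", moe_with_shared({2.0f, -1.0f, 0.5f}, {1.0f, 2.0f, 3.0f}, 0.25f, 2));
    }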
@@ -4202,11 +4682,6 @@ struct llm_build_llama : public llm_graph_context {
cb(cur, "ffn_moe_out", il);
}

- // For Granite architecture
- if (hparams.f_residual_scale) {
- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
- }
-
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il);

@@ -4229,11 +4704,6 @@ struct llm_build_llama : public llm_graph_context {
// lm_head
cur = build_lora_mm(model.output, cur);

- // For Granite architecture
- if (hparams.f_logit_scale) {
- cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
- }
-
cb(cur, "result_output", -1);
res->t_logits = cur;

@@ -4263,6 +4733,7 @@ struct llm_build_deci : public llm_graph_context {
ggml_tensor * inpSA = inpL;
const int64_t n_head_kv = hparams.n_head_kv(il);
const int64_t n_head = hparams.n_head(il);
+ const int64_t n_ff = hparams.n_ff(il);

if (n_head == 0) {
// attention-free layer of Llama-3_1-Nemotron-51B
@@ -4282,7 +4753,7 @@ struct llm_build_deci : public llm_graph_context {
} else if (n_head > 0) {
// self-attention
// rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors =
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);

// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4328,7 +4799,7 @@ struct llm_build_deci : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
}

if (il == n_layer - 1) {
@@ -4338,9 +4809,9 @@ struct llm_build_deci : public llm_graph_context {
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}

- //
- if (
-
+ // FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
+ if (n_ff == 0) {
+ continue;
}

// modified to support attention-free layer of Llama-3_1-Nemotron-51B
@@ -4366,11 +4837,6 @@ struct llm_build_deci : public llm_graph_context {
cb(cur, "ffn_out", il);
}

- // For Granite architecture
- if (hparams.f_residual_scale) {
- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
- }
-
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il);

@@ -4393,11 +4859,6 @@ struct llm_build_deci : public llm_graph_context {
// lm_head
cur = build_lora_mm(model.output, cur);

- // For Granite architecture
- if (hparams.f_logit_scale) {
- cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
- }
-
cb(cur, "result_output", -1);
res->t_logits = cur;

@@ -4470,7 +4931,7 @@ struct llm_build_baichuan : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1) {
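Every build_attn() call site in this file gains one more nullptr between the existing nullptr and the KQ scale. The new parameter's declaration is not part of this diff excerpt; judging from the DeepSeek2 changes elsewhere in the file it appears to be an optional extra tensor (likely the MLA value projection), so at these call sites the change is purely mechanical and behaviour is unchanged. A sketch of the same kind of source-compatible extension, with hypothetical names only:

    #include <cstdio>

    struct Tensor { float v; };

    // Hypothetical: the attention helper grows an extra optional tensor argument.
    // Existing callers just insert one more nullptr; the result is identical when it is null.
    static float build_attn_like(const Tensor * q, const Tensor * k, const Tensor * v,
                                 const Tensor * kq_b, const Tensor * extra /* new, optional */,
                                 float kq_scale) {
        float out = q->v * k->v * kq_scale * v->v;
        if (kq_b)  out += kq_b->v;
        if (extra) out *= extra->v; // only architectures that need it pass a non-null tensor
        return out;
    }

    int main() {
        Tensor q{1}, k{2}, v{3};
        std::printf("%.2f\n", build_attn_like(&q, &k, &v, nullptr, nullptr, 0.125f));
    }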
@@ -4585,7 +5046,7 @@ struct llm_build_xverse : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1) {
@@ -4710,7 +5171,7 @@ struct llm_build_falcon : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1) {
@@ -4840,7 +5301,7 @@ struct llm_build_grok : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
}

if (il == n_layer - 1) {
@@ -4991,7 +5452,7 @@ struct llm_build_dbrx : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1) {
@@ -5105,7 +5566,7 @@ struct llm_build_starcoder : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1) {
@@ -5204,7 +5665,7 @@ struct llm_build_refact : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1) {
@@ -5331,6 +5792,11 @@ struct llm_build_bert : public llm_graph_context {
cur = build_lora_mm(model.layers[il].wqkv, cur);
cb(cur, "wqkv", il);

+ if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+ }
+
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
@@ -5358,7 +5824,7 @@ struct llm_build_bert : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
cb(cur, "kqv_out", il);

if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
@@ -5383,13 +5849,29 @@ struct llm_build_bert : public llm_graph_context {
cb(ffn_inp, "ffn_inp", il);

// feed-forward network
- if (
+ if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
+ // MoE branch
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ nullptr,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ hparams.n_expert,
+ hparams.n_expert_used,
+ LLM_FFN_GELU,
+ false, false,
+ 0.0f,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
+ cb(cur, "ffn_moe_out", il);
+ } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
cur = build_ffn(cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
NULL, NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
} else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
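The moe_every_n_layers hparam routes only a subset of BERT layers through experts (layers where il % moe_every_n_layers == 1); the rest keep the dense FFN. A tiny predicate sketch of that schedule, where the interval of 2 is just an example value:

    #include <cstdio>

    // Mirrors the condition in the diff: MoE only when the hparam is set and the layer index matches.
    static bool layer_is_moe(int il, int moe_every_n_layers) {
        return moe_every_n_layers > 0 && il % moe_every_n_layers == 1;
    }

    int main() {
        for (int il = 0; il < 6; ++il) {
            std::printf("layer %d -> %s FFN\n", il, layer_is_moe(il, 2) ? "MoE" : "dense");
        }
    }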
@@ -5397,6 +5879,7 @@ struct llm_build_bert : public llm_graph_context {
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
} else {
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
@@ -5404,8 +5887,8 @@ struct llm_build_bert : public llm_graph_context {
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
}
- cb(cur, "ffn_out", il);

// attentions bypass the intermediate layer
cur = ggml_add(ctx0, cur, ffn_inp);
@@ -5475,7 +5958,7 @@ struct llm_build_bloom : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1) {
@@ -5616,7 +6099,7 @@ struct llm_build_mpt : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1) {
@@ -5762,7 +6245,7 @@ struct llm_build_stablelm : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1) {
@@ -5885,7 +6368,7 @@ struct llm_build_qwen : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1) {
@@ -6005,7 +6488,7 @@ struct llm_build_qwen2 : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1) {
@@ -6126,7 +6609,7 @@ struct llm_build_qwen2vl : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1) {
@@ -6253,7 +6736,7 @@ struct llm_build_qwen2moe : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1) {
@@ -6284,7 +6767,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
false, 0.0,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
- cb(
+ cb(moe_out, "ffn_moe_out", il);

// FFN shared expert
{
@@ -6340,16 +6823,14 @@ struct llm_build_qwen2moe : public llm_graph_context {
}
};

- struct
-
+ struct llm_build_qwen3 : public llm_graph_context {
+ llm_build_qwen3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();

GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);

ggml_tensor * cur;
- ggml_tensor * attn_norm_output;
- ggml_tensor * ffn_output;
ggml_tensor * inpL;

inpL = build_inp_embd(model.tok_embd);
@@ -6360,48 +6841,42 @@ struct llm_build_phi2 : public llm_graph_context {
auto * inp_attn = build_attn_inp_kv_unified();

for (int il = 0; il < n_layer; ++il) {
-
-
-
-
-
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);

// self-attention
{
-
- ggml_tensor *
- ggml_tensor * Vcur = nullptr;
-
- if (model.layers[il].wqkv) {
- cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
- cb(cur, "wqkv", il);
-
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
-
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
- } else {
- Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
- Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
- Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
- }
-
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);

Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);

+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -6412,36 +6887,36 @@ struct llm_build_phi2 : public llm_graph_context {
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);

- // with phi2, we scale the Q to avoid precision issues
- // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
- Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
-
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1) {
// skip computing output for unused tokens
ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur
-
- attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}

-
-
- ffn_output = build_ffn(attn_norm_output,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
- cb(ffn_output, "ffn_out", il);
- }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);

-
- cur =
+ // feed-forward network
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);

cur = build_cvec(cur, il);
cb(cur, "l_out", il);
@@ -6450,10 +6925,267 @@ struct llm_build_phi2 : public llm_graph_context {
inpL = cur;
}

- cur =
-
-
-
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
+
+ struct llm_build_qwen3moe : public llm_graph_context {
+ llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv_unified();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // MoE branch
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * moe_out =
+ build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+ cur = moe_out;
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
+
+ struct llm_build_phi2 : public llm_graph_context {
+ llm_build_phi2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * attn_norm_output;
+ ggml_tensor * ffn_output;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv_unified();
+
+ for (int il = 0; il < n_layer; ++il) {
+ attn_norm_output = build_norm(inpL,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, il);
+ cb(attn_norm_output, "attn_norm", il);
+
+ // self-attention
+ {
+ ggml_tensor * Qcur = nullptr;
+ ggml_tensor * Kcur = nullptr;
+ ggml_tensor * Vcur = nullptr;
+
+ if (model.layers[il].wqkv) {
+ cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
+ cb(cur, "wqkv", il);
+
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ } else {
+ Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
+ Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
+ Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+ }
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ // with phi2, we scale the Q to avoid precision issues
+ // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
+
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
+ }
+
+ // FF
+ {
+ ffn_output = build_ffn(attn_norm_output,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, il);
+ cb(ffn_output, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_output);
+ cur = ggml_add(ctx0, cur, inpL);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = build_norm(inpL,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, -1);

cb(cur, "result_norm", -1);
res->t_embd = cur;
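The new Qwen3/Qwen3MoE builders RMS-normalize Q and K per head (attn_q_norm / attn_k_norm) before applying RoPE. A scalar sketch of that ordering, with a plain RMS norm standing in for ggml_rms_norm:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Plain RMS norm over one head's vector, same role as the per-head norm in the builder.
    static void rms_norm(std::vector<float> & x, float eps = 1e-6f) {
        float ss = 0.0f;
        for (float v : x) ss += v * v;
        const float scale = 1.0f / std::sqrt(ss / x.size() + eps);
        for (float & v : x) v *= scale;
    }

    int main() {
        std::vector<float> q = {0.5f, -2.0f, 1.5f, 3.0f};
        rms_norm(q);   // 1) normalize per head (Qcur_normed)
        // 2) ...RoPE would then be applied to the normalized q (ggml_rope_ext in the diff)
        for (float v : q) std::printf("%.3f ", v);
        std::printf("\n");
    }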
@@ -6493,7 +7225,7 @@ struct llm_build_phi3 : public llm_graph_context {
// self-attention
{
// rope freq factors for 128k context
- ggml_tensor * rope_factors =
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);

ggml_tensor* attn_norm_output = build_norm(inpL,
model.layers[il].attn_norm,
@@ -6547,7 +7279,7 @@ struct llm_build_phi3 : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
}

if (il == n_layer - 1) {
@@ -6682,7 +7414,7 @@ struct llm_build_plamo : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
ggml_tensor * sa_out = cur;

@@ -6789,7 +7521,7 @@ struct llm_build_gpt2 : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1) {
@@ -6905,7 +7637,7 @@ struct llm_build_codeshell : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1) {
@@ -7034,7 +7766,7 @@ struct llm_build_orion : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1) {
@@ -7161,7 +7893,7 @@ struct llm_build_internlm2 : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1) {
@@ -7245,7 +7977,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;

- ggml_tensor * rope_factors =
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);

// norm
cur = build_norm(inpL,
@@ -7358,7 +8090,7 @@ struct llm_build_minicpm3 : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- q_states, k_states, v_states, nullptr, kq_scale, il);
+ q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
}

if (il == n_layer - 1) {
@@ -7488,7 +8220,7 @@ struct llm_build_gemma : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
}

if (il == n_layer - 1) {
@@ -7610,7 +8342,7 @@ struct llm_build_gemma2 : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
}

cur = build_norm(cur,
@@ -7751,7 +8483,7 @@ struct llm_build_gemma3 : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, hparams.f_attention_scale, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
}

cur = build_norm(cur,
@@ -7891,7 +8623,7 @@ struct llm_build_starcoder2 : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1) {
@@ -8012,7 +8744,7 @@ struct llm_build_mamba : public llm_graph_context {
ggml_tensor * state_mask,
const llama_ubatch & ubatch,
int il) const {
- const
+ const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);

const auto kv_head = kv_self->head;

@@ -8226,7 +8958,7 @@ struct llm_build_command_r : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1) {
@@ -8313,7 +9045,7 @@ struct llm_build_cohere2 : public llm_graph_context {
// self-attention
{
// rope freq factors for 128k context
- ggml_tensor * rope_factors =
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);

// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -8361,7 +9093,7 @@ struct llm_build_cohere2 : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1) {
@@ -8492,7 +9224,7 @@ struct llm_build_olmo : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, nullptr,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1) {
@@ -8612,7 +9344,7 @@ struct llm_build_olmo2 : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

cur = build_norm(cur,
@@ -8745,7 +9477,7 @@ struct llm_build_olmoe : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1) {
@@ -8878,7 +9610,7 @@ struct llm_build_openelm : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1) {
@@ -8992,7 +9724,7 @@ struct llm_build_gptneox : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1) {
@@ -9142,7 +9874,7 @@ struct llm_build_arctic : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1) {
@@ -9251,7 +9983,7 @@ struct llm_build_deepseek : public llm_graph_context {
// self-attention
{
// rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors =
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);

// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -9297,7 +10029,7 @@ struct llm_build_deepseek : public llm_graph_context {

cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
}

if (il == n_layer - 1) {
@@ -9387,15 +10119,22 @@ struct llm_build_deepseek2 : public llm_graph_context {
llm_build_deepseek2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
bool is_lite = (hparams.n_layer == 27);

+ const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
+
+ // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
+ const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
+ const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
+
+ const int64_t n_embd_head_qk_rope = hparams.n_rot;
+ const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
+
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
+
// We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
- const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(
- const float
-
- const uint32_t n_embd_head_qk_rope = hparams.n_rot;
- const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
- const uint32_t kv_lora_rank = hparams.n_lora_kv;
+ const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(n_embd_head_k));
+ const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));

ggml_tensor * cur;
ggml_tensor * inpL;
@@ -9421,16 +10160,14 @@ struct llm_build_deepseek2 : public llm_graph_context {
|
|
|
9421
10160
|
{
|
|
9422
10161
|
ggml_tensor * q = NULL;
|
|
9423
10162
|
if (!is_lite) {
|
|
9424
|
-
// {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
|
|
9425
10163
|
q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
|
|
9426
10164
|
cb(q, "q", il);
|
|
9427
10165
|
|
|
9428
10166
|
q = build_norm(q,
|
|
9429
|
-
model.layers[il].attn_q_a_norm,
|
|
10167
|
+
model.layers[il].attn_q_a_norm, nullptr,
|
|
9430
10168
|
LLM_NORM_RMS, il);
|
|
9431
10169
|
cb(q, "q", il);
|
|
9432
10170
|
|
|
9433
|
-
// {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
|
|
9434
10171
|
q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
|
|
9435
10172
|
cb(q, "q", il);
|
|
9436
10173
|
} else {
|
|
@@ -9438,96 +10175,125 @@ struct llm_build_deepseek2 : public llm_graph_context {
|
|
|
9438
10175
|
cb(q, "q", il);
|
|
9439
10176
|
}
|
|
9440
10177
|
|
|
9441
|
-
// split into {
|
|
9442
|
-
ggml_tensor * q_nope = ggml_view_3d(ctx0, q,
|
|
9443
|
-
|
|
9444
|
-
ggml_row_size(q->type,
|
|
10178
|
+
// split into {n_embd_head_qk_nope, n_head, n_tokens}
|
|
10179
|
+
ggml_tensor * q_nope = ggml_view_3d(ctx0, q,
|
|
10180
|
+
n_embd_head_qk_nope, n_head, n_tokens,
|
|
10181
|
+
ggml_row_size(q->type, n_embd_head_k),
|
|
10182
|
+
ggml_row_size(q->type, n_embd_head_k) * n_head,
|
|
9445
10183
|
0);
|
|
9446
10184
|
cb(q_nope, "q_nope", il);
|
|
9447
10185
|
|
|
9448
|
-
// and {
|
|
9449
|
-
ggml_tensor * q_pe = ggml_view_3d(ctx0, q,
|
|
9450
|
-
|
|
9451
|
-
ggml_row_size(q->type,
|
|
10186
|
+
// and {n_embd_head_qk_rope, n_head, n_tokens}
|
|
10187
|
+
ggml_tensor * q_pe = ggml_view_3d(ctx0, q,
|
|
10188
|
+
n_embd_head_qk_rope, n_head, n_tokens,
|
|
10189
|
+
ggml_row_size(q->type, n_embd_head_k),
|
|
10190
|
+
ggml_row_size(q->type, n_embd_head_k) * n_head,
|
|
9452
10191
|
ggml_row_size(q->type, n_embd_head_qk_nope));
|
|
9453
10192
|
cb(q_pe, "q_pe", il);
|
|
9454
10193
|
|
|
9455
|
-
|
|
9456
|
-
|
|
9457
|
-
cb(kv_pe_compresseed, "kv_pe_compresseed", il);
|
|
10194
|
+
ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
|
|
10195
|
+
cb(kv_cmpr_pe, "kv_cmpr_pe", il);
|
|
9458
10196
|
|
|
9459
10197
|
// split into {kv_lora_rank, n_tokens}
|
|
9460
|
-
ggml_tensor *
|
|
9461
|
-
|
|
10198
|
+
ggml_tensor * kv_cmpr = ggml_view_2d(ctx0, kv_cmpr_pe,
|
|
10199
|
+
kv_lora_rank, n_tokens,
|
|
10200
|
+
ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
|
|
9462
10201
|
0);
|
|
9463
|
-
cb(
|
|
10202
|
+
cb(kv_cmpr, "kv_cmpr", il);
|
|
10203
|
+
|
|
10204
|
+
// and {n_embd_head_qk_rope, 1, n_tokens}
|
|
10205
|
+
ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe,
|
|
10206
|
+
n_embd_head_qk_rope, 1, n_tokens,
|
|
10207
|
+
ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
|
|
10208
|
+
ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
|
|
10209
|
+
ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
|
|
10210
|
+
cb(k_pe, "k_pe", il);
|
|
9464
10211
|
|
|
9465
|
-
|
|
9466
|
-
|
|
9467
|
-
|
|
9468
|
-
|
|
9469
|
-
|
|
10212
|
+
q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr,
|
|
10213
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10214
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10215
|
+
);
|
|
10216
|
+
cb(q_pe, "q_pe", il);
|
|
10217
|
+
|
|
10218
|
+
k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr,
|
|
10219
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10220
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10221
|
+
);
|
|
9470
10222
|
cb(k_pe, "k_pe", il);
|
|
9471
10223
|
|
|
9472
|
-
|
|
9473
|
-
|
|
9474
|
-
kv_compressed = build_norm(kv_compressed,
|
|
9475
|
-
model.layers[il].attn_kv_a_norm, NULL,
|
|
10224
|
+
kv_cmpr = build_norm(kv_cmpr,
|
|
10225
|
+
model.layers[il].attn_kv_a_norm, nullptr,
|
|
9476
10226
|
LLM_NORM_RMS, il);
|
|
9477
|
-
cb(
|
|
10227
|
+
cb(kv_cmpr, "kv_cmpr", il);
|
|
9478
10228
|
|
|
9479
|
-
|
|
9480
|
-
|
|
9481
|
-
|
|
10229
|
+
if (is_mla) {
|
|
10230
|
+
// {n_embd_head_qk_nope, n_tokens, n_head}
|
|
10231
|
+
q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
|
|
10232
|
+
cb(q_nope, "q_nope_perm", il);
|
|
9482
10233
|
|
|
9483
|
-
|
|
9484
|
-
|
|
9485
|
-
|
|
9486
|
-
ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
|
|
9487
|
-
0);
|
|
9488
|
-
cb(k_nope, "k_nope", il);
|
|
10234
|
+
// {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
|
|
10235
|
+
ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
|
|
10236
|
+
cb(q_nope_absorbed, "q_nope_absorbed", il);
|
|
9489
10237
|
|
|
9490
|
-
|
|
9491
|
-
|
|
9492
|
-
|
|
9493
|
-
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
|
|
9494
|
-
ggml_row_size(kv->type, (n_embd_head_qk_nope)));
|
|
9495
|
-
cb(v_states, "v_states", il);
|
|
10238
|
+
// {kv_lora_rank, n_head, n_tokens}
|
|
10239
|
+
q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
|
|
10240
|
+
cb(q_nope_absorbed, "q_nope_absorbed_perm", il);
|
|
9496
10241
|
|
|
9497
|
-
|
|
9498
|
-
|
|
10242
|
+
// {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
|
|
10243
|
+
// note: rope must go first for in-place context shifting in build_rope_shift()
|
|
10244
|
+
ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
|
|
10245
|
+
cb(Qcur, "Qcur", il);
|
|
9499
10246
|
|
|
9500
|
-
|
|
9501
|
-
|
|
9502
|
-
0);
|
|
9503
|
-
cb(v_states, "v_states", il);
|
|
10247
|
+
kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
|
|
10248
|
+
cb(kv_cmpr, "kv_cmpr_reshape", il);
|
|
9504
10249
|
|
|
9505
|
-
|
|
9506
|
-
|
|
9507
|
-
|
|
9508
|
-
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9509
|
-
ext_factor, attn_factor_scaled, beta_fast, beta_slow
|
|
9510
|
-
);
|
|
9511
|
-
cb(q_pe, "q_pe", il);
|
|
10250
|
+
// {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
|
|
10251
|
+
ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0);
|
|
10252
|
+
cb(Kcur, "Kcur", il);
|
|
9512
10253
|
|
|
9513
|
-
|
|
9514
|
-
|
|
9515
|
-
|
|
9516
|
-
ctx0, k_pe, inp_pos, nullptr,
|
|
9517
|
-
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9518
|
-
ext_factor, attn_factor_scaled, beta_fast, beta_slow
|
|
9519
|
-
);
|
|
9520
|
-
cb(k_pe, "k_pe", il);
|
|
10254
|
+
// {kv_lora_rank, 1, n_tokens}
|
|
10255
|
+
ggml_tensor * Vcur = kv_cmpr;
|
|
10256
|
+
cb(Vcur, "Vcur", il);
|
|
9521
10257
|
|
|
9522
|
-
|
|
9523
|
-
|
|
10258
|
+
// note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
|
|
10259
|
+
cur = build_attn(inp_attn, gf,
|
|
10260
|
+
model.layers[il].wo, NULL,
|
|
10261
|
+
Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il);
|
|
10262
|
+
} else {
|
|
10263
|
+
ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
|
|
10264
|
+
cb(kv, "kv", il);
|
|
10265
|
+
|
|
10266
|
+
// split into {n_embd_head_qk_nope, n_head, n_tokens}
|
|
10267
|
+
ggml_tensor * k_nope = ggml_view_3d(ctx0, kv,
|
|
10268
|
+
n_embd_head_qk_nope, n_head, n_tokens,
|
|
10269
|
+
ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
|
|
10270
|
+
ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
|
|
10271
|
+
0);
|
|
10272
|
+
cb(k_nope, "k_nope_view", il);
|
|
9524
10273
|
|
|
9525
|
-
|
|
9526
|
-
|
|
10274
|
+
// and {n_embd_head_v, n_head, n_tokens}
|
|
10275
|
+
ggml_tensor * Vcur = ggml_view_3d(ctx0, kv,
|
|
10276
|
+
n_embd_head_v, n_head, n_tokens,
|
|
10277
|
+
ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
|
|
10278
|
+
ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
|
|
10279
|
+
ggml_row_size(kv->type, n_embd_head_qk_nope));
|
|
10280
|
+
cb(Vcur, "Vcur_view", il);
|
|
9527
10281
|
|
|
9528
|
-
|
|
9529
|
-
|
|
9530
|
-
|
|
10282
|
+
Vcur = ggml_cont(ctx0, Vcur);
|
|
10283
|
+
cb(Vcur, "Vcur_cont", il);
|
|
10284
|
+
|
|
10285
|
+
// note: rope must go first for in-place context shifting in build_rope_shift()
|
|
10286
|
+
ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0);
|
|
10287
|
+
cb(Qcur, "Qcur", il);
|
|
10288
|
+
|
|
10289
|
+
ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
|
|
10290
|
+
cb(Kcur, "Kcur", il);
|
|
10291
|
+
|
|
10292
|
+
// note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
|
|
10293
|
+
cur = build_attn(inp_attn, gf,
|
|
10294
|
+
model.layers[il].wo, NULL,
|
|
10295
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
10296
|
+
}
|
|
9531
10297
|
}
|
|
9532
10298
|
|
|
9533
10299
|
if (il == n_layer - 1) {
|
|
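In the MLA branch above, the no-positional part of Q is multiplied by wk_b ("absorbed") so attention runs directly in the kv_lora_rank latent space, and K/V collapse to a single shared head (MQA); the wv_b decompression is deferred and handed to build_attn through the new tensor argument. A shape sketch with hypothetical numbers, for orientation only:

    // illustrative values, not read from any real model
    const int64_t kv_lora_rank        = 512;  // hparams.n_lora_kv
    const int64_t n_embd_head_qk_rope = 64;   // hparams.n_rot
    // per-token K row cached by the MQA-style path: RoPE part + compressed latent
    const int64_t k_row = n_embd_head_qk_rope + kv_lora_rank;   // 576
    // V is just the compressed latent; wv_b expands it inside build_attn
    const int64_t v_row = kv_lora_rank;                         // 512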
@@ -9693,7 +10459,7 @@ struct llm_build_bitnet : public llm_graph_context {

                 cur = build_attn(inp_attn, gf,
                         NULL, NULL,
-                        Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);

                 cur = build_norm(cur,
                         model.layers[il].attn_sub_norm, NULL,
@@ -9816,7 +10582,7 @@ struct llm_build_t5_enc : public llm_graph_context {

                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo_enc, nullptr,
-                        Qcur, Kcur, Vcur, kq_b, 1.0f, il);
+                        Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
                 cb(cur, "kqv_out", il);
             }

@@ -9922,7 +10688,7 @@ struct llm_build_t5_dec : public llm_graph_context {

                 cur = build_attn(inp_attn_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, kq_b, 1.0f, il);
+                        Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
                 cb(cur, "kqv_out", il);
             }

@@ -9954,7 +10720,7 @@ struct llm_build_t5_dec : public llm_graph_context {

                 cur = build_attn(inp_attn_cross, gf,
                         model.layers[il].wo_cross, nullptr,
-                        Qcur, Kcur, Vcur, nullptr, 1.0f, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
                 cb(cur, "kqv_out", il);

                 //ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
@@ -10087,7 +10853,7 @@ struct llm_build_jais : public llm_graph_context {

                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, 1.0f/float(n_embd_head), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
             }

             if (il == n_layer - 1) {
@@ -10219,7 +10985,7 @@ struct llm_build_chatglm : public llm_graph_context {

                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1) {
@@ -10272,6 +11038,157 @@ struct llm_build_chatglm : public llm_graph_context {
     }
 };

+struct llm_build_glm4 : public llm_graph_context {
+    llm_build_glm4(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // Pre-attention norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm,
+                    NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                ggml_tensor * Qcur = nullptr;
+                ggml_tensor * Kcur = nullptr;
+                ggml_tensor * Vcur = nullptr;
+
+                if (model.layers[il].wqkv == nullptr) {
+                    Qcur = build_lora_mm(model.layers[il].wq, cur);
+                    if (model.layers[il].bq) {
+                        Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    }
+                    Kcur = build_lora_mm(model.layers[il].wk, cur);
+                    if (model.layers[il].bk) {
+                        Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    }
+                    Vcur = build_lora_mm(model.layers[il].wv, cur);
+                    if (model.layers[il].bv) {
+                        Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    }
+                } else {
+                    cur = build_lora_mm(model.layers[il].wqkv, cur);
+                    cb(cur, "wqkv", il);
+                    if (model.layers[il].bqkv) {
+                        cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                        cb(cur, "bqkv", il);
+                    }
+                    Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                    Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                    Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, NULL,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // Post-attention norm (new!)
+            cur = build_norm(cur,
+                    model.layers[il].attn_post_norm,
+                    NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "post_attn_norm", il);
+
+            // Add the input (residual connection after post-attention norm)
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // FF
+            {
+                // Pre-MLP norm
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm,
+                        NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                // MLP
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        NULL,                      NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
+                cb(cur, "ffn_out", il);
+
+                // Post-MLP norm
+                cur = build_norm(cur,
+                        model.layers[il].ffn_post_norm,
+                        NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "post_mlp_norm", il);
+            }
+
+            // Add residual connection after post-MLP norm
+            inpL = ggml_add(ctx0, cur, ffn_inp);
+            cb(inpL, "l_out", il);
+        }
+
+        // Final norm
+        cur = build_norm(inpL,
+                model.output_norm,
+                NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // Output projection
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 struct llm_build_nemotron : public llm_graph_context {
     llm_build_nemotron(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
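The new GLM4 builder uses a sandwich layout: each sub-block is normalized both before and after, and only then added back to the residual. Schematically (a comment sketch of the code above, not additional code):

    // x -> attn_norm -> attention  -> attn_post_norm -> + x  = h
    // h -> ffn_norm  -> SWIGLU MLP -> ffn_post_norm  -> + h  = layer output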
@@ -10345,7 +11262,7 @@ struct llm_build_nemotron : public llm_graph_context {

                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1) {
@@ -10430,7 +11347,7 @@ struct llm_build_exaone : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors =
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);

                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -10476,7 +11393,7 @@ struct llm_build_exaone : public llm_graph_context {

                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1) {
@@ -10575,7 +11492,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
             ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
             int il) const {
-        const
+        const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);

         const auto n_tokens = ubatch.n_tokens;
         const auto n_seqs   = ubatch.n_seqs;
@@ -10971,7 +11888,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
             ggml_tensor *& first_layer_value,
             const llama_ubatch & ubatch,
             int il) const {
-        const
+        const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);

         const auto n_tokens = ubatch.n_tokens;
         const auto n_seqs   = ubatch.n_seqs;
@@ -11280,14 +12197,15 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
     }
 };

+
+struct llm_build_granite : public llm_graph_context {
+    llm_build_granite(
+        const llama_model & model,
+        const llm_graph_params & params,
+        ggml_cgraph * gf,
+        const bool use_rope = true)
+        : llm_graph_context(params) {
+
         const int64_t n_embd_head = hparams.n_embd_head_v;

         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -11298,27 +12216,214 @@ struct llm_build_chameleon : public llm_graph_context {

         inpL = build_inp_embd(model.tok_embd);

-        // inp_pos -
-        ggml_tensor * inp_pos =
+        // inp_pos - built only if rope enabled
+        ggml_tensor * inp_pos = nullptr;
+        if (use_rope) {
+            inp_pos = build_inp_pos();
+        }

         auto * inp_attn = build_attn_inp_kv_unified();

+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

             // norm
-                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, il);
-            cb(cur, "attn_norm", il);
-            }
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);

             // self-attention
             {
-                // compute Q and K and RoPE them
+                // compute Q and K and (optionally) RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                if (use_rope) {
+                    ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                    Qcur = ggml_rope_ext(
+                            ctx0, Qcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+                    Kcur = ggml_rope_ext(
+                            ctx0, Kcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+                }
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // For Granite architectures - scale residual
+            cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network (non-MoE)
+            if (model.layers[il].ffn_gate_inp == nullptr) {
+
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+
+            } else {
+                // MoE branch
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                ggml_tensor * moe_out = build_moe_ffn(cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        nullptr,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, true,
+                        false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                        il);
+                cb(moe_out, "ffn_moe_out", il);
+
+                // For Granite MoE Shared
+                if (hparams.n_ff_shexp > 0) {
+                    ggml_tensor * ffn_shexp = build_ffn(cur,
+                            model.layers[il].ffn_up_shexp,   NULL, NULL,
+                            model.layers[il].ffn_gate_shexp, NULL, NULL,
+                            model.layers[il].ffn_down_shexp, NULL, NULL,
+                            NULL,
+                            LLM_FFN_SILU, LLM_FFN_PAR, il);
+                    cb(ffn_shexp, "ffn_shexp", il);
+
+                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                    cb(cur, "ffn_out", il);
+                } else {
+                    cur = moe_out;
+                }
+            }
+
+            // For Granite architectures - scale residual
+            cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        // For Granite architectures - scale logits
+        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
+// ref: https://github.com/facebookresearch/chameleon
+// based on the original build_llama() function, changes:
+//   * qk-norm
+//   * swin-norm
+//   * removed bias
+//   * removed MoE
+struct llm_build_chameleon : public llm_graph_context {
+    llm_build_chameleon(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            if (hparams.swin_norm) {
+                cur = inpL;
+            } else {
+                cur = build_norm(inpL,
+                        model.layers[il].attn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "attn_norm", il);
+            }
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);

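llm_build_granite differs from the plain llama builder mainly in its scaling hyperparameters: attention uses f_attention_scale when it is non-zero, each residual branch is multiplied by f_residual_scale before the add, and the final logits are divided by f_logit_scale. Condensed from the code above (a sketch, not additional behavior):

    const float kq_scale = hparams.f_attention_scale == 0.0f
            ? 1.0f/sqrtf(float(n_embd_head))
            : hparams.f_attention_scale;
    cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);     // before each residual add
    cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale); // on the final logits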
@@ -11378,7 +12483,7 @@ struct llm_build_chameleon : public llm_graph_context {

                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, nullptr,
-                        Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);

                 if (hparams.swin_norm) {
                     cur = build_norm(cur,
@@ -11615,36 +12720,362 @@ struct llm_build_wavtokenizer_dec : public llm_graph_context {
     }
 };

+struct llm_build_plm : public llm_graph_context {
+    llm_build_plm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));
+
+        const uint32_t n_embd_head_qk_rope = hparams.n_rot;
+        const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+        const uint32_t kv_lora_rank = hparams.n_lora_kv;
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        // {n_embd, n_tokens}
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self_attention
+            {
+                ggml_tensor * q = NULL;
+                q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(q, "q", il);
+
+                // split into {n_head * n_embd_head_qk_nope, n_tokens}
+                ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
+                        ggml_row_size(q->type, hparams.n_embd_head_k),
+                        ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+                        0);
+                cb(q_nope, "q_nope", il);
+
+                // and {n_head * n_embd_head_qk_rope, n_tokens}
+                ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
+                        ggml_row_size(q->type, hparams.n_embd_head_k),
+                        ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+                        ggml_row_size(q->type, n_embd_head_qk_nope));
+                cb(q_pe, "q_pe", il);
+
+                // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
+                ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+                cb(kv_pe_compresseed, "kv_pe_compresseed", il);
+
+                // split into {kv_lora_rank, n_tokens}
+                ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
+                        kv_pe_compresseed->nb[1],
+                        0);
+                cb(kv_compressed, "kv_compressed", il);
+
+                // and {n_embd_head_qk_rope, n_tokens}
+                ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
+                        kv_pe_compresseed->nb[1],
+                        kv_pe_compresseed->nb[1],
+                        ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
+                cb(k_pe, "k_pe", il);
+
+                kv_compressed = build_norm(kv_compressed,
+                        model.layers[il].attn_kv_a_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(kv_compressed, "kv_compressed", il);
+
+                // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
+                ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
+                cb(kv, "kv", il);
+
+                // split into {n_head * n_embd_head_qk_nope, n_tokens}
+                ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
+                        ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
+                        ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+                        0);
+                cb(k_nope, "k_nope", il);
+
+                // and {n_head * n_embd_head_v, n_tokens}
+                ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
+                        ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+                        ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
+                        ggml_row_size(kv->type, (n_embd_head_qk_nope)));
+                cb(v_states, "v_states", il);
+
+                v_states = ggml_cont(ctx0, v_states);
+                cb(v_states, "v_states", il);
+
+                v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
+                        ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
+                        0);
+                cb(v_states, "v_states", il);
+
+                q_pe = ggml_rope_ext(
+                        ctx0, q_pe, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(q_pe, "q_pe", il);
+
+                // shared RoPE key
+                k_pe = ggml_rope_ext(
+                        ctx0, k_pe, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(k_pe, "k_pe", il);
+
+                ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
+                cb(q_states, "q_states", il);
+
+                ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
+                cb(k_states, "k_states", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, NULL,
+                        q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    NULL,                      NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
+struct llm_build_bailingmoe : public llm_graph_context {
+    llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            ggml_tensor * moe_out =
+                    build_moe_ffn(cur,
+                            model.layers[il].ffn_gate_inp,
+                            model.layers[il].ffn_up_exps,
+                            model.layers[il].ffn_gate_exps,
+                            model.layers[il].ffn_down_exps,
+                            nullptr,
+                            n_expert, n_expert_used,
+                            LLM_FFN_SILU, hparams.expert_weights_norm,
+                            false, hparams.expert_weights_scale,
+                            LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                            il);
+            cb(moe_out, "ffn_moe_out", il);
+
+            // FFN shared expert
+            {
+                ggml_tensor * ffn_shexp = build_ffn(cur,
+                        model.layers[il].ffn_up_shexp,   NULL, NULL,
+                        model.layers[il].ffn_gate_shexp, NULL, NULL,
+                        model.layers[il].ffn_down_shexp, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(ffn_shexp, "ffn_shexp", il);
+
+                cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
+llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
     llama_memory_i * res;

     switch (arch) {
+        case LLM_ARCH_BERT:
+        case LLM_ARCH_JINA_BERT_V2:
+        case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
+            {
+                res = nullptr;
+            } break;
         case LLM_ARCH_MAMBA:
         case LLM_ARCH_RWKV6:
         case LLM_ARCH_RWKV6QWEN2:
         case LLM_ARCH_RWKV7:
         case LLM_ARCH_ARWKV7:
             {
-                res = new
+                res = new llama_kv_cache_recurrent(
+                        *this,
+                        GGML_TYPE_F32,
+                        GGML_TYPE_F32,
+                        cparams.offload_kqv,
+                        std::max((uint32_t) 1, cparams.n_seq_max));
             } break;
         default:
             {
-                /*.get_rope_factors =*/ [this](uint32_t n_ctx_per_seq, int il) {
-                    // choose long/short freq factors based on the context size
-                    if (layers[il].rope_freqs != nullptr) {
-                        return layers[il].rope_freqs;
-                    }
+                const auto padding = llama_kv_cache_unified::get_padding(cparams);

-                    return layers[il].rope_long;
-                }
+                cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);

+                LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
+
+                res = new llama_kv_cache_unified(
+                        *this,
+                        params.type_k,
+                        params.type_v,
+                        !cparams.flash_attn,
+                        cparams.offload_kqv,
+                        cparams.n_ctx,
+                        padding);
             }
     }

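create_memory now picks the memory backend per architecture: encoder-only BERT-style models get no KV cache at all (res = nullptr), recurrent families (Mamba, RWKV6/7, ARWKV7) get llama_kv_cache_recurrent, and everything else gets llama_kv_cache_unified with the context size padded to the cache's required alignment. For example, with a padding of 32 a requested n_ctx of 1000 would be rounded up to GGML_PAD(1000, 32) == 1024 (numbers illustrative).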
@@ -11659,9 +13090,8 @@ llm_graph_result_ptr llama_model::build_graph(

     switch (arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_LLAMA4:
         case LLM_ARCH_MINICPM:
-        case LLM_ARCH_GRANITE:
-        case LLM_ARCH_GRANITE_MOE:
             {
                 llm = std::make_unique<llm_build_llama>(*this, params, gf);
             } break;
@@ -11692,6 +13122,7 @@ llm_graph_result_ptr llama_model::build_graph(
         case LLM_ARCH_BERT:
         case LLM_ARCH_JINA_BERT_V2:
         case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
             {
                 llm = std::make_unique<llm_build_bert>(*this, params, gf);
             } break;
@@ -11723,6 +13154,14 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_qwen2moe>(*this, params, gf);
             } break;
+        case LLM_ARCH_QWEN3:
+            {
+                llm = std::make_unique<llm_build_qwen3>(*this, params, gf);
+            } break;
+        case LLM_ARCH_QWEN3MOE:
+            {
+                llm = std::make_unique<llm_build_qwen3moe>(*this, params, gf);
+            } break;
         case LLM_ARCH_PHI2:
             {
                 llm = std::make_unique<llm_build_phi2>(*this, params, gf);
@@ -11828,6 +13267,10 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_chatglm>(*this, params, gf);
             } break;
+        case LLM_ARCH_GLM4:
+            {
+                llm = std::make_unique<llm_build_glm4>(*this, params, gf);
+            } break;
         case LLM_ARCH_BITNET:
             {
                 llm = std::make_unique<llm_build_bitnet>(*this, params, gf);
@@ -11846,10 +13289,11 @@ llm_graph_result_ptr llama_model::build_graph(
                         GGML_ABORT("invalid graph type");
                 };
             } break;
+        case LLM_ARCH_T5ENCODER:
+            {
+                llm = std::make_unique<llm_build_t5_enc>(*this, params, gf);
+            }
+            break;
         case LLM_ARCH_JAIS:
             {
                 llm = std::make_unique<llm_build_jais>(*this, params, gf);
@@ -11878,6 +13322,11 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_arwkv7>(*this, params, gf);
             } break;
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
+            {
+                llm = std::make_unique<llm_build_granite>(*this, params, gf);
+            } break;
         case LLM_ARCH_CHAMELEON:
             {
                 llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
@@ -11886,6 +13335,14 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
             } break;
+        case LLM_ARCH_PLM:
+            {
+                llm = std::make_unique<llm_build_plm>(*this, params, gf);
+            } break;
+        case LLM_ARCH_BAILINGMOE:
+            {
+                llm = std::make_unique<llm_build_bailingmoe>(*this, params, gf);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
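Each architecture added or relocated in this release (LLAMA4, NOMIC_BERT_MOE, QWEN3, QWEN3MOE, GLM4, T5ENCODER, GRANITE/GRANITE_MOE, PLM, BAILINGMOE) is wired into build_graph with the same two-line pattern, shown here once for GLM4 exactly as it appears above:

    case LLM_ARCH_GLM4:
        {
            llm = std::make_unique<llm_build_glm4>(*this, params, gf);
        } break;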
@@ -11903,6 +13360,7 @@ llm_graph_result_ptr llama_model::build_graph(
 llama_model_params llama_model_default_params() {
     llama_model_params result = {
         /*.devices                     =*/ nullptr,
+        /*.tensor_buft_overrides       =*/ nullptr,
         /*.n_gpu_layers                =*/ 0,
         /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu                    =*/ 0,
@@ -11998,11 +13456,10 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {

         // use what we call a normal RoPE, operating on pairs of consecutive head values
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_LLAMA4:
         case LLM_ARCH_DECI:
         case LLM_ARCH_BAICHUAN:
         case LLM_ARCH_STARCODER:
-        case LLM_ARCH_PLAMO:
-        case LLM_ARCH_ORION:
         case LLM_ARCH_INTERNLM2:
         case LLM_ARCH_MINICPM:
         case LLM_ARCH_XVERSE:
@@ -12012,10 +13469,13 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_ARCTIC:
         case LLM_ARCH_DEEPSEEK:
         case LLM_ARCH_DEEPSEEK2:
+        case LLM_ARCH_PLM:
         case LLM_ARCH_CHATGLM:
+        case LLM_ARCH_GLM4:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
         case LLM_ARCH_CHAMELEON:
+        case LLM_ARCH_BAILINGMOE:
             return LLAMA_ROPE_TYPE_NORM;

         // the pairs of head values are offset by n_rot/2
@@ -12024,16 +13484,20 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DBRX:
         case LLM_ARCH_BERT:
         case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
         case LLM_ARCH_STABLELM:
         case LLM_ARCH_BITNET:
         case LLM_ARCH_QWEN:
         case LLM_ARCH_QWEN2:
         case LLM_ARCH_QWEN2MOE:
+        case LLM_ARCH_QWEN3:
+        case LLM_ARCH_QWEN3MOE:
         case LLM_ARCH_OLMO2:
         case LLM_ARCH_OLMOE:
         case LLM_ARCH_PHI2:
         case LLM_ARCH_PHI3:
         case LLM_ARCH_PHIMOE:
+        case LLM_ARCH_PLAMO:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_GEMMA2:
         case LLM_ARCH_GEMMA3:
@@ -12041,6 +13505,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_OPENELM:
         case LLM_ARCH_GPTNEOX:
         case LLM_ARCH_CODESHELL:
+        case LLM_ARCH_ORION:
         case LLM_ARCH_NEMOTRON:
         case LLM_ARCH_EXAONE:
         case LLM_ARCH_MINICPM3:
@@ -12113,6 +13578,14 @@ const char * llama_model_chat_template(const llama_model * model, const char * n
         : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
     const auto & it = model->gguf_kv.find(key);
     if (it == model->gguf_kv.end()) {
+        // one-off fix for very popular models (so we are not flooded with issues)
+        // do not extend this list unless absolutely necessary
+        // Mistral-Small-2503 does not have built-in chat template
+        llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
+        if (pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
+            return "mistral-v7-tekken";
+        }
+
         return nullptr;
     }

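The chat-template lookup now has a single hard-coded fallback: if the GGUF has no chat-template key but the vocab pre-type is TEKKEN and the model has 40 layers (i.e. Mistral-Small-2503), the built-in "mistral-v7-tekken" template is returned instead of nullptr. A minimal caller-side sketch (assumes a loaded llama_model * model):

    const char * tmpl = llama_model_chat_template(model, /*name=*/nullptr);
    if (tmpl == nullptr) {
        // no built-in template and no known fallback; the caller must supply one
    }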
|