@novastera-oss/llamarn 0.2.5 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/RNLlamaCpp.podspec +3 -2
- package/android/CMakeLists.txt +6 -3
- package/android/src/main/cpp/include/llama.h +140 -38
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +48 -67
- package/cpp/LlamaCppModel.h +8 -3
- package/cpp/PureCppImpl.cpp +1 -1
- package/cpp/PureCppImpl.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +15 -4
- package/cpp/llama.cpp/Makefile +2 -2
- package/cpp/llama.cpp/README.md +33 -13
- package/cpp/llama.cpp/common/CMakeLists.txt +15 -28
- package/cpp/llama.cpp/common/arg.cpp +38 -12
- package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
- package/cpp/llama.cpp/common/chat-parser.cpp +9 -3
- package/cpp/llama.cpp/common/chat-parser.h +4 -1
- package/cpp/llama.cpp/common/chat.cpp +16 -13
- package/cpp/llama.cpp/common/chat.h +1 -1
- package/cpp/llama.cpp/common/common.cpp +52 -40
- package/cpp/llama.cpp/common/common.h +5 -2
- package/cpp/llama.cpp/common/json-partial.cpp +5 -4
- package/cpp/llama.cpp/common/json-partial.h +2 -1
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
- package/cpp/llama.cpp/common/json-schema-to-grammar.h +4 -4
- package/cpp/llama.cpp/common/speculative.cpp +6 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +128 -84
- package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -2
- package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
- package/cpp/llama.cpp/ggml/include/ggml.h +1 -3
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +49 -13
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +10 -5
- package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +93 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2174 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +7 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +33 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1555 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +2 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +6 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +25 -16
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +33 -8
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +135 -100
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +908 -3
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +19 -24
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +21 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +121 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +2 -96
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +164 -46
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +32 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +118 -11
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +26 -29
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -248
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +9 -8
- package/cpp/llama.cpp/ggml/src/ggml.cpp +26 -0
- package/cpp/llama.cpp/ggml/src/gguf.cpp +19 -2
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +57 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +4 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +14 -3
- package/cpp/llama.cpp/include/llama.h +140 -38
- package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
- package/cpp/llama.cpp/src/CMakeLists.txt +4 -1
- package/cpp/llama.cpp/src/llama-arch.cpp +95 -3
- package/cpp/llama.cpp/src/llama-arch.h +7 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +289 -31
- package/cpp/llama.cpp/src/llama-batch.h +47 -17
- package/cpp/llama.cpp/src/llama-chat.cpp +19 -2
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +488 -313
- package/cpp/llama.cpp/src/llama-context.h +38 -17
- package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
- package/cpp/llama.cpp/src/llama-cparams.h +1 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +275 -152
- package/cpp/llama.cpp/src/llama-graph.h +109 -52
- package/cpp/llama.cpp/src/llama-hparams.cpp +6 -2
- package/cpp/llama.cpp/src/llama-hparams.h +8 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +281 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +133 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +1835 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +308 -0
- package/cpp/llama.cpp/src/llama-kv-cells.h +53 -17
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +247 -0
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +143 -0
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +1116 -0
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +188 -0
- package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
- package/cpp/llama.cpp/src/llama-memory.h +89 -4
- package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
- package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/cpp/llama.cpp/src/llama-model.cpp +735 -143
- package/cpp/llama.cpp/src/llama-model.h +4 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +2 -1
- package/cpp/llama.cpp/src/llama-vocab.cpp +39 -25
- package/cpp/llama.cpp/src/llama.cpp +11 -7
- package/cpp/llama.cpp/src/unicode.cpp +5 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +10518 -0
- package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +93468 -0
- package/cpp/llama.cpp/{common → vendor}/minja/chat-template.hpp +1 -1
- package/cpp/llama.cpp/{common → vendor}/minja/minja.hpp +1 -1
- package/cpp/llama.cpp/{common → vendor/nlohmann}/json.hpp +3027 -2267
- package/cpp/llama.cpp/vendor/nlohmann/json_fwd.hpp +187 -0
- package/cpp/llama.cpp/vendor/stb/stb_image.h +7988 -0
- package/cpp/rn-completion.cpp +65 -10
- package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
- package/cpp/{rn-utils.hpp → rn-utils.h} +8 -1
- package/ios/include/chat.h +1 -1
- package/ios/include/common/minja/chat-template.hpp +1 -1
- package/ios/include/common/minja/minja.hpp +1 -1
- package/ios/include/common.h +5 -2
- package/ios/include/json-schema-to-grammar.h +4 -4
- package/ios/include/llama.h +140 -38
- package/ios/include/{common → nlohmann}/json.hpp +3027 -2267
- package/ios/libs/llama.xcframework/Info.plist +20 -20
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4617
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4638
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3557
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4638
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3744 -3559
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4616
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4637
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3556
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4900 -4653
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4871 -4674
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3773 -3587
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -2
- package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -2747
- package/cpp/llama.cpp/src/llama-kv-cache.h +0 -502
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
|
@@ -5,7 +5,11 @@
|
|
|
5
5
|
#include "llama-batch.h"
|
|
6
6
|
#include "llama-cparams.h"
|
|
7
7
|
#include "llama-model-loader.h"
|
|
8
|
-
|
|
8
|
+
|
|
9
|
+
#include "llama-kv-cache-unified.h"
|
|
10
|
+
#include "llama-kv-cache-unified-iswa.h"
|
|
11
|
+
#include "llama-memory-hybrid.h"
|
|
12
|
+
#include "llama-memory-recurrent.h"
|
|
9
13
|
|
|
10
14
|
#include "ggml-cpp.h"
|
|
11
15
|
|
|
@@ -77,6 +81,7 @@ const char * llm_type_name(llm_type type) {
|
|
|
77
81
|
case LLM_TYPE_40B: return "40B";
|
|
78
82
|
case LLM_TYPE_65B: return "65B";
|
|
79
83
|
case LLM_TYPE_70B: return "70B";
|
|
84
|
+
case LLM_TYPE_142B: return "142B";
|
|
80
85
|
case LLM_TYPE_236B: return "236B";
|
|
81
86
|
case LLM_TYPE_290B: return "290B";
|
|
82
87
|
case LLM_TYPE_314B: return "314B";
|
|
@@ -466,6 +471,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
466
471
|
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
|
|
467
472
|
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
|
|
468
473
|
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
|
|
474
|
+
std::fill(
|
|
475
|
+
hparams.recurrent_layer_arr.begin(),
|
|
476
|
+
hparams.recurrent_layer_arr.end(),
|
|
477
|
+
llm_arch_is_recurrent(ml.get_arch()));
|
|
469
478
|
|
|
470
479
|
std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
|
|
471
480
|
|
|
@@ -540,6 +549,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
540
549
|
uint32_t n_vocab = 0;
|
|
541
550
|
ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
|
|
542
551
|
|
|
552
|
+
// for classifier models
|
|
553
|
+
ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
|
|
554
|
+
if (!classifier_labels.empty()) {
|
|
555
|
+
hparams.n_cls_out = classifier_labels.size();
|
|
556
|
+
}
|
|
557
|
+
|
|
543
558
|
// arch-specific KVs
|
|
544
559
|
switch (arch) {
|
|
545
560
|
case LLM_ARCH_LLAMA:
|
|
@@ -589,6 +604,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
589
604
|
hparams.use_kq_norm = false;
|
|
590
605
|
}
|
|
591
606
|
} break;
|
|
607
|
+
case LLM_ARCH_ARCEE:
|
|
608
|
+
{
|
|
609
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
610
|
+
|
|
611
|
+
// Arcee uses the same structure as Llama
|
|
612
|
+
switch (hparams.n_layer) {
|
|
613
|
+
case 36: type = LLM_TYPE_4B; break;
|
|
614
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
615
|
+
}
|
|
616
|
+
} break;
|
|
592
617
|
case LLM_ARCH_DECI:
|
|
593
618
|
{
|
|
594
619
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
@@ -683,7 +708,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
683
708
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
684
709
|
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
|
685
710
|
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
|
686
|
-
ml.get_arr_n(LLM_KV_CLASSIFIER_OUTPUT_LABELS, hparams.n_cls_out, false);
|
|
687
711
|
|
|
688
712
|
switch (hparams.n_layer) {
|
|
689
713
|
case 3:
|
|
@@ -730,6 +754,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
730
754
|
}
|
|
731
755
|
}
|
|
732
756
|
} break;
|
|
757
|
+
case LLM_ARCH_NEO_BERT:
|
|
758
|
+
{
|
|
759
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
760
|
+
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
|
761
|
+
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
|
762
|
+
|
|
763
|
+
if (hparams.n_layer == 28) {
|
|
764
|
+
type = LLM_TYPE_250M;
|
|
765
|
+
}
|
|
766
|
+
} break;
|
|
733
767
|
case LLM_ARCH_BLOOM:
|
|
734
768
|
{
|
|
735
769
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
@@ -953,6 +987,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
953
987
|
case 46: type = LLM_TYPE_27B; break;
|
|
954
988
|
default: type = LLM_TYPE_UNKNOWN;
|
|
955
989
|
}
|
|
990
|
+
|
|
991
|
+
// ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
|
|
992
|
+
hparams.f_attention_scale = type == LLM_TYPE_27B
|
|
993
|
+
? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
|
|
994
|
+
: 1.0f / std::sqrt(float(hparams.n_embd_head_k));
|
|
956
995
|
} break;
|
|
957
996
|
case LLM_ARCH_GEMMA3:
|
|
958
997
|
{
|
|
@@ -973,6 +1012,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
973
1012
|
default: type = LLM_TYPE_UNKNOWN;
|
|
974
1013
|
}
|
|
975
1014
|
|
|
1015
|
+
// ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
|
|
976
1016
|
hparams.f_attention_scale = type == LLM_TYPE_27B
|
|
977
1017
|
? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
|
|
978
1018
|
: 1.0f / std::sqrt(float(hparams.n_embd_head_k));
|
|
@@ -1430,6 +1470,20 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1430
1470
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1431
1471
|
}
|
|
1432
1472
|
} break;
|
|
1473
|
+
case LLM_ARCH_DOTS1:
|
|
1474
|
+
{
|
|
1475
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1476
|
+
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
|
1477
|
+
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
1478
|
+
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
|
1479
|
+
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
|
1480
|
+
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
|
1481
|
+
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
|
|
1482
|
+
switch (hparams.n_layer) {
|
|
1483
|
+
case 62: type = LLM_TYPE_142B; break;
|
|
1484
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1485
|
+
}
|
|
1486
|
+
} break;
|
|
1433
1487
|
default: throw std::runtime_error("unsupported model architecture");
|
|
1434
1488
|
}
|
|
1435
1489
|
|
|
@@ -2173,6 +2227,32 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2173
2227
|
layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
|
|
2174
2228
|
}
|
|
2175
2229
|
} break;
|
|
2230
|
+
case LLM_ARCH_NEO_BERT:
|
|
2231
|
+
{
|
|
2232
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2233
|
+
|
|
2234
|
+
cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
|
|
2235
|
+
cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
2236
|
+
|
|
2237
|
+
cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
|
|
2238
|
+
cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
|
|
2239
|
+
|
|
2240
|
+
output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2241
|
+
|
|
2242
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2243
|
+
auto & layer = layers[i];
|
|
2244
|
+
|
|
2245
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2246
|
+
|
|
2247
|
+
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
|
2248
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
2249
|
+
|
|
2250
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
2251
|
+
|
|
2252
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff*2}, 0);
|
|
2253
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
2254
|
+
}
|
|
2255
|
+
} break;
|
|
2176
2256
|
case LLM_ARCH_JINA_BERT_V2:
|
|
2177
2257
|
{
|
|
2178
2258
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
|
|
@@ -2210,8 +2290,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2210
2290
|
layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
2211
2291
|
layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
2212
2292
|
|
|
2213
|
-
layer.
|
|
2214
|
-
layer.
|
|
2293
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
|
|
2294
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
|
|
2215
2295
|
|
|
2216
2296
|
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
2217
2297
|
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
|
|
@@ -4109,6 +4189,89 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
4109
4189
|
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
|
4110
4190
|
}
|
|
4111
4191
|
} break;
|
|
4192
|
+
case LLM_ARCH_DOTS1:
|
|
4193
|
+
{
|
|
4194
|
+
const int64_t n_ff_exp = hparams.n_ff_exp;
|
|
4195
|
+
const int64_t n_expert_shared = hparams.n_expert_shared;
|
|
4196
|
+
|
|
4197
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
4198
|
+
|
|
4199
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
4200
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
4201
|
+
|
|
4202
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
4203
|
+
auto & layer = layers[i];
|
|
4204
|
+
|
|
4205
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
4206
|
+
|
|
4207
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
4208
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
4209
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
4210
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
|
4211
|
+
|
|
4212
|
+
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
4213
|
+
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
4214
|
+
|
|
4215
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
4216
|
+
|
|
4217
|
+
if (i < (int) hparams.n_layer_dense_lead) {
|
|
4218
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
4219
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
4220
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
4221
|
+
} else {
|
|
4222
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
|
4223
|
+
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
|
|
4224
|
+
|
|
4225
|
+
if (n_expert == 0) {
|
|
4226
|
+
throw std::runtime_error("n_expert must be > 0");
|
|
4227
|
+
}
|
|
4228
|
+
if (n_expert_used == 0) {
|
|
4229
|
+
throw std::runtime_error("n_expert_used must be > 0");
|
|
4230
|
+
}
|
|
4231
|
+
|
|
4232
|
+
// MoE branch
|
|
4233
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
|
4234
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
|
4235
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
|
4236
|
+
|
|
4237
|
+
// Shared expert branch
|
|
4238
|
+
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
|
4239
|
+
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
|
|
4240
|
+
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
|
4241
|
+
}
|
|
4242
|
+
}
|
|
4243
|
+
} break;
|
|
4244
|
+
case LLM_ARCH_ARCEE:
|
|
4245
|
+
{
|
|
4246
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
4247
|
+
|
|
4248
|
+
// output
|
|
4249
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
4250
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
4251
|
+
|
|
4252
|
+
// if output is NULL, init from the input tok embed
|
|
4253
|
+
if (output == NULL) {
|
|
4254
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
4255
|
+
}
|
|
4256
|
+
|
|
4257
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
4258
|
+
auto & layer = layers[i];
|
|
4259
|
+
|
|
4260
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
4261
|
+
|
|
4262
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
4263
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
4264
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
|
4265
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
|
4266
|
+
|
|
4267
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
4268
|
+
|
|
4269
|
+
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
|
4270
|
+
|
|
4271
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
4272
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
4273
|
+
}
|
|
4274
|
+
} break;
|
|
4112
4275
|
default:
|
|
4113
4276
|
throw std::runtime_error("unknown architecture");
|
|
4114
4277
|
}
|
|
@@ -4353,6 +4516,15 @@ void llama_model::print_info() const {
|
|
|
4353
4516
|
LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
|
|
4354
4517
|
LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
|
|
4355
4518
|
LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
|
|
4519
|
+
|
|
4520
|
+
if (!classifier_labels.empty()) {
|
|
4521
|
+
LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
|
|
4522
|
+
|
|
4523
|
+
size_t i = 0;
|
|
4524
|
+
for (auto label : classifier_labels) {
|
|
4525
|
+
LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
|
|
4526
|
+
}
|
|
4527
|
+
}
|
|
4356
4528
|
}
|
|
4357
4529
|
|
|
4358
4530
|
LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
|
|
@@ -6020,7 +6192,7 @@ struct llm_build_bert : public llm_graph_context {
|
|
|
6020
6192
|
model.layers[il].ffn_gate, NULL, NULL,
|
|
6021
6193
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
|
6022
6194
|
NULL,
|
|
6023
|
-
LLM_FFN_GELU, LLM_FFN_PAR, il);
|
|
6195
|
+
model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
|
|
6024
6196
|
cb(cur, "ffn_out", il);
|
|
6025
6197
|
} else {
|
|
6026
6198
|
cur = build_ffn(cur,
|
|
@@ -6051,6 +6223,117 @@ struct llm_build_bert : public llm_graph_context {
|
|
|
6051
6223
|
}
|
|
6052
6224
|
};
|
|
6053
6225
|
|
|
6226
|
+
struct llm_build_neo_bert : public llm_graph_context {
|
|
6227
|
+
llm_build_neo_bert(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
6228
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
6229
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
6230
|
+
|
|
6231
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
6232
|
+
|
|
6233
|
+
ggml_tensor * cur;
|
|
6234
|
+
ggml_tensor * inpL;
|
|
6235
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
6236
|
+
|
|
6237
|
+
// construct input embeddings (token, type, position)
|
|
6238
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
6239
|
+
cb(inpL, "inp_embd", -1);
|
|
6240
|
+
|
|
6241
|
+
auto * inp_attn = build_attn_inp_no_cache();
|
|
6242
|
+
|
|
6243
|
+
// iterate layers
|
|
6244
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
6245
|
+
ggml_tensor * cur = inpL;
|
|
6246
|
+
|
|
6247
|
+
ggml_tensor * Qcur;
|
|
6248
|
+
ggml_tensor * Kcur;
|
|
6249
|
+
ggml_tensor * Vcur;
|
|
6250
|
+
|
|
6251
|
+
// pre-norm
|
|
6252
|
+
cur = build_norm(inpL,
|
|
6253
|
+
model.layers[il].attn_norm, NULL,
|
|
6254
|
+
LLM_NORM_RMS, il);
|
|
6255
|
+
|
|
6256
|
+
// self-attention
|
|
6257
|
+
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
|
6258
|
+
cb(cur, "wqkv", il);
|
|
6259
|
+
|
|
6260
|
+
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
|
6261
|
+
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
|
6262
|
+
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
6263
|
+
|
|
6264
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
6265
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
6266
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
6267
|
+
|
|
6268
|
+
// RoPE
|
|
6269
|
+
Qcur = ggml_rope_ext(
|
|
6270
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
6271
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
6272
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6273
|
+
);
|
|
6274
|
+
|
|
6275
|
+
Kcur = ggml_rope_ext(
|
|
6276
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
6277
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
6278
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6279
|
+
);
|
|
6280
|
+
|
|
6281
|
+
cb(Qcur, "Qcur", il);
|
|
6282
|
+
cb(Kcur, "Kcur", il);
|
|
6283
|
+
cb(Vcur, "Vcur", il);
|
|
6284
|
+
|
|
6285
|
+
cur = build_attn(inp_attn, gf,
|
|
6286
|
+
model.layers[il].wo, nullptr,
|
|
6287
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6288
|
+
cb(cur, "kqv_out", il);
|
|
6289
|
+
|
|
6290
|
+
if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
|
6291
|
+
// skip computing output for unused tokens
|
|
6292
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6293
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
6294
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
6295
|
+
}
|
|
6296
|
+
|
|
6297
|
+
// re-add the layer input
|
|
6298
|
+
cur = ggml_add(ctx0, cur, inpL);
|
|
6299
|
+
|
|
6300
|
+
ggml_tensor * ffn_inp = cur;
|
|
6301
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
6302
|
+
|
|
6303
|
+
// pre-norm
|
|
6304
|
+
cur = build_norm(ffn_inp,
|
|
6305
|
+
model.layers[il].ffn_norm, NULL,
|
|
6306
|
+
LLM_NORM_RMS, il);
|
|
6307
|
+
cb(cur, "ffn_norm", il);
|
|
6308
|
+
|
|
6309
|
+
// feed-forward network
|
|
6310
|
+
cur = build_ffn(cur,
|
|
6311
|
+
model.layers[il].ffn_up,
|
|
6312
|
+
NULL, NULL, NULL, NULL, NULL,
|
|
6313
|
+
model.layers[il].ffn_down,
|
|
6314
|
+
NULL, NULL, NULL,
|
|
6315
|
+
LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
|
|
6316
|
+
|
|
6317
|
+
// attentions bypass the intermediate layer
|
|
6318
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
6319
|
+
|
|
6320
|
+
// input for next layer
|
|
6321
|
+
inpL = cur;
|
|
6322
|
+
}
|
|
6323
|
+
|
|
6324
|
+
cur = inpL;
|
|
6325
|
+
|
|
6326
|
+
cur = build_norm(cur,
|
|
6327
|
+
model.output_norm_enc, NULL,
|
|
6328
|
+
LLM_NORM_RMS, -1);
|
|
6329
|
+
|
|
6330
|
+
cb(cur, "result_embd", -1);
|
|
6331
|
+
res->t_embd = cur;
|
|
6332
|
+
|
|
6333
|
+
ggml_build_forward_expand(gf, cur);
|
|
6334
|
+
}
|
|
6335
|
+
};
|
|
6336
|
+
|
|
6054
6337
|
struct llm_build_bloom : public llm_graph_context {
|
|
6055
6338
|
llm_build_bloom(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
6056
6339
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
@@ -8481,14 +8764,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
|
|
|
8481
8764
|
cb(Kcur, "Kcur", il);
|
|
8482
8765
|
cb(Vcur, "Vcur", il);
|
|
8483
8766
|
|
|
8484
|
-
|
|
8485
|
-
switch (model.type) {
|
|
8486
|
-
case LLM_TYPE_2B:
|
|
8487
|
-
case LLM_TYPE_9B:
|
|
8488
|
-
case LLM_TYPE_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); break;
|
|
8489
|
-
default: GGML_ABORT("fatal error");
|
|
8490
|
-
};
|
|
8491
|
-
cb(Qcur, "Qcur_scaled", il);
|
|
8767
|
+
Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
|
|
8492
8768
|
|
|
8493
8769
|
cur = build_attn(inp_attn, gf,
|
|
8494
8770
|
model.layers[il].wo, NULL,
|
|
@@ -8629,9 +8905,12 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
|
|
|
8629
8905
|
cb(Kcur, "Kcur", il);
|
|
8630
8906
|
cb(Vcur, "Vcur", il);
|
|
8631
8907
|
|
|
8908
|
+
// ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
|
|
8909
|
+
Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
|
|
8910
|
+
|
|
8632
8911
|
cur = build_attn(inp_attn, gf,
|
|
8633
8912
|
model.layers[il].wo, NULL,
|
|
8634
|
-
Qcur, Kcur, Vcur, nullptr, nullptr,
|
|
8913
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
8635
8914
|
}
|
|
8636
8915
|
|
|
8637
8916
|
cur = build_norm(cur,
|
|
@@ -8837,8 +9116,7 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
8837
9116
|
// {n_embd, n_tokens}
|
|
8838
9117
|
inpL = build_inp_embd(model.tok_embd);
|
|
8839
9118
|
|
|
8840
|
-
|
|
8841
|
-
ggml_tensor * state_mask = build_inp_s_mask();
|
|
9119
|
+
auto * rs_inp = build_rs_inp();
|
|
8842
9120
|
|
|
8843
9121
|
for (int il = 0; il < n_layer; ++il) {
|
|
8844
9122
|
// norm
|
|
@@ -8847,8 +9125,7 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
8847
9125
|
LLM_NORM_RMS, il);
|
|
8848
9126
|
cb(cur, "attn_norm", il);
|
|
8849
9127
|
|
|
8850
|
-
|
|
8851
|
-
cur = build_mamba_layer(gf, cur, state_copy, state_mask, ubatch, il);
|
|
9128
|
+
cur = build_mamba_layer(rs_inp, gf, cur, ubatch, il);
|
|
8852
9129
|
|
|
8853
9130
|
if (il == n_layer - 1) {
|
|
8854
9131
|
// skip computing output for unused tokens
|
|
@@ -8886,15 +9163,14 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
8886
9163
|
|
|
8887
9164
|
// TODO: split
|
|
8888
9165
|
ggml_tensor * build_mamba_layer(
|
|
8889
|
-
|
|
8890
|
-
|
|
8891
|
-
|
|
8892
|
-
|
|
8893
|
-
|
|
8894
|
-
|
|
8895
|
-
const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
|
|
9166
|
+
llm_graph_input_rs * inp,
|
|
9167
|
+
ggml_cgraph * gf,
|
|
9168
|
+
ggml_tensor * cur,
|
|
9169
|
+
const llama_ubatch & ubatch,
|
|
9170
|
+
int il) const {
|
|
9171
|
+
const auto * kv_state = static_cast<const llama_memory_recurrent_state *>(mstate);
|
|
8896
9172
|
|
|
8897
|
-
const auto kv_head =
|
|
9173
|
+
const auto kv_head = kv_state->get_head();
|
|
8898
9174
|
|
|
8899
9175
|
const int64_t d_conv = hparams.ssm_d_conv;
|
|
8900
9176
|
const int64_t d_inner = hparams.ssm_d_inner;
|
|
@@ -8912,17 +9188,17 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
8912
9188
|
GGML_ASSERT(ubatch.equal_seqs);
|
|
8913
9189
|
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
|
|
8914
9190
|
|
|
8915
|
-
ggml_tensor * conv_states_all =
|
|
8916
|
-
ggml_tensor * ssm_states_all =
|
|
9191
|
+
ggml_tensor * conv_states_all = kv_state->get_r_l(il);
|
|
9192
|
+
ggml_tensor * ssm_states_all = kv_state->get_s_l(il);
|
|
8917
9193
|
|
|
8918
9194
|
// (ab)using the KV cache to store the states
|
|
8919
|
-
ggml_tensor * conv =
|
|
8920
|
-
gf, conv_states_all,
|
|
8921
|
-
hparams.
|
|
9195
|
+
ggml_tensor * conv = build_rs(
|
|
9196
|
+
inp, gf, conv_states_all,
|
|
9197
|
+
hparams.n_embd_r(), n_seqs);
|
|
8922
9198
|
conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
|
|
8923
|
-
ggml_tensor * ssm =
|
|
8924
|
-
gf, ssm_states_all,
|
|
8925
|
-
hparams.
|
|
9199
|
+
ggml_tensor * ssm = build_rs(
|
|
9200
|
+
inp, gf, ssm_states_all,
|
|
9201
|
+
hparams.n_embd_s(), n_seqs);
|
|
8926
9202
|
ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs);
|
|
8927
9203
|
|
|
8928
9204
|
// {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
|
|
@@ -11633,14 +11909,13 @@ struct llm_build_rwkv6_base : public llm_graph_context {
|
|
|
11633
11909
|
}
|
|
11634
11910
|
|
|
11635
11911
|
ggml_tensor * build_rwkv6_time_mix(
|
|
11912
|
+
llm_graph_input_rs * inp,
|
|
11636
11913
|
ggml_cgraph * gf,
|
|
11637
11914
|
ggml_tensor * cur,
|
|
11638
11915
|
ggml_tensor * x_prev,
|
|
11639
|
-
ggml_tensor * state_copy,
|
|
11640
|
-
ggml_tensor * state_mask,
|
|
11641
11916
|
const llama_ubatch & ubatch,
|
|
11642
11917
|
int il) const {
|
|
11643
|
-
const
|
|
11918
|
+
const auto * kv_state = static_cast<const llama_memory_recurrent_state *>(mstate);
|
|
11644
11919
|
|
|
11645
11920
|
const auto n_tokens = ubatch.n_tokens;
|
|
11646
11921
|
const auto n_seqs = ubatch.n_seqs;
|
|
@@ -11650,7 +11925,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
|
|
|
11650
11925
|
const auto n_head = n_embd / head_size;
|
|
11651
11926
|
const auto n_head_kv = hparams.n_head_kv(il);
|
|
11652
11927
|
|
|
11653
|
-
const auto kv_head =
|
|
11928
|
+
const auto kv_head = kv_state->get_head();
|
|
11654
11929
|
|
|
11655
11930
|
const auto & layer = model.layers[il];
|
|
11656
11931
|
|
|
@@ -11761,9 +12036,9 @@ struct llm_build_rwkv6_base : public llm_graph_context {
|
|
|
11761
12036
|
k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w));
|
|
11762
12037
|
}
|
|
11763
12038
|
|
|
11764
|
-
ggml_tensor * wkv_state =
|
|
11765
|
-
gf,
|
|
11766
|
-
hparams.
|
|
12039
|
+
ggml_tensor * wkv_state = build_rs(
|
|
12040
|
+
inp, gf, kv_state->get_s_l(il),
|
|
12041
|
+
hparams.n_embd_s(), n_seqs);
|
|
11767
12042
|
|
|
11768
12043
|
ggml_tensor * wkv_output;
|
|
11769
12044
|
if (is_qrwkv) {
|
|
@@ -11781,9 +12056,9 @@ struct llm_build_rwkv6_base : public llm_graph_context {
|
|
|
11781
12056
|
wkv_state,
|
|
11782
12057
|
ggml_view_1d(
|
|
11783
12058
|
ctx0,
|
|
11784
|
-
|
|
11785
|
-
hparams.
|
|
11786
|
-
hparams.
|
|
12059
|
+
kv_state->get_s_l(il),
|
|
12060
|
+
hparams.n_embd_s() * n_seqs,
|
|
12061
|
+
hparams.n_embd_s() * kv_head * ggml_element_size(kv_state->get_s_l(il))
|
|
11787
12062
|
)
|
|
11788
12063
|
)
|
|
11789
12064
|
);
|
|
@@ -11817,8 +12092,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
|
|
|
11817
12092
|
inpL = build_inp_embd(model.tok_embd);
|
|
11818
12093
|
inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
|
|
11819
12094
|
|
|
11820
|
-
|
|
11821
|
-
ggml_tensor * state_mask = build_inp_s_mask();
|
|
12095
|
+
auto * rs_inp = build_rs_inp();
|
|
11822
12096
|
|
|
11823
12097
|
const auto n_embd = hparams.n_embd;
|
|
11824
12098
|
const auto n_seq_tokens = ubatch.n_seq_tokens;
|
|
@@ -11828,9 +12102,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
|
|
|
11828
12102
|
const llama_layer * layer = &model.layers[il];
|
|
11829
12103
|
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
|
|
11830
12104
|
|
|
11831
|
-
ggml_tensor * token_shift = build_rwkv_token_shift_load(
|
|
11832
|
-
gf, state_copy, state_mask, ubatch, il
|
|
11833
|
-
);
|
|
12105
|
+
ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
|
|
11834
12106
|
|
|
11835
12107
|
ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
|
|
11836
12108
|
ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
|
|
@@ -11845,7 +12117,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
|
|
|
11845
12117
|
1
|
|
11846
12118
|
);
|
|
11847
12119
|
|
|
11848
|
-
cur = build_rwkv6_time_mix(gf, att_norm, x_prev,
|
|
12120
|
+
cur = build_rwkv6_time_mix(rs_inp, gf, att_norm, x_prev, ubatch, il);
|
|
11849
12121
|
|
|
11850
12122
|
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
|
11851
12123
|
cb(ffn_inp, "ffn_inp", il);
|
|
@@ -11908,15 +12180,14 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
|
|
|
11908
12180
|
// ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py
|
|
11909
12181
|
struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
|
|
11910
12182
|
llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv6_base(model, params) {
|
|
11911
|
-
GGML_ASSERT(n_embd == hparams.
|
|
12183
|
+
GGML_ASSERT(n_embd == hparams.n_embd_r());
|
|
11912
12184
|
|
|
11913
12185
|
ggml_tensor * cur;
|
|
11914
12186
|
ggml_tensor * inpL;
|
|
11915
12187
|
|
|
11916
12188
|
inpL = build_inp_embd(model.tok_embd);
|
|
11917
12189
|
|
|
11918
|
-
|
|
11919
|
-
ggml_tensor * state_mask = build_inp_s_mask();
|
|
12190
|
+
auto * rs_inp = build_rs_inp();
|
|
11920
12191
|
|
|
11921
12192
|
const auto n_embd = hparams.n_embd;
|
|
11922
12193
|
const auto n_seq_tokens = ubatch.n_seq_tokens;
|
|
@@ -11926,9 +12197,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
|
|
|
11926
12197
|
const llama_layer * layer = &model.layers[il];
|
|
11927
12198
|
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
|
|
11928
12199
|
|
|
11929
|
-
ggml_tensor * token_shift = build_rwkv_token_shift_load(
|
|
11930
|
-
gf, state_copy, state_mask, ubatch, il
|
|
11931
|
-
);
|
|
12200
|
+
ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
|
|
11932
12201
|
|
|
11933
12202
|
ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
|
|
11934
12203
|
cb(att_norm, "attn_norm", il);
|
|
@@ -11940,7 +12209,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
|
|
|
11940
12209
|
1
|
|
11941
12210
|
);
|
|
11942
12211
|
|
|
11943
|
-
cur = build_rwkv6_time_mix(gf, att_norm, x_prev,
|
|
12212
|
+
cur = build_rwkv6_time_mix(rs_inp, gf, att_norm, x_prev, ubatch, il);
|
|
11944
12213
|
|
|
11945
12214
|
token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
|
|
11946
12215
|
ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
|
|
@@ -12028,15 +12297,14 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
|
|
12028
12297
|
}
|
|
12029
12298
|
|
|
12030
12299
|
ggml_tensor * build_rwkv7_time_mix(
|
|
12300
|
+
llm_graph_input_rs * inp,
|
|
12031
12301
|
ggml_cgraph * gf,
|
|
12032
12302
|
ggml_tensor * cur,
|
|
12033
12303
|
ggml_tensor * x_prev,
|
|
12034
|
-
ggml_tensor * state_copy,
|
|
12035
|
-
ggml_tensor * state_mask,
|
|
12036
12304
|
ggml_tensor *& first_layer_value,
|
|
12037
12305
|
const llama_ubatch & ubatch,
|
|
12038
12306
|
int il) const {
|
|
12039
|
-
const
|
|
12307
|
+
const auto * kv_state = static_cast<const llama_memory_recurrent_state *>(mstate);
|
|
12040
12308
|
|
|
12041
12309
|
const auto n_tokens = ubatch.n_tokens;
|
|
12042
12310
|
const auto n_seqs = ubatch.n_seqs;
|
|
@@ -12045,7 +12313,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
|
|
12045
12313
|
const auto head_count = n_embd / head_size;
|
|
12046
12314
|
const auto n_seq_tokens = ubatch.n_seq_tokens;
|
|
12047
12315
|
|
|
12048
|
-
const auto kv_head =
|
|
12316
|
+
const auto kv_head = kv_state->get_head();
|
|
12049
12317
|
|
|
12050
12318
|
const auto & layer = model.layers[il];
|
|
12051
12319
|
|
|
@@ -12115,9 +12383,9 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
|
|
12115
12383
|
v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens);
|
|
12116
12384
|
a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
|
|
12117
12385
|
|
|
12118
|
-
ggml_tensor * wkv_state =
|
|
12119
|
-
gf,
|
|
12120
|
-
hparams.
|
|
12386
|
+
ggml_tensor * wkv_state = build_rs(
|
|
12387
|
+
inp, gf, kv_state->get_s_l(il),
|
|
12388
|
+
hparams.n_embd_s(), n_seqs);
|
|
12121
12389
|
|
|
12122
12390
|
ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
|
|
12123
12391
|
cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0);
|
|
@@ -12130,9 +12398,9 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
|
|
12130
12398
|
wkv_state,
|
|
12131
12399
|
ggml_view_1d(
|
|
12132
12400
|
ctx0,
|
|
12133
|
-
|
|
12134
|
-
hparams.
|
|
12135
|
-
hparams.
|
|
12401
|
+
kv_state->get_s_l(il),
|
|
12402
|
+
hparams.n_embd_s() * n_seqs,
|
|
12403
|
+
hparams.n_embd_s() * kv_head * ggml_element_size(kv_state->get_s_l(il))
|
|
12136
12404
|
)
|
|
12137
12405
|
)
|
|
12138
12406
|
);
|
|
@@ -12173,8 +12441,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
|
|
|
12173
12441
|
inpL = build_inp_embd(model.tok_embd);
|
|
12174
12442
|
inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
|
|
12175
12443
|
|
|
12176
|
-
|
|
12177
|
-
ggml_tensor * state_mask = build_inp_s_mask();
|
|
12444
|
+
auto * rs_inp = build_rs_inp();
|
|
12178
12445
|
|
|
12179
12446
|
const auto n_embd = hparams.n_embd;
|
|
12180
12447
|
const auto n_seq_tokens = ubatch.n_seq_tokens;
|
|
@@ -12184,9 +12451,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
|
|
|
12184
12451
|
const llama_layer * layer = &model.layers[il];
|
|
12185
12452
|
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
|
|
12186
12453
|
|
|
12187
|
-
ggml_tensor * token_shift = build_rwkv_token_shift_load(
|
|
12188
|
-
gf, state_copy, state_mask, ubatch, il
|
|
12189
|
-
);
|
|
12454
|
+
ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
|
|
12190
12455
|
|
|
12191
12456
|
ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
|
|
12192
12457
|
ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
|
|
@@ -12201,7 +12466,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
|
|
|
12201
12466
|
1
|
|
12202
12467
|
);
|
|
12203
12468
|
|
|
12204
|
-
cur = build_rwkv7_time_mix(gf, att_norm, x_prev,
|
|
12469
|
+
cur = build_rwkv7_time_mix(rs_inp, gf, att_norm, x_prev, v_first, ubatch, il);
|
|
12205
12470
|
|
|
12206
12471
|
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
|
12207
12472
|
cb(ffn_inp, "ffn_inp", il);
|
|
@@ -12259,7 +12524,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
|
|
|
12259
12524
|
|
|
12260
12525
|
struct llm_build_arwkv7 : public llm_build_rwkv7_base {
|
|
12261
12526
|
llm_build_arwkv7(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv7_base(model, params) {
|
|
12262
|
-
GGML_ASSERT(n_embd == hparams.
|
|
12527
|
+
GGML_ASSERT(n_embd == hparams.n_embd_r());
|
|
12263
12528
|
|
|
12264
12529
|
ggml_tensor * cur;
|
|
12265
12530
|
ggml_tensor * inpL;
|
|
@@ -12267,8 +12532,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
|
|
|
12267
12532
|
|
|
12268
12533
|
inpL = build_inp_embd(model.tok_embd);
|
|
12269
12534
|
|
|
12270
|
-
|
|
12271
|
-
ggml_tensor * state_mask = build_inp_s_mask();
|
|
12535
|
+
auto * rs_inp = build_rs_inp();
|
|
12272
12536
|
|
|
12273
12537
|
const auto n_embd = hparams.n_embd;
|
|
12274
12538
|
const auto n_seq_tokens = ubatch.n_seq_tokens;
|
|
@@ -12278,9 +12542,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
|
|
|
12278
12542
|
const llama_layer * layer = &model.layers[il];
|
|
12279
12543
|
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
|
|
12280
12544
|
|
|
12281
|
-
ggml_tensor * token_shift = build_rwkv_token_shift_load(
|
|
12282
|
-
gf, state_copy, state_mask, ubatch, il
|
|
12283
|
-
);
|
|
12545
|
+
ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
|
|
12284
12546
|
|
|
12285
12547
|
ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
|
|
12286
12548
|
cb(att_norm, "attn_norm", il);
|
|
@@ -12292,7 +12554,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
|
|
|
12292
12554
|
1
|
|
12293
12555
|
);
|
|
12294
12556
|
|
|
12295
|
-
cur = build_rwkv7_time_mix(gf, att_norm, x_prev,
|
|
12557
|
+
cur = build_rwkv7_time_mix(rs_inp, gf, att_norm, x_prev, v_first, ubatch, il);
|
|
12296
12558
|
|
|
12297
12559
|
token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
|
|
12298
12560
|
ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
|
|
@@ -13184,69 +13446,375 @@ struct llm_build_bailingmoe : public llm_graph_context {
|
|
|
13184
13446
|
}
|
|
13185
13447
|
};
|
|
13186
13448
|
|
|
13187
|
-
|
|
13188
|
-
|
|
13449
|
+
struct llm_build_dots1 : public llm_graph_context {
|
|
13450
|
+
llm_build_dots1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
13451
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
13189
13452
|
|
|
13190
|
-
|
|
13191
|
-
|
|
13192
|
-
case LLM_ARCH_JINA_BERT_V2:
|
|
13193
|
-
case LLM_ARCH_NOMIC_BERT:
|
|
13194
|
-
case LLM_ARCH_NOMIC_BERT_MOE:
|
|
13195
|
-
case LLM_ARCH_WAVTOKENIZER_DEC:
|
|
13196
|
-
{
|
|
13197
|
-
res = nullptr;
|
|
13198
|
-
} break;
|
|
13199
|
-
case LLM_ARCH_MAMBA:
|
|
13200
|
-
case LLM_ARCH_RWKV6:
|
|
13201
|
-
case LLM_ARCH_RWKV6QWEN2:
|
|
13202
|
-
case LLM_ARCH_RWKV7:
|
|
13203
|
-
case LLM_ARCH_ARWKV7:
|
|
13204
|
-
{
|
|
13205
|
-
res = new llama_kv_cache_recurrent(
|
|
13206
|
-
*this,
|
|
13207
|
-
GGML_TYPE_F32,
|
|
13208
|
-
GGML_TYPE_F32,
|
|
13209
|
-
cparams.offload_kqv,
|
|
13210
|
-
std::max((uint32_t) 1, cparams.n_seq_max),
|
|
13211
|
-
cparams.n_seq_max);
|
|
13212
|
-
} break;
|
|
13213
|
-
default:
|
|
13214
|
-
{
|
|
13215
|
-
const auto padding = llama_kv_cache_unified::get_padding(cparams);
|
|
13453
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
13454
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
13216
13455
|
|
|
13217
|
-
|
|
13456
|
+
ggml_tensor * cur;
|
|
13457
|
+
ggml_tensor * inpL;
|
|
13218
13458
|
|
|
13219
|
-
|
|
13459
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
13220
13460
|
|
|
13221
|
-
|
|
13222
|
-
|
|
13461
|
+
// inp_pos - contains the positions
|
|
13462
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
13223
13463
|
|
|
13224
|
-
|
|
13225
|
-
|
|
13226
|
-
|
|
13227
|
-
|
|
13228
|
-
|
|
13229
|
-
|
|
13230
|
-
|
|
13231
|
-
|
|
13232
|
-
|
|
13233
|
-
|
|
13234
|
-
|
|
13235
|
-
|
|
13236
|
-
|
|
13464
|
+
auto * inp_attn = build_attn_inp_kv_unified();
|
|
13465
|
+
|
|
13466
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
13467
|
+
ggml_tensor * inpSA = inpL;
|
|
13468
|
+
|
|
13469
|
+
// norm
|
|
13470
|
+
cur = build_norm(inpL,
|
|
13471
|
+
model.layers[il].attn_norm, NULL,
|
|
13472
|
+
LLM_NORM_RMS, il);
|
|
13473
|
+
cb(cur, "attn_norm", il);
|
|
13474
|
+
|
|
13475
|
+
// self_attention
|
|
13476
|
+
{
|
|
13477
|
+
// compute Q and K and RoPE them
|
|
13478
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
13479
|
+
cb(Qcur, "Qcur", il);
|
|
13480
|
+
|
|
13481
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
13482
|
+
cb(Kcur, "Kcur", il);
|
|
13483
|
+
|
|
13484
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
13485
|
+
cb(Vcur, "Vcur", il);
|
|
13486
|
+
|
|
13487
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
13488
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
13489
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
13490
|
+
|
|
13491
|
+
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
|
13492
|
+
cb(Qcur, "Qcur_normed", il);
|
|
13493
|
+
|
|
13494
|
+
Qcur = ggml_rope_ext(
|
|
13495
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
13496
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
13497
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
13498
|
+
);
|
|
13499
|
+
|
|
13500
|
+
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
|
13501
|
+
cb(Kcur, "Kcur_normed", il);
|
|
13502
|
+
|
|
13503
|
+
Kcur = ggml_rope_ext(
|
|
13504
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
13505
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
13506
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
13507
|
+
);
|
|
13508
|
+
|
|
13509
|
+
cb(Qcur, "Qcur", il);
|
|
13510
|
+
cb(Kcur, "Kcur", il);
|
|
13511
|
+
cb(Vcur, "Vcur", il);
|
|
13512
|
+
|
|
13513
|
+
cur = build_attn(inp_attn, gf,
|
|
13514
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
13515
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
13516
|
+
}
|
|
13517
|
+
|
|
13518
|
+
if (il == n_layer - 1) {
|
|
13519
|
+
// skip computing output for unused tokens
|
|
13520
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13521
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
13522
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
13523
|
+
}
|
|
13524
|
+
|
|
13525
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
13526
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
13527
|
+
|
|
13528
|
+
// MoE branch
|
|
13529
|
+
cur = build_norm(ffn_inp,
|
|
13530
|
+
model.layers[il].ffn_norm, NULL,
|
|
13531
|
+
LLM_NORM_RMS, il);
|
|
13532
|
+
cb(cur, "ffn_norm", il);
|
|
13533
|
+
|
|
13534
|
+
if ((uint32_t) il < hparams.n_layer_dense_lead) {
|
|
13535
|
+
cur = build_ffn(cur,
|
|
13536
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
13537
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
|
13538
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
13539
|
+
NULL,
|
|
13540
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
13541
|
+
cb(cur, "ffn_out", il);
|
|
13542
|
+
} else {
|
|
13543
|
+
ggml_tensor * moe_out =
|
|
13544
|
+
build_moe_ffn(cur,
|
|
13545
|
+
model.layers[il].ffn_gate_inp,
|
|
13546
|
+
model.layers[il].ffn_up_exps,
|
|
13547
|
+
model.layers[il].ffn_gate_exps,
|
|
13548
|
+
model.layers[il].ffn_down_exps,
|
|
13549
|
+
model.layers[il].ffn_exp_probs_b,
|
|
13550
|
+
n_expert, n_expert_used,
|
|
13551
|
+
LLM_FFN_SILU, hparams.expert_weights_norm,
|
|
13552
|
+
true, hparams.expert_weights_scale,
|
|
13553
|
+
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
|
13554
|
+
il);
|
|
13555
|
+
cb(moe_out, "ffn_moe_out", il);
|
|
13556
|
+
|
|
13557
|
+
{
|
|
13558
|
+
ggml_tensor * ffn_shexp = build_ffn(cur,
|
|
13559
|
+
model.layers[il].ffn_up_shexp, NULL, NULL,
|
|
13560
|
+
model.layers[il].ffn_gate_shexp, NULL, NULL,
|
|
13561
|
+
model.layers[il].ffn_down_shexp, NULL, NULL,
|
|
13562
|
+
NULL,
|
|
13563
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
13564
|
+
cb(ffn_shexp, "ffn_shexp", il);
|
|
13565
|
+
|
|
13566
|
+
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
|
13567
|
+
cb(cur, "ffn_out", il);
|
|
13568
|
+
}
|
|
13569
|
+
}
|
|
13237
13570
|
|
|
13238
|
-
|
|
13571
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
13572
|
+
|
|
13573
|
+
cur = build_cvec(cur, il);
|
|
13574
|
+
cb(cur, "l_out", il);
|
|
13575
|
+
|
|
13576
|
+
// input for next layer
|
|
13577
|
+
inpL = cur;
|
|
13578
|
+
}
|
|
13579
|
+
|
|
13580
|
+
cur = inpL;
|
|
13581
|
+
|
|
13582
|
+
cur = build_norm(cur,
|
|
13583
|
+
model.output_norm, NULL,
|
|
13584
|
+
LLM_NORM_RMS, -1);
|
|
13585
|
+
|
|
13586
|
+
cb(cur, "result_norm", -1);
|
|
13587
|
+
res->t_embd = cur;
|
|
13588
|
+
|
|
13589
|
+
// lm_head
|
|
13590
|
+
cur = build_lora_mm(model.output, cur);
|
|
13591
|
+
|
|
13592
|
+
cb(cur, "result_output", -1);
|
|
13593
|
+
res->t_logits = cur;
|
|
13594
|
+
|
|
13595
|
+
ggml_build_forward_expand(gf, cur);
|
|
13596
|
+
}
|
|
13597
|
+
};
|
|
13598
|
+
|
|
13599
|
+
struct llm_build_arcee : public llm_graph_context {
|
|
13600
|
+
llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
13601
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
13602
|
+
|
|
13603
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
13604
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
13605
|
+
|
|
13606
|
+
ggml_tensor * cur;
|
|
13607
|
+
ggml_tensor * inpL;
|
|
13608
|
+
|
|
13609
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
13610
|
+
|
|
13611
|
+
// inp_pos - contains the positions
|
|
13612
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
13613
|
+
|
|
13614
|
+
auto * inp_attn = build_attn_inp_kv_unified();
|
|
13615
|
+
|
|
13616
|
+
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
13617
|
+
|
|
13618
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
13619
|
+
ggml_tensor * inpSA = inpL;
|
|
13620
|
+
|
|
13621
|
+
// norm
|
|
13622
|
+
cur = build_norm(inpL,
|
|
13623
|
+
model.layers[il].attn_norm, NULL,
|
|
13624
|
+
LLM_NORM_RMS, il);
|
|
13625
|
+
cb(cur, "attn_norm", il);
|
|
13626
|
+
|
|
13627
|
+
// self-attention
|
|
13628
|
+
{
|
|
13629
|
+
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
|
13630
|
+
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
|
13631
|
+
|
|
13632
|
+
// compute Q and K and RoPE them
|
|
13633
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
13634
|
+
cb(Qcur, "Qcur", il);
|
|
13635
|
+
if (model.layers[il].bq) {
|
|
13636
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
13637
|
+
cb(Qcur, "Qcur", il);
|
|
13638
|
+
}
|
|
13639
|
+
|
|
13640
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
13641
|
+
cb(Kcur, "Kcur", il);
|
|
13642
|
+
if (model.layers[il].bk) {
|
|
13643
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
13644
|
+
cb(Kcur, "Kcur", il);
|
|
13645
|
+
}
|
|
13646
|
+
|
|
13647
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
13648
|
+
cb(Vcur, "Vcur", il);
|
|
13649
|
+
if (model.layers[il].bv) {
|
|
13650
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
13651
|
+
cb(Vcur, "Vcur", il);
|
|
13652
|
+
}
|
|
13653
|
+
|
|
13654
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
13655
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
13656
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
13657
|
+
|
|
13658
|
+
Qcur = ggml_rope_ext(
|
|
13659
|
+
ctx0, Qcur, inp_pos, rope_factors,
|
|
13660
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
13661
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
13662
|
+
);
|
|
13663
|
+
|
|
13664
|
+
Kcur = ggml_rope_ext(
|
|
13665
|
+
ctx0, Kcur, inp_pos, rope_factors,
|
|
13666
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
13667
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
13668
|
+
);
|
|
13669
|
+
|
|
13670
|
+
cb(Qcur, "Qcur", il);
|
|
13671
|
+
cb(Kcur, "Kcur", il);
|
|
13672
|
+
cb(Vcur, "Vcur", il);
|
|
13673
|
+
|
|
13674
|
+
cur = build_attn(inp_attn, gf,
|
|
13675
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
13676
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
13677
|
+
cb(cur, "attn_out", il);
|
|
13678
|
+
}
|
|
13679
|
+
|
|
13680
|
+
if (il == n_layer - 1) {
|
|
13681
|
+
// skip computing output for unused tokens
|
|
13682
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13683
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
13684
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
13685
|
+
}
|
|
13686
|
+
|
|
13687
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
13688
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
13689
|
+
|
|
13690
|
+
// feed-forward network
|
|
13691
|
+
// ARCEE uses relu^2 instead of silu
|
|
13692
|
+
cur = build_norm(ffn_inp,
|
|
13693
|
+
model.layers[il].ffn_norm, NULL,
|
|
13694
|
+
LLM_NORM_RMS, il);
|
|
13695
|
+
cb(cur, "ffn_norm", il);
|
|
13696
|
+
|
|
13697
|
+
cur = build_ffn(cur,
|
|
13698
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
13699
|
+
NULL, NULL, NULL,
|
|
13700
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
13701
|
+
NULL,
|
|
13702
|
+
LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
|
|
13703
|
+
cb(cur, "ffn_out", il);
|
|
13704
|
+
|
|
13705
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
13706
|
+
cb(cur, "ffn_out", il);
|
|
13707
|
+
|
|
13708
|
+
cur = build_cvec(cur, il);
|
|
13709
|
+
cb(cur, "l_out", il);
|
|
13710
|
+
|
|
13711
|
+
// input for next layer
|
|
13712
|
+
inpL = cur;
|
|
13713
|
+
}
|
|
13714
|
+
|
|
13715
|
+
cur = inpL;
|
|
13716
|
+
|
|
13717
|
+
cur = build_norm(cur,
|
|
13718
|
+
model.output_norm, NULL,
|
|
13719
|
+
LLM_NORM_RMS, -1);
|
|
13720
|
+
|
|
13721
|
+
cb(cur, "result_norm", -1);
|
|
13722
|
+
res->t_embd = cur;
|
|
13723
|
+
|
|
13724
|
+
// lm_head
|
|
13725
|
+
cur = build_lora_mm(model.output, cur);
|
|
13726
|
+
|
|
13727
|
+
cb(cur, "result_output", -1);
|
|
13728
|
+
res->t_logits = cur;
|
|
13729
|
+
|
|
13730
|
+
ggml_build_forward_expand(gf, cur);
|
|
13731
|
+
}
|
|
13732
|
+
};
|
|
13733
|
+
|
|
13734
|
+
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
|
|
13735
|
+
llama_memory_i * res;
|
|
13736
|
+
|
|
13737
|
+
switch (arch) {
|
|
13738
|
+
// Models that need specific instantiation should be handled in the
|
|
13739
|
+
// switch statement
|
|
13740
|
+
case LLM_ARCH_BERT:
|
|
13741
|
+
case LLM_ARCH_JINA_BERT_V2:
|
|
13742
|
+
case LLM_ARCH_NOMIC_BERT:
|
|
13743
|
+
case LLM_ARCH_NOMIC_BERT_MOE:
|
|
13744
|
+
case LLM_ARCH_NEO_BERT:
|
|
13745
|
+
case LLM_ARCH_WAVTOKENIZER_DEC:
|
|
13746
|
+
{
|
|
13747
|
+
res = nullptr;
|
|
13748
|
+
} break;
|
|
13749
|
+
// Models that need standard caching should rely on recurrent/hybrid
|
|
13750
|
+
// checks
|
|
13751
|
+
default:
|
|
13752
|
+
{
|
|
13753
|
+
if (llm_arch_is_recurrent(arch)) {
|
|
13754
|
+
res = new llama_memory_recurrent(
|
|
13239
13755
|
*this,
|
|
13240
13756
|
nullptr,
|
|
13241
|
-
|
|
13242
|
-
|
|
13243
|
-
!cparams.flash_attn,
|
|
13757
|
+
GGML_TYPE_F32,
|
|
13758
|
+
GGML_TYPE_F32,
|
|
13244
13759
|
cparams.offload_kqv,
|
|
13245
|
-
cparams.
|
|
13246
|
-
cparams.n_seq_max
|
|
13247
|
-
|
|
13248
|
-
|
|
13249
|
-
|
|
13760
|
+
std::max((uint32_t) 1, cparams.n_seq_max),
|
|
13761
|
+
cparams.n_seq_max);
|
|
13762
|
+
} else if (llm_arch_is_hybrid(arch)) {
|
|
13763
|
+
const auto padding = llama_kv_cache_unified::get_padding(cparams);
|
|
13764
|
+
|
|
13765
|
+
cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
|
|
13766
|
+
|
|
13767
|
+
res = new llama_memory_hybrid(
|
|
13768
|
+
/* model */ *this,
|
|
13769
|
+
/* attn_type_k */ params.type_k,
|
|
13770
|
+
/* attn_type_v */ params.type_v,
|
|
13771
|
+
/* attn_v_trans */ !cparams.flash_attn,
|
|
13772
|
+
/* attn_kv_size */ cparams.n_ctx,
|
|
13773
|
+
/* attn_n_pad */ padding,
|
|
13774
|
+
/* attn_n_swa */ hparams.n_swa,
|
|
13775
|
+
/* attn_swa_type */ hparams.swa_type,
|
|
13776
|
+
/* recurrent_type_k */ GGML_TYPE_F32,
|
|
13777
|
+
/* recurrent_type_v */ GGML_TYPE_F32,
|
|
13778
|
+
/* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
|
|
13779
|
+
/* n_seq_max */ cparams.n_seq_max,
|
|
13780
|
+
/* offload */ cparams.offload_kqv);
|
|
13781
|
+
} else {
|
|
13782
|
+
const auto padding = llama_kv_cache_unified::get_padding(cparams);
|
|
13783
|
+
|
|
13784
|
+
cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
|
|
13785
|
+
|
|
13786
|
+
LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
|
|
13787
|
+
|
|
13788
|
+
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
|
|
13789
|
+
GGML_ASSERT(hparams.is_swa_any());
|
|
13790
|
+
|
|
13791
|
+
res = new llama_kv_cache_unified_iswa(
|
|
13792
|
+
*this,
|
|
13793
|
+
params.type_k,
|
|
13794
|
+
params.type_v,
|
|
13795
|
+
!cparams.flash_attn,
|
|
13796
|
+
cparams.offload_kqv,
|
|
13797
|
+
params.swa_full,
|
|
13798
|
+
cparams.n_ctx,
|
|
13799
|
+
cparams.n_seq_max,
|
|
13800
|
+
cparams.n_ubatch,
|
|
13801
|
+
padding);
|
|
13802
|
+
} else {
|
|
13803
|
+
GGML_ASSERT(!hparams.is_swa_any());
|
|
13804
|
+
|
|
13805
|
+
res = new llama_kv_cache_unified(
|
|
13806
|
+
*this,
|
|
13807
|
+
nullptr,
|
|
13808
|
+
params.type_k,
|
|
13809
|
+
params.type_v,
|
|
13810
|
+
!cparams.flash_attn,
|
|
13811
|
+
cparams.offload_kqv,
|
|
13812
|
+
cparams.n_ctx,
|
|
13813
|
+
cparams.n_seq_max,
|
|
13814
|
+
padding,
|
|
13815
|
+
hparams.n_swa,
|
|
13816
|
+
hparams.swa_type);
|
|
13817
|
+
}
|
|
13250
13818
|
}
|
|
13251
13819
|
}
|
|
13252
13820
|
}
|
|
@@ -13300,6 +13868,10 @@ llm_graph_result_ptr llama_model::build_graph(
|
|
|
13300
13868
|
{
|
|
13301
13869
|
llm = std::make_unique<llm_build_bert>(*this, params, gf);
|
|
13302
13870
|
} break;
|
|
13871
|
+
case LLM_ARCH_NEO_BERT:
|
|
13872
|
+
{
|
|
13873
|
+
llm = std::make_unique<llm_build_neo_bert>(*this, params, gf);
|
|
13874
|
+
} break;
|
|
13303
13875
|
case LLM_ARCH_BLOOM:
|
|
13304
13876
|
{
|
|
13305
13877
|
llm = std::make_unique<llm_build_bloom>(*this, params, gf);
|
|
@@ -13522,6 +14094,14 @@ llm_graph_result_ptr llama_model::build_graph(
|
|
|
13522
14094
|
{
|
|
13523
14095
|
llm = std::make_unique<llm_build_bailingmoe>(*this, params, gf);
|
|
13524
14096
|
} break;
|
|
14097
|
+
case LLM_ARCH_DOTS1:
|
|
14098
|
+
{
|
|
14099
|
+
llm = std::make_unique<llm_build_dots1>(*this, params, gf);
|
|
14100
|
+
} break;
|
|
14101
|
+
case LLM_ARCH_ARCEE:
|
|
14102
|
+
{
|
|
14103
|
+
llm = std::make_unique<llm_build_arcee>(*this, params, gf);
|
|
14104
|
+
} break;
|
|
13525
14105
|
default:
|
|
13526
14106
|
GGML_ABORT("fatal error");
|
|
13527
14107
|
}
|
|
@@ -13593,6 +14173,22 @@ int32_t llama_model_n_head_kv(const llama_model * model) {
|
|
|
13593
14173
|
return model->hparams.n_head_kv();
|
|
13594
14174
|
}
|
|
13595
14175
|
|
|
14176
|
+
int32_t llama_model_n_swa(const llama_model * model) {
|
|
14177
|
+
return model->hparams.n_swa;
|
|
14178
|
+
}
|
|
14179
|
+
|
|
14180
|
+
uint32_t llama_model_n_cls_out(const struct llama_model * model) {
|
|
14181
|
+
return model->hparams.n_cls_out;
|
|
14182
|
+
}
|
|
14183
|
+
|
|
14184
|
+
const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) {
|
|
14185
|
+
if (i < model->classifier_labels.size()) {
|
|
14186
|
+
return model->classifier_labels[i].c_str();
|
|
14187
|
+
}
|
|
14188
|
+
|
|
14189
|
+
return nullptr;
|
|
14190
|
+
}
|
|
14191
|
+
|
|
13596
14192
|
// deprecated
|
|
13597
14193
|
int32_t llama_n_ctx_train(const llama_model * model) {
|
|
13598
14194
|
return llama_model_n_ctx_train(model);
|
|
@@ -13655,6 +14251,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
|
13655
14251
|
case LLM_ARCH_GRANITE_MOE:
|
|
13656
14252
|
case LLM_ARCH_CHAMELEON:
|
|
13657
14253
|
case LLM_ARCH_BAILINGMOE:
|
|
14254
|
+
case LLM_ARCH_NEO_BERT:
|
|
14255
|
+
case LLM_ARCH_ARCEE:
|
|
13658
14256
|
return LLAMA_ROPE_TYPE_NORM;
|
|
13659
14257
|
|
|
13660
14258
|
// the pairs of head values are offset by n_rot/2
|
|
@@ -13688,6 +14286,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
|
13688
14286
|
case LLM_ARCH_NEMOTRON:
|
|
13689
14287
|
case LLM_ARCH_EXAONE:
|
|
13690
14288
|
case LLM_ARCH_MINICPM3:
|
|
14289
|
+
case LLM_ARCH_DOTS1:
|
|
13691
14290
|
return LLAMA_ROPE_TYPE_NEOX;
|
|
13692
14291
|
|
|
13693
14292
|
case LLM_ARCH_QWEN2VL:
|
|
@@ -13753,7 +14352,7 @@ uint64_t llama_model_size(const llama_model * model) {
|
|
|
13753
14352
|
}
|
|
13754
14353
|
|
|
13755
14354
|
const char * llama_model_chat_template(const llama_model * model, const char * name) {
|
|
13756
|
-
const auto key = name ? LLM_KV(model->arch, name)(
|
|
14355
|
+
const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
|
|
13757
14356
|
: LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
|
|
13758
14357
|
const auto & it = model->gguf_kv.find(key);
|
|
13759
14358
|
if (it == model->gguf_kv.end()) {
|
|
@@ -13795,14 +14394,7 @@ llama_token llama_model_decoder_start_token(const llama_model * model) {
|
|
|
13795
14394
|
}
|
|
13796
14395
|
|
|
13797
14396
|
bool llama_model_is_recurrent(const llama_model * model) {
|
|
13798
|
-
|
|
13799
|
-
case LLM_ARCH_MAMBA: return true;
|
|
13800
|
-
case LLM_ARCH_RWKV6: return true;
|
|
13801
|
-
case LLM_ARCH_RWKV6QWEN2: return true;
|
|
13802
|
-
case LLM_ARCH_RWKV7: return true;
|
|
13803
|
-
case LLM_ARCH_ARWKV7: return true;
|
|
13804
|
-
default: return false;
|
|
13805
|
-
}
|
|
14397
|
+
return llm_arch_is_recurrent(model->arch);
|
|
13806
14398
|
}
|
|
13807
14399
|
|
|
13808
14400
|
const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
|