@novastera-oss/llamarn 0.2.6 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +134 -36
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +2 -2
- package/cpp/LlamaCppModel.h +3 -3
- package/cpp/PureCppImpl.cpp +1 -1
- package/cpp/PureCppImpl.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +15 -4
- package/cpp/llama.cpp/Makefile +2 -2
- package/cpp/llama.cpp/README.md +32 -13
- package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
- package/cpp/llama.cpp/common/arg.cpp +30 -6
- package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
- package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
- package/cpp/llama.cpp/common/chat-parser.h +2 -0
- package/cpp/llama.cpp/common/chat.cpp +12 -9
- package/cpp/llama.cpp/common/chat.h +1 -1
- package/cpp/llama.cpp/common/common.cpp +50 -40
- package/cpp/llama.cpp/common/common.h +5 -2
- package/cpp/llama.cpp/common/speculative.cpp +6 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +97 -56
- package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -2
- package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +47 -13
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +93 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2174 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +7 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1555 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +2 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +5 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +6 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +33 -8
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +135 -100
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +908 -3
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +19 -24
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +21 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +121 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +2 -96
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +164 -38
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +32 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +26 -29
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +431 -247
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +0 -6
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +57 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +4 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +14 -3
- package/cpp/llama.cpp/include/llama.h +134 -36
- package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/src/llama-arch.cpp +95 -3
- package/cpp/llama.cpp/src/llama-arch.h +7 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +270 -19
- package/cpp/llama.cpp/src/llama-batch.h +36 -11
- package/cpp/llama.cpp/src/llama-chat.cpp +19 -2
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +313 -213
- package/cpp/llama.cpp/src/llama-context.h +16 -12
- package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
- package/cpp/llama.cpp/src/llama-cparams.h +1 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +249 -129
- package/cpp/llama.cpp/src/llama-graph.h +90 -34
- package/cpp/llama.cpp/src/llama-hparams.cpp +6 -2
- package/cpp/llama.cpp/src/llama-hparams.h +8 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +82 -50
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +23 -26
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +292 -174
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +68 -38
- package/cpp/llama.cpp/src/llama-kv-cells.h +18 -13
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +247 -0
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +143 -0
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +266 -282
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +54 -57
- package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
- package/cpp/llama.cpp/src/llama-memory.h +64 -23
- package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
- package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/cpp/llama.cpp/src/llama-model.cpp +726 -141
- package/cpp/llama.cpp/src/llama-model.h +4 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +2 -1
- package/cpp/llama.cpp/src/llama-vocab.cpp +32 -23
- package/cpp/llama.cpp/src/llama.cpp +11 -7
- package/cpp/llama.cpp/src/unicode.cpp +5 -0
- package/cpp/rn-completion.cpp +2 -2
- package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
- package/ios/include/chat.h +1 -1
- package/ios/include/common.h +5 -2
- package/ios/include/llama.h +134 -36
- package/ios/libs/llama.xcframework/Info.plist +18 -18
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4689
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3622
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3744 -3624
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4689
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3622
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4900 -4725
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4871 -4746
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3773 -3652
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -2
- package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
- package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
- /package/cpp/{rn-utils.hpp → rn-utils.h} +0 -0
|
@@ -8,7 +8,8 @@
|
|
|
8
8
|
|
|
9
9
|
#include "llama-kv-cache-unified.h"
|
|
10
10
|
#include "llama-kv-cache-unified-iswa.h"
|
|
11
|
-
#include "llama-kv-cache-recurrent.h"
|
|
11
|
+
#include "llama-memory-hybrid.h"
|
|
12
|
+
#include "llama-memory-recurrent.h"
|
|
12
13
|
|
|
13
14
|
#include "ggml-cpp.h"
|
|
14
15
|
|
|
@@ -80,6 +81,7 @@ const char * llm_type_name(llm_type type) {
|
|
|
80
81
|
case LLM_TYPE_40B: return "40B";
|
|
81
82
|
case LLM_TYPE_65B: return "65B";
|
|
82
83
|
case LLM_TYPE_70B: return "70B";
|
|
84
|
+
case LLM_TYPE_142B: return "142B";
|
|
83
85
|
case LLM_TYPE_236B: return "236B";
|
|
84
86
|
case LLM_TYPE_290B: return "290B";
|
|
85
87
|
case LLM_TYPE_314B: return "314B";
|
|
@@ -469,6 +471,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
469
471
|
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
|
|
470
472
|
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
|
|
471
473
|
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
|
|
474
|
+
std::fill(
|
|
475
|
+
hparams.recurrent_layer_arr.begin(),
|
|
476
|
+
hparams.recurrent_layer_arr.end(),
|
|
477
|
+
llm_arch_is_recurrent(ml.get_arch()));
|
|
472
478
|
|
|
473
479
|
std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
|
|
474
480
|
|
|
@@ -543,6 +549,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
543
549
|
uint32_t n_vocab = 0;
|
|
544
550
|
ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
|
|
545
551
|
|
|
552
|
+
// for classifier models
|
|
553
|
+
ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
|
|
554
|
+
if (!classifier_labels.empty()) {
|
|
555
|
+
hparams.n_cls_out = classifier_labels.size();
|
|
556
|
+
}
|
|
557
|
+
|
|
546
558
|
// arch-specific KVs
|
|
547
559
|
switch (arch) {
|
|
548
560
|
case LLM_ARCH_LLAMA:
|
|
@@ -592,6 +604,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
592
604
|
hparams.use_kq_norm = false;
|
|
593
605
|
}
|
|
594
606
|
} break;
|
|
607
|
+
case LLM_ARCH_ARCEE:
|
|
608
|
+
{
|
|
609
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
610
|
+
|
|
611
|
+
// Arcee uses the same structure as Llama
|
|
612
|
+
switch (hparams.n_layer) {
|
|
613
|
+
case 36: type = LLM_TYPE_4B; break;
|
|
614
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
615
|
+
}
|
|
616
|
+
} break;
|
|
595
617
|
case LLM_ARCH_DECI:
|
|
596
618
|
{
|
|
597
619
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
@@ -686,7 +708,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
686
708
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
687
709
|
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
|
688
710
|
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
|
689
|
-
ml.get_arr_n(LLM_KV_CLASSIFIER_OUTPUT_LABELS, hparams.n_cls_out, false);
|
|
690
711
|
|
|
691
712
|
switch (hparams.n_layer) {
|
|
692
713
|
case 3:
|
|
@@ -733,6 +754,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
733
754
|
}
|
|
734
755
|
}
|
|
735
756
|
} break;
|
|
757
|
+
case LLM_ARCH_NEO_BERT:
|
|
758
|
+
{
|
|
759
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
760
|
+
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
|
761
|
+
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
|
762
|
+
|
|
763
|
+
if (hparams.n_layer == 28) {
|
|
764
|
+
type = LLM_TYPE_250M;
|
|
765
|
+
}
|
|
766
|
+
} break;
|
|
736
767
|
case LLM_ARCH_BLOOM:
|
|
737
768
|
{
|
|
738
769
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
@@ -956,6 +987,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
956
987
|
case 46: type = LLM_TYPE_27B; break;
|
|
957
988
|
default: type = LLM_TYPE_UNKNOWN;
|
|
958
989
|
}
|
|
990
|
+
|
|
991
|
+
// ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
|
|
992
|
+
hparams.f_attention_scale = type == LLM_TYPE_27B
|
|
993
|
+
? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
|
|
994
|
+
: 1.0f / std::sqrt(float(hparams.n_embd_head_k));
|
|
959
995
|
} break;
|
|
960
996
|
case LLM_ARCH_GEMMA3:
|
|
961
997
|
{
|
|
@@ -976,6 +1012,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
976
1012
|
default: type = LLM_TYPE_UNKNOWN;
|
|
977
1013
|
}
|
|
978
1014
|
|
|
1015
|
+
// ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
|
|
979
1016
|
hparams.f_attention_scale = type == LLM_TYPE_27B
|
|
980
1017
|
? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
|
|
981
1018
|
: 1.0f / std::sqrt(float(hparams.n_embd_head_k));
|
|
@@ -1433,6 +1470,20 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1433
1470
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1434
1471
|
}
|
|
1435
1472
|
} break;
|
|
1473
|
+
case LLM_ARCH_DOTS1:
|
|
1474
|
+
{
|
|
1475
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1476
|
+
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
|
1477
|
+
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
1478
|
+
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
|
1479
|
+
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
|
1480
|
+
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
|
1481
|
+
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
|
|
1482
|
+
switch (hparams.n_layer) {
|
|
1483
|
+
case 62: type = LLM_TYPE_142B; break;
|
|
1484
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1485
|
+
}
|
|
1486
|
+
} break;
|
|
1436
1487
|
default: throw std::runtime_error("unsupported model architecture");
|
|
1437
1488
|
}
|
|
1438
1489
|
|
|
@@ -2176,6 +2227,32 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2176
2227
|
layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
|
|
2177
2228
|
}
|
|
2178
2229
|
} break;
|
|
2230
|
+
case LLM_ARCH_NEO_BERT:
|
|
2231
|
+
{
|
|
2232
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2233
|
+
|
|
2234
|
+
cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
|
|
2235
|
+
cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
2236
|
+
|
|
2237
|
+
cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
|
|
2238
|
+
cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
|
|
2239
|
+
|
|
2240
|
+
output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2241
|
+
|
|
2242
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2243
|
+
auto & layer = layers[i];
|
|
2244
|
+
|
|
2245
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2246
|
+
|
|
2247
|
+
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
|
2248
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
2249
|
+
|
|
2250
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
2251
|
+
|
|
2252
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff*2}, 0);
|
|
2253
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
2254
|
+
}
|
|
2255
|
+
} break;
|
|
2179
2256
|
case LLM_ARCH_JINA_BERT_V2:
|
|
2180
2257
|
{
|
|
2181
2258
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
|
|
@@ -2213,8 +2290,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2213
2290
|
layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
2214
2291
|
layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
2215
2292
|
|
|
2216
|
-
layer.
|
|
2217
|
-
layer.
|
|
2293
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
|
|
2294
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
|
|
2218
2295
|
|
|
2219
2296
|
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
2220
2297
|
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
|
|
@@ -4112,6 +4189,89 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
4112
4189
|
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
|
4113
4190
|
}
|
|
4114
4191
|
} break;
|
|
4192
|
+
case LLM_ARCH_DOTS1:
|
|
4193
|
+
{
|
|
4194
|
+
const int64_t n_ff_exp = hparams.n_ff_exp;
|
|
4195
|
+
const int64_t n_expert_shared = hparams.n_expert_shared;
|
|
4196
|
+
|
|
4197
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
4198
|
+
|
|
4199
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
4200
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
4201
|
+
|
|
4202
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
4203
|
+
auto & layer = layers[i];
|
|
4204
|
+
|
|
4205
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
4206
|
+
|
|
4207
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
4208
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
4209
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
4210
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
|
4211
|
+
|
|
4212
|
+
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
4213
|
+
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
4214
|
+
|
|
4215
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
4216
|
+
|
|
4217
|
+
if (i < (int) hparams.n_layer_dense_lead) {
|
|
4218
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
4219
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
4220
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
4221
|
+
} else {
|
|
4222
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
|
4223
|
+
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
|
|
4224
|
+
|
|
4225
|
+
if (n_expert == 0) {
|
|
4226
|
+
throw std::runtime_error("n_expert must be > 0");
|
|
4227
|
+
}
|
|
4228
|
+
if (n_expert_used == 0) {
|
|
4229
|
+
throw std::runtime_error("n_expert_used must be > 0");
|
|
4230
|
+
}
|
|
4231
|
+
|
|
4232
|
+
// MoE branch
|
|
4233
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
|
4234
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
|
4235
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
|
4236
|
+
|
|
4237
|
+
// Shared expert branch
|
|
4238
|
+
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
|
4239
|
+
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
|
|
4240
|
+
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
|
4241
|
+
}
|
|
4242
|
+
}
|
|
4243
|
+
} break;
|
|
4244
|
+
case LLM_ARCH_ARCEE:
|
|
4245
|
+
{
|
|
4246
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
4247
|
+
|
|
4248
|
+
// output
|
|
4249
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
4250
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
4251
|
+
|
|
4252
|
+
// if output is NULL, init from the input tok embed
|
|
4253
|
+
if (output == NULL) {
|
|
4254
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
4255
|
+
}
|
|
4256
|
+
|
|
4257
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
4258
|
+
auto & layer = layers[i];
|
|
4259
|
+
|
|
4260
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
4261
|
+
|
|
4262
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
4263
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
4264
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
|
4265
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
|
4266
|
+
|
|
4267
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
4268
|
+
|
|
4269
|
+
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
|
4270
|
+
|
|
4271
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
4272
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
4273
|
+
}
|
|
4274
|
+
} break;
|
|
4115
4275
|
default:
|
|
4116
4276
|
throw std::runtime_error("unknown architecture");
|
|
4117
4277
|
}
|
|
@@ -4356,6 +4516,15 @@ void llama_model::print_info() const {
|
|
|
4356
4516
|
LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
|
|
4357
4517
|
LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
|
|
4358
4518
|
LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
|
|
4519
|
+
|
|
4520
|
+
if (!classifier_labels.empty()) {
|
|
4521
|
+
LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
|
|
4522
|
+
|
|
4523
|
+
size_t i = 0;
|
|
4524
|
+
for (auto label : classifier_labels) {
|
|
4525
|
+
LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
|
|
4526
|
+
}
|
|
4527
|
+
}
|
|
4359
4528
|
}
|
|
4360
4529
|
|
|
4361
4530
|
LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
|
|
@@ -6023,7 +6192,7 @@ struct llm_build_bert : public llm_graph_context {
|
|
|
6023
6192
|
model.layers[il].ffn_gate, NULL, NULL,
|
|
6024
6193
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
|
6025
6194
|
NULL,
|
|
6026
|
-
LLM_FFN_GELU, LLM_FFN_PAR, il);
|
|
6195
|
+
model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
|
|
6027
6196
|
cb(cur, "ffn_out", il);
|
|
6028
6197
|
} else {
|
|
6029
6198
|
cur = build_ffn(cur,
|
|
@@ -6054,6 +6223,117 @@ struct llm_build_bert : public llm_graph_context {
|
|
|
6054
6223
|
}
|
|
6055
6224
|
};
|
|
6056
6225
|
|
|
6226
|
+
struct llm_build_neo_bert : public llm_graph_context {
|
|
6227
|
+
llm_build_neo_bert(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
6228
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
6229
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
6230
|
+
|
|
6231
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
6232
|
+
|
|
6233
|
+
ggml_tensor * cur;
|
|
6234
|
+
ggml_tensor * inpL;
|
|
6235
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
6236
|
+
|
|
6237
|
+
// construct input embeddings (token, type, position)
|
|
6238
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
6239
|
+
cb(inpL, "inp_embd", -1);
|
|
6240
|
+
|
|
6241
|
+
auto * inp_attn = build_attn_inp_no_cache();
|
|
6242
|
+
|
|
6243
|
+
// iterate layers
|
|
6244
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
6245
|
+
ggml_tensor * cur = inpL;
|
|
6246
|
+
|
|
6247
|
+
ggml_tensor * Qcur;
|
|
6248
|
+
ggml_tensor * Kcur;
|
|
6249
|
+
ggml_tensor * Vcur;
|
|
6250
|
+
|
|
6251
|
+
// pre-norm
|
|
6252
|
+
cur = build_norm(inpL,
|
|
6253
|
+
model.layers[il].attn_norm, NULL,
|
|
6254
|
+
LLM_NORM_RMS, il);
|
|
6255
|
+
|
|
6256
|
+
// self-attention
|
|
6257
|
+
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
|
6258
|
+
cb(cur, "wqkv", il);
|
|
6259
|
+
|
|
6260
|
+
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
|
6261
|
+
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
|
6262
|
+
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
6263
|
+
|
|
6264
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
6265
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
6266
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
6267
|
+
|
|
6268
|
+
// RoPE
|
|
6269
|
+
Qcur = ggml_rope_ext(
|
|
6270
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
6271
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
6272
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6273
|
+
);
|
|
6274
|
+
|
|
6275
|
+
Kcur = ggml_rope_ext(
|
|
6276
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
6277
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
6278
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6279
|
+
);
|
|
6280
|
+
|
|
6281
|
+
cb(Qcur, "Qcur", il);
|
|
6282
|
+
cb(Kcur, "Kcur", il);
|
|
6283
|
+
cb(Vcur, "Vcur", il);
|
|
6284
|
+
|
|
6285
|
+
cur = build_attn(inp_attn, gf,
|
|
6286
|
+
model.layers[il].wo, nullptr,
|
|
6287
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6288
|
+
cb(cur, "kqv_out", il);
|
|
6289
|
+
|
|
6290
|
+
if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
|
6291
|
+
// skip computing output for unused tokens
|
|
6292
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6293
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
6294
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
6295
|
+
}
|
|
6296
|
+
|
|
6297
|
+
// re-add the layer input
|
|
6298
|
+
cur = ggml_add(ctx0, cur, inpL);
|
|
6299
|
+
|
|
6300
|
+
ggml_tensor * ffn_inp = cur;
|
|
6301
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
6302
|
+
|
|
6303
|
+
// pre-norm
|
|
6304
|
+
cur = build_norm(ffn_inp,
|
|
6305
|
+
model.layers[il].ffn_norm, NULL,
|
|
6306
|
+
LLM_NORM_RMS, il);
|
|
6307
|
+
cb(cur, "ffn_norm", il);
|
|
6308
|
+
|
|
6309
|
+
// feed-forward network
|
|
6310
|
+
cur = build_ffn(cur,
|
|
6311
|
+
model.layers[il].ffn_up,
|
|
6312
|
+
NULL, NULL, NULL, NULL, NULL,
|
|
6313
|
+
model.layers[il].ffn_down,
|
|
6314
|
+
NULL, NULL, NULL,
|
|
6315
|
+
LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
|
|
6316
|
+
|
|
6317
|
+
// attentions bypass the intermediate layer
|
|
6318
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
6319
|
+
|
|
6320
|
+
// input for next layer
|
|
6321
|
+
inpL = cur;
|
|
6322
|
+
}
|
|
6323
|
+
|
|
6324
|
+
cur = inpL;
|
|
6325
|
+
|
|
6326
|
+
cur = build_norm(cur,
|
|
6327
|
+
model.output_norm_enc, NULL,
|
|
6328
|
+
LLM_NORM_RMS, -1);
|
|
6329
|
+
|
|
6330
|
+
cb(cur, "result_embd", -1);
|
|
6331
|
+
res->t_embd = cur;
|
|
6332
|
+
|
|
6333
|
+
ggml_build_forward_expand(gf, cur);
|
|
6334
|
+
}
|
|
6335
|
+
};
|
|
6336
|
+
|
|
6057
6337
|
struct llm_build_bloom : public llm_graph_context {
|
|
6058
6338
|
llm_build_bloom(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
6059
6339
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
@@ -8484,14 +8764,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
|
|
|
8484
8764
|
cb(Kcur, "Kcur", il);
|
|
8485
8765
|
cb(Vcur, "Vcur", il);
|
|
8486
8766
|
|
|
8487
|
-
|
|
8488
|
-
switch (model.type) {
|
|
8489
|
-
case LLM_TYPE_2B:
|
|
8490
|
-
case LLM_TYPE_9B:
|
|
8491
|
-
case LLM_TYPE_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); break;
|
|
8492
|
-
default: GGML_ABORT("fatal error");
|
|
8493
|
-
};
|
|
8494
|
-
cb(Qcur, "Qcur_scaled", il);
|
|
8767
|
+
Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
|
|
8495
8768
|
|
|
8496
8769
|
cur = build_attn(inp_attn, gf,
|
|
8497
8770
|
model.layers[il].wo, NULL,
|
|
@@ -8632,9 +8905,12 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
|
|
|
8632
8905
|
cb(Kcur, "Kcur", il);
|
|
8633
8906
|
cb(Vcur, "Vcur", il);
|
|
8634
8907
|
|
|
8908
|
+
// ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
|
|
8909
|
+
Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
|
|
8910
|
+
|
|
8635
8911
|
cur = build_attn(inp_attn, gf,
|
|
8636
8912
|
model.layers[il].wo, NULL,
|
|
8637
|
-
Qcur, Kcur, Vcur, nullptr, nullptr,
|
|
8913
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
8638
8914
|
}
|
|
8639
8915
|
|
|
8640
8916
|
cur = build_norm(cur,
|
|
@@ -8840,8 +9116,7 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
8840
9116
|
// {n_embd, n_tokens}
|
|
8841
9117
|
inpL = build_inp_embd(model.tok_embd);
|
|
8842
9118
|
|
|
8843
|
-
|
|
8844
|
-
ggml_tensor * state_mask = build_inp_s_mask();
|
|
9119
|
+
auto * rs_inp = build_rs_inp();
|
|
8845
9120
|
|
|
8846
9121
|
for (int il = 0; il < n_layer; ++il) {
|
|
8847
9122
|
// norm
|
|
@@ -8850,8 +9125,7 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
8850
9125
|
LLM_NORM_RMS, il);
|
|
8851
9126
|
cb(cur, "attn_norm", il);
|
|
8852
9127
|
|
|
8853
|
-
|
|
8854
|
-
cur = build_mamba_layer(gf, cur, state_copy, state_mask, ubatch, il);
|
|
9128
|
+
cur = build_mamba_layer(rs_inp, gf, cur, ubatch, il);
|
|
8855
9129
|
|
|
8856
9130
|
if (il == n_layer - 1) {
|
|
8857
9131
|
// skip computing output for unused tokens
|
|
@@ -8889,13 +9163,12 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
8889
9163
|
|
|
8890
9164
|
// TODO: split
|
|
8891
9165
|
ggml_tensor * build_mamba_layer(
|
|
8892
|
-
|
|
8893
|
-
|
|
8894
|
-
|
|
8895
|
-
|
|
8896
|
-
|
|
8897
|
-
|
|
8898
|
-
const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
|
|
9166
|
+
llm_graph_input_rs * inp,
|
|
9167
|
+
ggml_cgraph * gf,
|
|
9168
|
+
ggml_tensor * cur,
|
|
9169
|
+
const llama_ubatch & ubatch,
|
|
9170
|
+
int il) const {
|
|
9171
|
+
const auto * kv_state = static_cast<const llama_memory_recurrent_state *>(mstate);
|
|
8899
9172
|
|
|
8900
9173
|
const auto kv_head = kv_state->get_head();
|
|
8901
9174
|
|
|
@@ -8915,17 +9188,17 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
8915
9188
|
GGML_ASSERT(ubatch.equal_seqs);
|
|
8916
9189
|
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
|
|
8917
9190
|
|
|
8918
|
-
ggml_tensor * conv_states_all = kv_state->
|
|
8919
|
-
ggml_tensor * ssm_states_all = kv_state->
|
|
9191
|
+
ggml_tensor * conv_states_all = kv_state->get_r_l(il);
|
|
9192
|
+
ggml_tensor * ssm_states_all = kv_state->get_s_l(il);
|
|
8920
9193
|
|
|
8921
9194
|
// (ab)using the KV cache to store the states
|
|
8922
|
-
ggml_tensor * conv =
|
|
8923
|
-
gf, conv_states_all,
|
|
8924
|
-
hparams.
|
|
9195
|
+
ggml_tensor * conv = build_rs(
|
|
9196
|
+
inp, gf, conv_states_all,
|
|
9197
|
+
hparams.n_embd_r(), n_seqs);
|
|
8925
9198
|
conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
|
|
8926
|
-
ggml_tensor * ssm =
|
|
8927
|
-
gf, ssm_states_all,
|
|
8928
|
-
hparams.
|
|
9199
|
+
ggml_tensor * ssm = build_rs(
|
|
9200
|
+
inp, gf, ssm_states_all,
|
|
9201
|
+
hparams.n_embd_s(), n_seqs);
|
|
8929
9202
|
ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs);
|
|
8930
9203
|
|
|
8931
9204
|
// {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
|
|
@@ -11636,14 +11909,13 @@ struct llm_build_rwkv6_base : public llm_graph_context {
|
|
|
11636
11909
|
}
|
|
11637
11910
|
|
|
11638
11911
|
ggml_tensor * build_rwkv6_time_mix(
|
|
11912
|
+
llm_graph_input_rs * inp,
|
|
11639
11913
|
ggml_cgraph * gf,
|
|
11640
11914
|
ggml_tensor * cur,
|
|
11641
11915
|
ggml_tensor * x_prev,
|
|
11642
|
-
ggml_tensor * state_copy,
|
|
11643
|
-
ggml_tensor * state_mask,
|
|
11644
11916
|
const llama_ubatch & ubatch,
|
|
11645
11917
|
int il) const {
|
|
11646
|
-
const auto * kv_state = static_cast<const
|
|
11918
|
+
const auto * kv_state = static_cast<const llama_memory_recurrent_state *>(mstate);
|
|
11647
11919
|
|
|
11648
11920
|
const auto n_tokens = ubatch.n_tokens;
|
|
11649
11921
|
const auto n_seqs = ubatch.n_seqs;
|
|
@@ -11764,9 +12036,9 @@ struct llm_build_rwkv6_base : public llm_graph_context {
|
|
|
11764
12036
|
k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w));
|
|
11765
12037
|
}
|
|
11766
12038
|
|
|
11767
|
-
ggml_tensor * wkv_state =
|
|
11768
|
-
gf, kv_state->
|
|
11769
|
-
hparams.
|
|
12039
|
+
ggml_tensor * wkv_state = build_rs(
|
|
12040
|
+
inp, gf, kv_state->get_s_l(il),
|
|
12041
|
+
hparams.n_embd_s(), n_seqs);
|
|
11770
12042
|
|
|
11771
12043
|
ggml_tensor * wkv_output;
|
|
11772
12044
|
if (is_qrwkv) {
|
|
@@ -11784,9 +12056,9 @@ struct llm_build_rwkv6_base : public llm_graph_context {
|
|
|
11784
12056
|
wkv_state,
|
|
11785
12057
|
ggml_view_1d(
|
|
11786
12058
|
ctx0,
|
|
11787
|
-
kv_state->
|
|
11788
|
-
hparams.
|
|
11789
|
-
hparams.
|
|
12059
|
+
kv_state->get_s_l(il),
|
|
12060
|
+
hparams.n_embd_s() * n_seqs,
|
|
12061
|
+
hparams.n_embd_s() * kv_head * ggml_element_size(kv_state->get_s_l(il))
|
|
11790
12062
|
)
|
|
11791
12063
|
)
|
|
11792
12064
|
);
|
|
@@ -11820,8 +12092,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
|
|
|
11820
12092
|
inpL = build_inp_embd(model.tok_embd);
|
|
11821
12093
|
inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
|
|
11822
12094
|
|
|
11823
|
-
|
|
11824
|
-
ggml_tensor * state_mask = build_inp_s_mask();
|
|
12095
|
+
auto * rs_inp = build_rs_inp();
|
|
11825
12096
|
|
|
11826
12097
|
const auto n_embd = hparams.n_embd;
|
|
11827
12098
|
const auto n_seq_tokens = ubatch.n_seq_tokens;
|
|
@@ -11831,9 +12102,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
|
|
|
11831
12102
|
const llama_layer * layer = &model.layers[il];
|
|
11832
12103
|
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
|
|
11833
12104
|
|
|
11834
|
-
ggml_tensor * token_shift = build_rwkv_token_shift_load(
|
|
11835
|
-
gf, state_copy, state_mask, ubatch, il
|
|
11836
|
-
);
|
|
12105
|
+
ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
|
|
11837
12106
|
|
|
11838
12107
|
ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
|
|
11839
12108
|
ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
|
|
@@ -11848,7 +12117,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
|
|
|
11848
12117
|
1
|
|
11849
12118
|
);
|
|
11850
12119
|
|
|
11851
|
-
cur = build_rwkv6_time_mix(gf, att_norm, x_prev,
|
|
12120
|
+
cur = build_rwkv6_time_mix(rs_inp, gf, att_norm, x_prev, ubatch, il);
|
|
11852
12121
|
|
|
11853
12122
|
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
|
11854
12123
|
cb(ffn_inp, "ffn_inp", il);
|
|
@@ -11911,15 +12180,14 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
|
|
|
11911
12180
|
// ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py
|
|
11912
12181
|
struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
|
|
11913
12182
|
llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv6_base(model, params) {
|
|
11914
|
-
GGML_ASSERT(n_embd == hparams.
|
|
12183
|
+
GGML_ASSERT(n_embd == hparams.n_embd_r());
|
|
11915
12184
|
|
|
11916
12185
|
ggml_tensor * cur;
|
|
11917
12186
|
ggml_tensor * inpL;
|
|
11918
12187
|
|
|
11919
12188
|
inpL = build_inp_embd(model.tok_embd);
|
|
11920
12189
|
|
|
11921
|
-
|
|
11922
|
-
ggml_tensor * state_mask = build_inp_s_mask();
|
|
12190
|
+
auto * rs_inp = build_rs_inp();
|
|
11923
12191
|
|
|
11924
12192
|
const auto n_embd = hparams.n_embd;
|
|
11925
12193
|
const auto n_seq_tokens = ubatch.n_seq_tokens;
|
|
@@ -11929,9 +12197,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
|
|
|
11929
12197
|
const llama_layer * layer = &model.layers[il];
|
|
11930
12198
|
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
|
|
11931
12199
|
|
|
11932
|
-
ggml_tensor * token_shift = build_rwkv_token_shift_load(
|
|
11933
|
-
gf, state_copy, state_mask, ubatch, il
|
|
11934
|
-
);
|
|
12200
|
+
ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
|
|
11935
12201
|
|
|
11936
12202
|
ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
|
|
11937
12203
|
cb(att_norm, "attn_norm", il);
|
|
@@ -11943,7 +12209,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
|
|
|
11943
12209
|
1
|
|
11944
12210
|
);
|
|
11945
12211
|
|
|
11946
|
-
cur = build_rwkv6_time_mix(gf, att_norm, x_prev,
|
|
12212
|
+
cur = build_rwkv6_time_mix(rs_inp, gf, att_norm, x_prev, ubatch, il);
|
|
11947
12213
|
|
|
11948
12214
|
token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
|
|
11949
12215
|
ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
|
|
@@ -12031,15 +12297,14 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
|
|
12031
12297
|
}
|
|
12032
12298
|
|
|
12033
12299
|
ggml_tensor * build_rwkv7_time_mix(
|
|
12300
|
+
llm_graph_input_rs * inp,
|
|
12034
12301
|
ggml_cgraph * gf,
|
|
12035
12302
|
ggml_tensor * cur,
|
|
12036
12303
|
ggml_tensor * x_prev,
|
|
12037
|
-
ggml_tensor * state_copy,
|
|
12038
|
-
ggml_tensor * state_mask,
|
|
12039
12304
|
ggml_tensor *& first_layer_value,
|
|
12040
12305
|
const llama_ubatch & ubatch,
|
|
12041
12306
|
int il) const {
|
|
12042
|
-
const auto * kv_state = static_cast<const
|
|
12307
|
+
const auto * kv_state = static_cast<const llama_memory_recurrent_state *>(mstate);
|
|
12043
12308
|
|
|
12044
12309
|
const auto n_tokens = ubatch.n_tokens;
|
|
12045
12310
|
const auto n_seqs = ubatch.n_seqs;
|
|
@@ -12118,9 +12383,9 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
|
|
12118
12383
|
v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens);
|
|
12119
12384
|
a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
|
|
12120
12385
|
|
|
12121
|
-
ggml_tensor * wkv_state =
|
|
12122
|
-
gf, kv_state->
|
|
12123
|
-
hparams.
|
|
12386
|
+
ggml_tensor * wkv_state = build_rs(
|
|
12387
|
+
inp, gf, kv_state->get_s_l(il),
|
|
12388
|
+
hparams.n_embd_s(), n_seqs);
|
|
12124
12389
|
|
|
12125
12390
|
ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
|
|
12126
12391
|
cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0);
|
|
@@ -12133,9 +12398,9 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
|
|
12133
12398
|
wkv_state,
|
|
12134
12399
|
ggml_view_1d(
|
|
12135
12400
|
ctx0,
|
|
12136
|
-
kv_state->
|
|
12137
|
-
hparams.
|
|
12138
|
-
hparams.
|
|
12401
|
+
kv_state->get_s_l(il),
|
|
12402
|
+
hparams.n_embd_s() * n_seqs,
|
|
12403
|
+
hparams.n_embd_s() * kv_head * ggml_element_size(kv_state->get_s_l(il))
|
|
12139
12404
|
)
|
|
12140
12405
|
)
|
|
12141
12406
|
);
|
|
@@ -12176,8 +12441,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
|
|
|
12176
12441
|
inpL = build_inp_embd(model.tok_embd);
|
|
12177
12442
|
inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
|
|
12178
12443
|
|
|
12179
|
-
|
|
12180
|
-
ggml_tensor * state_mask = build_inp_s_mask();
|
|
12444
|
+
auto * rs_inp = build_rs_inp();
|
|
12181
12445
|
|
|
12182
12446
|
const auto n_embd = hparams.n_embd;
|
|
12183
12447
|
const auto n_seq_tokens = ubatch.n_seq_tokens;
|
|
@@ -12187,9 +12451,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
|
|
|
12187
12451
|
const llama_layer * layer = &model.layers[il];
|
|
12188
12452
|
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
|
|
12189
12453
|
|
|
12190
|
-
ggml_tensor * token_shift = build_rwkv_token_shift_load(
|
|
12191
|
-
gf, state_copy, state_mask, ubatch, il
|
|
12192
|
-
);
|
|
12454
|
+
ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
|
|
12193
12455
|
|
|
12194
12456
|
ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
|
|
12195
12457
|
ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
|
|
@@ -12204,7 +12466,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
|
|
|
12204
12466
|
1
|
|
12205
12467
|
);
|
|
12206
12468
|
|
|
12207
|
-
cur = build_rwkv7_time_mix(gf, att_norm, x_prev,
|
|
12469
|
+
cur = build_rwkv7_time_mix(rs_inp, gf, att_norm, x_prev, v_first, ubatch, il);
|
|
12208
12470
|
|
|
12209
12471
|
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
|
12210
12472
|
cb(ffn_inp, "ffn_inp", il);
|
|
@@ -12262,7 +12524,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
|
|
|
12262
12524
|
|
|
12263
12525
|
struct llm_build_arwkv7 : public llm_build_rwkv7_base {
|
|
12264
12526
|
llm_build_arwkv7(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv7_base(model, params) {
|
|
12265
|
-
GGML_ASSERT(n_embd == hparams.
|
|
12527
|
+
GGML_ASSERT(n_embd == hparams.n_embd_r());
|
|
12266
12528
|
|
|
12267
12529
|
ggml_tensor * cur;
|
|
12268
12530
|
ggml_tensor * inpL;
|
|
@@ -12270,8 +12532,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
|
|
|
12270
12532
|
|
|
12271
12533
|
inpL = build_inp_embd(model.tok_embd);
|
|
12272
12534
|
|
|
12273
|
-
|
|
12274
|
-
ggml_tensor * state_mask = build_inp_s_mask();
|
|
12535
|
+
auto * rs_inp = build_rs_inp();
|
|
12275
12536
|
|
|
12276
12537
|
const auto n_embd = hparams.n_embd;
|
|
12277
12538
|
const auto n_seq_tokens = ubatch.n_seq_tokens;
|
|
@@ -12281,9 +12542,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
|
|
|
12281
12542
|
const llama_layer * layer = &model.layers[il];
|
|
12282
12543
|
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
|
|
12283
12544
|
|
|
12284
|
-
ggml_tensor * token_shift = build_rwkv_token_shift_load(
|
|
12285
|
-
gf, state_copy, state_mask, ubatch, il
|
|
12286
|
-
);
|
|
12545
|
+
ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
|
|
12287
12546
|
|
|
12288
12547
|
ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
|
|
12289
12548
|
cb(att_norm, "attn_norm", il);
|
|
@@ -12295,7 +12554,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
|
|
|
12295
12554
|
1
|
|
12296
12555
|
);
|
|
12297
12556
|
|
|
12298
|
-
cur = build_rwkv7_time_mix(gf, att_norm, x_prev,
|
|
12557
|
+
cur = build_rwkv7_time_mix(rs_inp, gf, att_norm, x_prev, v_first, ubatch, il);
|
|
12299
12558
|
|
|
12300
12559
|
token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
|
|
12301
12560
|
ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
|
|
@@ -13187,69 +13446,375 @@ struct llm_build_bailingmoe : public llm_graph_context {
|
|
|
13187
13446
|
}
|
|
13188
13447
|
};
|
|
13189
13448
|
|
|
13190
|
-
|
|
13191
|
-
|
|
13192
|
-
|
|
13193
|
-
switch (arch) {
|
|
13194
|
-
case LLM_ARCH_BERT:
|
|
13195
|
-
case LLM_ARCH_JINA_BERT_V2:
|
|
13196
|
-
case LLM_ARCH_NOMIC_BERT:
|
|
13197
|
-
case LLM_ARCH_NOMIC_BERT_MOE:
|
|
13198
|
-
case LLM_ARCH_WAVTOKENIZER_DEC:
|
|
13199
|
-
{
|
|
13200
|
-
res = nullptr;
|
|
13201
|
-
} break;
|
|
13202
|
-
case LLM_ARCH_MAMBA:
|
|
13203
|
-
case LLM_ARCH_RWKV6:
|
|
13204
|
-
case LLM_ARCH_RWKV6QWEN2:
|
|
13205
|
-
case LLM_ARCH_RWKV7:
|
|
13206
|
-
case LLM_ARCH_ARWKV7:
|
|
13207
|
-
{
|
|
13208
|
-
res = new llama_kv_cache_recurrent(
|
|
13209
|
-
*this,
|
|
13210
|
-
GGML_TYPE_F32,
|
|
13211
|
-
GGML_TYPE_F32,
|
|
13212
|
-
cparams.offload_kqv,
|
|
13213
|
-
std::max((uint32_t) 1, cparams.n_seq_max),
|
|
13214
|
-
cparams.n_seq_max);
|
|
13215
|
-
} break;
|
|
13216
|
-
default:
|
|
13217
|
-
{
|
|
13218
|
-
const auto padding = llama_kv_cache_unified::get_padding(cparams);
|
|
13449
|
+
struct llm_build_dots1 : public llm_graph_context {
|
|
13450
|
+
llm_build_dots1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
13451
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
13219
13452
|
|
|
13220
|
-
|
|
13453
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
13454
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
13221
13455
|
|
|
13222
|
-
|
|
13456
|
+
ggml_tensor * cur;
|
|
13457
|
+
ggml_tensor * inpL;
|
|
13223
13458
|
|
|
13224
|
-
|
|
13225
|
-
GGML_ASSERT(hparams.is_swa_any());
|
|
13459
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
13226
13460
|
|
|
13227
|
-
|
|
13228
|
-
|
|
13229
|
-
|
|
13230
|
-
|
|
13231
|
-
|
|
13232
|
-
|
|
13233
|
-
|
|
13234
|
-
|
|
13235
|
-
|
|
13236
|
-
|
|
13237
|
-
|
|
13238
|
-
|
|
13239
|
-
|
|
13461
|
+
// inp_pos - contains the positions
|
|
13462
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
13463
|
+
|
|
13464
|
+
auto * inp_attn = build_attn_inp_kv_unified();
|
|
13465
|
+
|
|
13466
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
13467
|
+
ggml_tensor * inpSA = inpL;
|
|
13468
|
+
|
|
13469
|
+
// norm
|
|
13470
|
+
cur = build_norm(inpL,
|
|
13471
|
+
model.layers[il].attn_norm, NULL,
|
|
13472
|
+
LLM_NORM_RMS, il);
|
|
13473
|
+
cb(cur, "attn_norm", il);
|
|
13474
|
+
|
|
13475
|
+
// self_attention
|
|
13476
|
+
{
|
|
13477
|
+
// compute Q and K and RoPE them
|
|
13478
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
13479
|
+
cb(Qcur, "Qcur", il);
|
|
13480
|
+
|
|
13481
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
13482
|
+
cb(Kcur, "Kcur", il);
|
|
13483
|
+
|
|
13484
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
13485
|
+
cb(Vcur, "Vcur", il);
|
|
13486
|
+
|
|
13487
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
13488
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
13489
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
13490
|
+
|
|
13491
|
+
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
|
13492
|
+
cb(Qcur, "Qcur_normed", il);
|
|
13493
|
+
|
|
13494
|
+
Qcur = ggml_rope_ext(
|
|
13495
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
13496
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
13497
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
13498
|
+
);
|
|
13499
|
+
|
|
13500
|
+
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
|
13501
|
+
cb(Kcur, "Kcur_normed", il);
|
|
13502
|
+
|
|
13503
|
+
Kcur = ggml_rope_ext(
|
|
13504
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
13505
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
13506
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
13507
|
+
);
|
|
13508
|
+
|
|
13509
|
+
cb(Qcur, "Qcur", il);
|
|
13510
|
+
cb(Kcur, "Kcur", il);
|
|
13511
|
+
cb(Vcur, "Vcur", il);
|
|
13512
|
+
|
|
13513
|
+
cur = build_attn(inp_attn, gf,
|
|
13514
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
13515
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
13516
|
+
}
|
|
13517
|
+
|
|
13518
|
+
if (il == n_layer - 1) {
|
|
13519
|
+
// skip computing output for unused tokens
|
|
13520
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13521
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
13522
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
13523
|
+
}
|
|
13524
|
+
|
|
13525
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
13526
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
13527
|
+
|
|
13528
|
+
// MoE branch
|
|
13529
|
+
cur = build_norm(ffn_inp,
|
|
13530
|
+
model.layers[il].ffn_norm, NULL,
|
|
13531
|
+
LLM_NORM_RMS, il);
|
|
13532
|
+
cb(cur, "ffn_norm", il);
|
|
13533
|
+
|
|
13534
|
+
if ((uint32_t) il < hparams.n_layer_dense_lead) {
|
|
13535
|
+
cur = build_ffn(cur,
|
|
13536
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
13537
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
|
13538
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
13539
|
+
NULL,
|
|
13540
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
13541
|
+
cb(cur, "ffn_out", il);
|
|
13542
|
+
} else {
|
|
13543
|
+
ggml_tensor * moe_out =
|
|
13544
|
+
build_moe_ffn(cur,
|
|
13545
|
+
model.layers[il].ffn_gate_inp,
|
|
13546
|
+
model.layers[il].ffn_up_exps,
|
|
13547
|
+
model.layers[il].ffn_gate_exps,
|
|
13548
|
+
model.layers[il].ffn_down_exps,
|
|
13549
|
+
model.layers[il].ffn_exp_probs_b,
|
|
13550
|
+
n_expert, n_expert_used,
|
|
13551
|
+
LLM_FFN_SILU, hparams.expert_weights_norm,
|
|
13552
|
+
true, hparams.expert_weights_scale,
|
|
13553
|
+
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
|
13554
|
+
il);
|
|
13555
|
+
cb(moe_out, "ffn_moe_out", il);
|
|
13556
|
+
|
|
13557
|
+
{
|
|
13558
|
+
ggml_tensor * ffn_shexp = build_ffn(cur,
|
|
13559
|
+
model.layers[il].ffn_up_shexp, NULL, NULL,
|
|
13560
|
+
model.layers[il].ffn_gate_shexp, NULL, NULL,
|
|
13561
|
+
model.layers[il].ffn_down_shexp, NULL, NULL,
|
|
13562
|
+
NULL,
|
|
13563
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
13564
|
+
cb(ffn_shexp, "ffn_shexp", il);
|
|
13565
|
+
|
|
13566
|
+
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
|
13567
|
+
cb(cur, "ffn_out", il);
|
|
13568
|
+
}
|
|
13569
|
+
}
|
|
13570
|
+
|
|
13571
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
13572
|
+
|
|
13573
|
+
cur = build_cvec(cur, il);
|
|
13574
|
+
cb(cur, "l_out", il);
|
|
13575
|
+
|
|
13576
|
+
// input for next layer
|
|
13577
|
+
inpL = cur;
|
|
13578
|
+
}
|
|
13579
|
+
|
|
13580
|
+
cur = inpL;
|
|
13581
|
+
|
|
13582
|
+
cur = build_norm(cur,
|
|
13583
|
+
model.output_norm, NULL,
|
|
13584
|
+
LLM_NORM_RMS, -1);
|
|
13585
|
+
|
|
13586
|
+
cb(cur, "result_norm", -1);
|
|
13587
|
+
res->t_embd = cur;
|
|
13588
|
+
|
|
13589
|
+
// lm_head
|
|
13590
|
+
cur = build_lora_mm(model.output, cur);
|
|
13591
|
+
|
|
13592
|
+
cb(cur, "result_output", -1);
|
|
13593
|
+
res->t_logits = cur;
|
|
13594
|
+
|
|
13595
|
+
ggml_build_forward_expand(gf, cur);
|
|
13596
|
+
}
|
|
13597
|
+
};
|
|
13598
|
+
|
|
13599
|
+
struct llm_build_arcee : public llm_graph_context {
|
|
13600
|
+
llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
13601
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
13602
|
+
|
|
13603
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
13604
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
13605
|
+
|
|
13606
|
+
ggml_tensor * cur;
|
|
13607
|
+
ggml_tensor * inpL;
|
|
13608
|
+
|
|
13609
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
13610
|
+
|
|
13611
|
+
// inp_pos - contains the positions
|
|
13612
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
13613
|
+
|
|
13614
|
+
auto * inp_attn = build_attn_inp_kv_unified();
|
|
13615
|
+
|
|
13616
|
+
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
13617
|
+
|
|
13618
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
13619
|
+
ggml_tensor * inpSA = inpL;
|
|
13620
|
+
|
|
13621
|
+
// norm
|
|
13622
|
+
cur = build_norm(inpL,
|
|
13623
|
+
model.layers[il].attn_norm, NULL,
|
|
13624
|
+
LLM_NORM_RMS, il);
|
|
13625
|
+
cb(cur, "attn_norm", il);
|
|
13626
|
+
|
|
13627
|
+
// self-attention
|
|
13628
|
+
{
|
|
13629
|
+
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
|
13630
|
+
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
|
13631
|
+
|
|
13632
|
+
// compute Q and K and RoPE them
|
|
13633
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
13634
|
+
cb(Qcur, "Qcur", il);
|
|
13635
|
+
if (model.layers[il].bq) {
|
|
13636
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
13637
|
+
cb(Qcur, "Qcur", il);
|
|
13638
|
+
}
|
|
13639
|
+
|
|
13640
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
13641
|
+
cb(Kcur, "Kcur", il);
|
|
13642
|
+
if (model.layers[il].bk) {
|
|
13643
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
13644
|
+
cb(Kcur, "Kcur", il);
|
|
13645
|
+
}
|
|
13646
|
+
|
|
13647
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
13648
|
+
cb(Vcur, "Vcur", il);
|
|
13649
|
+
if (model.layers[il].bv) {
|
|
13650
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
13651
|
+
cb(Vcur, "Vcur", il);
|
|
13652
|
+
}
|
|
13653
|
+
|
|
13654
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
13655
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
13656
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
13657
|
+
|
|
13658
|
+
Qcur = ggml_rope_ext(
|
|
13659
|
+
ctx0, Qcur, inp_pos, rope_factors,
|
|
13660
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
13661
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
13662
|
+
);
|
|
13240
13663
|
|
|
13241
|
-
|
|
13664
|
+
Kcur = ggml_rope_ext(
|
|
13665
|
+
ctx0, Kcur, inp_pos, rope_factors,
|
|
13666
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
13667
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
13668
|
+
);
|
|
13669
|
+
|
|
13670
|
+
cb(Qcur, "Qcur", il);
|
|
13671
|
+
cb(Kcur, "Kcur", il);
|
|
13672
|
+
cb(Vcur, "Vcur", il);
|
|
13673
|
+
|
|
13674
|
+
cur = build_attn(inp_attn, gf,
|
|
13675
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
13676
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
13677
|
+
cb(cur, "attn_out", il);
|
|
13678
|
+
}
|
|
13679
|
+
|
|
13680
|
+
if (il == n_layer - 1) {
|
|
13681
|
+
// skip computing output for unused tokens
|
|
13682
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13683
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
13684
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
13685
|
+
}
|
|
13686
|
+
|
|
13687
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
13688
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
13689
|
+
|
|
13690
|
+
// feed-forward network
|
|
13691
|
+
// ARCEE uses relu^2 instead of silu
|
|
13692
|
+
cur = build_norm(ffn_inp,
|
|
13693
|
+
model.layers[il].ffn_norm, NULL,
|
|
13694
|
+
LLM_NORM_RMS, il);
|
|
13695
|
+
cb(cur, "ffn_norm", il);
|
|
13696
|
+
|
|
13697
|
+
cur = build_ffn(cur,
|
|
13698
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
13699
|
+
NULL, NULL, NULL,
|
|
13700
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
13701
|
+
NULL,
|
|
13702
|
+
LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
|
|
13703
|
+
cb(cur, "ffn_out", il);
|
|
13704
|
+
|
|
13705
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
13706
|
+
cb(cur, "ffn_out", il);
|
|
13707
|
+
|
|
13708
|
+
cur = build_cvec(cur, il);
|
|
13709
|
+
cb(cur, "l_out", il);
|
|
13710
|
+
|
|
13711
|
+
// input for next layer
|
|
13712
|
+
inpL = cur;
|
|
13713
|
+
}
|
|
13714
|
+
|
|
13715
|
+
cur = inpL;
|
|
13716
|
+
|
|
13717
|
+
cur = build_norm(cur,
|
|
13718
|
+
model.output_norm, NULL,
|
|
13719
|
+
LLM_NORM_RMS, -1);
|
|
13720
|
+
|
|
13721
|
+
cb(cur, "result_norm", -1);
|
|
13722
|
+
res->t_embd = cur;
|
|
13723
|
+
|
|
13724
|
+
// lm_head
|
|
13725
|
+
cur = build_lora_mm(model.output, cur);
|
|
13726
|
+
|
|
13727
|
+
cb(cur, "result_output", -1);
|
|
13728
|
+
res->t_logits = cur;
|
|
13729
|
+
|
|
13730
|
+
ggml_build_forward_expand(gf, cur);
|
|
13731
|
+
}
|
|
13732
|
+
};
|
|
13733
|
+
|
|
13734
|
+
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
|
|
13735
|
+
llama_memory_i * res;
|
|
13736
|
+
|
|
13737
|
+
switch (arch) {
|
|
13738
|
+
// Models that need specific instantiation should be handled in the
|
|
13739
|
+
// switch statement
|
|
13740
|
+
case LLM_ARCH_BERT:
|
|
13741
|
+
case LLM_ARCH_JINA_BERT_V2:
|
|
13742
|
+
case LLM_ARCH_NOMIC_BERT:
|
|
13743
|
+
case LLM_ARCH_NOMIC_BERT_MOE:
|
|
13744
|
+
case LLM_ARCH_NEO_BERT:
|
|
13745
|
+
case LLM_ARCH_WAVTOKENIZER_DEC:
|
|
13746
|
+
{
|
|
13747
|
+
res = nullptr;
|
|
13748
|
+
} break;
|
|
13749
|
+
// Models that need standard caching should rely on recurrent/hybrid
|
|
13750
|
+
// checks
|
|
13751
|
+
default:
|
|
13752
|
+
{
|
|
13753
|
+
if (llm_arch_is_recurrent(arch)) {
|
|
13754
|
+
res = new llama_memory_recurrent(
|
|
13242
13755
|
*this,
|
|
13243
13756
|
nullptr,
|
|
13244
|
-
|
|
13245
|
-
|
|
13246
|
-
!cparams.flash_attn,
|
|
13757
|
+
GGML_TYPE_F32,
|
|
13758
|
+
GGML_TYPE_F32,
|
|
13247
13759
|
cparams.offload_kqv,
|
|
13248
|
-
cparams.
|
|
13249
|
-
cparams.n_seq_max
|
|
13250
|
-
|
|
13251
|
-
|
|
13252
|
-
|
|
13760
|
+
std::max((uint32_t) 1, cparams.n_seq_max),
|
|
13761
|
+
cparams.n_seq_max);
|
|
13762
|
+
} else if (llm_arch_is_hybrid(arch)) {
|
|
13763
|
+
const auto padding = llama_kv_cache_unified::get_padding(cparams);
|
|
13764
|
+
|
|
13765
|
+
cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
|
|
13766
|
+
|
|
13767
|
+
res = new llama_memory_hybrid(
|
|
13768
|
+
/* model */ *this,
|
|
13769
|
+
/* attn_type_k */ params.type_k,
|
|
13770
|
+
/* attn_type_v */ params.type_v,
|
|
13771
|
+
/* attn_v_trans */ !cparams.flash_attn,
|
|
13772
|
+
/* attn_kv_size */ cparams.n_ctx,
|
|
13773
|
+
/* attn_n_pad */ padding,
|
|
13774
|
+
/* attn_n_swa */ hparams.n_swa,
|
|
13775
|
+
/* attn_swa_type */ hparams.swa_type,
|
|
13776
|
+
/* recurrent_type_k */ GGML_TYPE_F32,
|
|
13777
|
+
/* recurrent_type_v */ GGML_TYPE_F32,
|
|
13778
|
+
/* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
|
|
13779
|
+
/* n_seq_max */ cparams.n_seq_max,
|
|
13780
|
+
/* offload */ cparams.offload_kqv);
|
|
13781
|
+
} else {
|
|
13782
|
+
const auto padding = llama_kv_cache_unified::get_padding(cparams);
|
|
13783
|
+
|
|
13784
|
+
cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
|
|
13785
|
+
|
|
13786
|
+
LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
|
|
13787
|
+
|
|
13788
|
+
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
|
|
13789
|
+
GGML_ASSERT(hparams.is_swa_any());
|
|
13790
|
+
|
|
13791
|
+
res = new llama_kv_cache_unified_iswa(
|
|
13792
|
+
*this,
|
|
13793
|
+
params.type_k,
|
|
13794
|
+
params.type_v,
|
|
13795
|
+
!cparams.flash_attn,
|
|
13796
|
+
cparams.offload_kqv,
|
|
13797
|
+
params.swa_full,
|
|
13798
|
+
cparams.n_ctx,
|
|
13799
|
+
cparams.n_seq_max,
|
|
13800
|
+
cparams.n_ubatch,
|
|
13801
|
+
padding);
|
|
13802
|
+
} else {
|
|
13803
|
+
GGML_ASSERT(!hparams.is_swa_any());
|
|
13804
|
+
|
|
13805
|
+
res = new llama_kv_cache_unified(
|
|
13806
|
+
*this,
|
|
13807
|
+
nullptr,
|
|
13808
|
+
params.type_k,
|
|
13809
|
+
params.type_v,
|
|
13810
|
+
!cparams.flash_attn,
|
|
13811
|
+
cparams.offload_kqv,
|
|
13812
|
+
cparams.n_ctx,
|
|
13813
|
+
cparams.n_seq_max,
|
|
13814
|
+
padding,
|
|
13815
|
+
hparams.n_swa,
|
|
13816
|
+
hparams.swa_type);
|
|
13817
|
+
}
|
|
13253
13818
|
}
|
|
13254
13819
|
}
|
|
13255
13820
|
}
|
|
@@ -13303,6 +13868,10 @@ llm_graph_result_ptr llama_model::build_graph(
|
|
|
13303
13868
|
{
|
|
13304
13869
|
llm = std::make_unique<llm_build_bert>(*this, params, gf);
|
|
13305
13870
|
} break;
|
|
13871
|
+
case LLM_ARCH_NEO_BERT:
|
|
13872
|
+
{
|
|
13873
|
+
llm = std::make_unique<llm_build_neo_bert>(*this, params, gf);
|
|
13874
|
+
} break;
|
|
13306
13875
|
case LLM_ARCH_BLOOM:
|
|
13307
13876
|
{
|
|
13308
13877
|
llm = std::make_unique<llm_build_bloom>(*this, params, gf);
|
|
@@ -13525,6 +14094,14 @@ llm_graph_result_ptr llama_model::build_graph(
|
|
|
13525
14094
|
{
|
|
13526
14095
|
llm = std::make_unique<llm_build_bailingmoe>(*this, params, gf);
|
|
13527
14096
|
} break;
|
|
14097
|
+
case LLM_ARCH_DOTS1:
|
|
14098
|
+
{
|
|
14099
|
+
llm = std::make_unique<llm_build_dots1>(*this, params, gf);
|
|
14100
|
+
} break;
|
|
14101
|
+
case LLM_ARCH_ARCEE:
|
|
14102
|
+
{
|
|
14103
|
+
llm = std::make_unique<llm_build_arcee>(*this, params, gf);
|
|
14104
|
+
} break;
|
|
13528
14105
|
default:
|
|
13529
14106
|
GGML_ABORT("fatal error");
|
|
13530
14107
|
}
|
|
@@ -13600,6 +14177,18 @@ int32_t llama_model_n_swa(const llama_model * model) {
|
|
|
13600
14177
|
return model->hparams.n_swa;
|
|
13601
14178
|
}
|
|
13602
14179
|
|
|
14180
|
+
// Accessor: returns the `n_cls_out` field of the model's hparams as a uint32_t.
// `model` is dereferenced unconditionally, so callers must pass a non-null pointer.
// NOTE(review): presumably this is the number of classifier output values — confirm
// against the hparams declaration, which is not visible in this hunk.
uint32_t llama_model_n_cls_out(const struct llama_model * model) {
|
|
14181
|
+
return model->hparams.n_cls_out;
|
|
14182
|
+
}
|
|
14183
|
+
|
|
14184
|
+
// Looks up the i-th entry of `model->classifier_labels` and returns it as a C string.
//
// Returns nullptr when `i` is not a valid index (i >= classifier_labels.size()),
// which also covers the case where the label list is empty.
// The returned pointer comes from c_str() on the stored label, so it refers to
// storage held by the model rather than to a copy made for the caller.
// `model` is dereferenced unconditionally, so callers must pass a non-null pointer.
const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) {
|
|
14185
|
+
if (i < model->classifier_labels.size()) { // bounds check before indexing
|
|
14186
|
+
return model->classifier_labels[i].c_str();
|
|
14187
|
+
}
|
|
14188
|
+
|
|
14189
|
+
return nullptr; // index out of range: no label available
|
|
14190
|
+
}
|
|
14191
|
+
|
|
13603
14192
|
// deprecated
|
|
13604
14193
|
int32_t llama_n_ctx_train(const llama_model * model) {
|
|
13605
14194
|
return llama_model_n_ctx_train(model);
|
|
@@ -13662,6 +14251,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
|
13662
14251
|
case LLM_ARCH_GRANITE_MOE:
|
|
13663
14252
|
case LLM_ARCH_CHAMELEON:
|
|
13664
14253
|
case LLM_ARCH_BAILINGMOE:
|
|
14254
|
+
case LLM_ARCH_NEO_BERT:
|
|
14255
|
+
case LLM_ARCH_ARCEE:
|
|
13665
14256
|
return LLAMA_ROPE_TYPE_NORM;
|
|
13666
14257
|
|
|
13667
14258
|
// the pairs of head values are offset by n_rot/2
|
|
@@ -13695,6 +14286,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
|
13695
14286
|
case LLM_ARCH_NEMOTRON:
|
|
13696
14287
|
case LLM_ARCH_EXAONE:
|
|
13697
14288
|
case LLM_ARCH_MINICPM3:
|
|
14289
|
+
case LLM_ARCH_DOTS1:
|
|
13698
14290
|
return LLAMA_ROPE_TYPE_NEOX;
|
|
13699
14291
|
|
|
13700
14292
|
case LLM_ARCH_QWEN2VL:
|
|
@@ -13760,7 +14352,7 @@ uint64_t llama_model_size(const llama_model * model) {
|
|
|
13760
14352
|
}
|
|
13761
14353
|
|
|
13762
14354
|
const char * llama_model_chat_template(const llama_model * model, const char * name) {
|
|
13763
|
-
const auto key = name ? LLM_KV(model->arch, name)(
|
|
14355
|
+
const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
|
|
13764
14356
|
: LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
|
|
13765
14357
|
const auto & it = model->gguf_kv.find(key);
|
|
13766
14358
|
if (it == model->gguf_kv.end()) {
|
|
@@ -13802,14 +14394,7 @@ llama_token llama_model_decoder_start_token(const llama_model * model) {
|
|
|
13802
14394
|
}
|
|
13803
14395
|
|
|
13804
14396
|
// Reports whether the model's architecture is recurrent.
//
// In the new version (the line marked `+` below) the answer is obtained by calling
// llm_arch_is_recurrent(model->arch). The lines marked `-` are the removed
// implementation: per-architecture `case` arms that returned true for MAMBA, RWKV6,
// RWKV6QWEN2, RWKV7 and ARWKV7, and false by default.
// NOTE(review): whether llm_arch_is_recurrent covers exactly the same set of
// architectures is not visible in this hunk — confirm against its definition.
bool llama_model_is_recurrent(const llama_model * model) {
|
|
13805
|
-
|
|
13806
|
-
case LLM_ARCH_MAMBA: return true;
|
|
13807
|
-
case LLM_ARCH_RWKV6: return true;
|
|
13808
|
-
case LLM_ARCH_RWKV6QWEN2: return true;
|
|
13809
|
-
case LLM_ARCH_RWKV7: return true;
|
|
13810
|
-
case LLM_ARCH_ARWKV7: return true;
|
|
13811
|
-
default: return false;
|
|
13812
|
-
}
|
|
14397
|
+
return llm_arch_is_recurrent(model->arch);
|
|
13813
14398
|
}
|
|
13814
14399
|
|
|
13815
14400
|
const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
|