@novastera-oss/llamarn 0.2.7 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +8 -3
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +56 -22
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/common/arg.cpp +7 -0
- package/cpp/llama.cpp/common/common.cpp +3 -0
- package/cpp/llama.cpp/common/common.h +1 -0
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/convert_hf_to_gguf.py +118 -20
- package/cpp/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +83 -102
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +192 -67
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +54 -29
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +84 -31
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +227 -41
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +362 -182
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +240 -535
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +45 -54
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +57 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +69 -13
- package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +76 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +21 -0
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +64 -0
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +8 -3
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +55 -0
- package/cpp/llama.cpp/src/llama-arch.h +18 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +570 -359
- package/cpp/llama.cpp/src/llama-batch.h +98 -70
- package/cpp/llama.cpp/src/llama-chat.cpp +11 -6
- package/cpp/llama.cpp/src/llama-context.cpp +101 -107
- package/cpp/llama.cpp/src/llama-context.h +13 -13
- package/cpp/llama.cpp/src/llama-graph.cpp +199 -252
- package/cpp/llama.cpp/src/llama-graph.h +44 -32
- package/cpp/llama.cpp/src/llama-hparams.cpp +4 -0
- package/cpp/llama.cpp/src/llama-hparams.h +8 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +51 -53
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +19 -24
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +110 -104
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +17 -22
- package/cpp/llama.cpp/src/llama-kv-cells.h +35 -11
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +66 -67
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +16 -21
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +69 -68
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
- package/cpp/llama.cpp/src/llama-memory.h +18 -22
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1006 -472
- package/cpp/llama.cpp/src/llama-model.h +22 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +87 -5
- package/cpp/llama.cpp/src/llama-vocab.cpp +26 -3
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/rn-utils.h +3 -0
- package/ios/include/common.h +1 -0
- package/ios/include/llama.h +8 -3
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3744
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4900
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4871
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3773
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
|
@@ -103,6 +103,8 @@ const char * llm_type_name(llm_type type) {
|
|
|
103
103
|
case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
|
|
104
104
|
case LLM_TYPE_30B_A3B: return "30B.A3B";
|
|
105
105
|
case LLM_TYPE_235B_A22B: return "235B.A22B";
|
|
106
|
+
case LLM_TYPE_E2B: return "E2B";
|
|
107
|
+
case LLM_TYPE_E4B: return "E4B";
|
|
106
108
|
default: return "?B";
|
|
107
109
|
}
|
|
108
110
|
}
|
|
@@ -1017,6 +1019,24 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1017
1019
|
? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
|
|
1018
1020
|
: 1.0f / std::sqrt(float(hparams.n_embd_head_k));
|
|
1019
1021
|
} break;
|
|
1022
|
+
case LLM_ARCH_GEMMA3N:
|
|
1023
|
+
{
|
|
1024
|
+
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
|
1025
|
+
hparams.set_swa_pattern(5);
|
|
1026
|
+
|
|
1027
|
+
hparams.rope_freq_base_train_swa = 10000.0f;
|
|
1028
|
+
hparams.rope_freq_scale_train_swa = 1.0f;
|
|
1029
|
+
hparams.f_attention_scale = 1.0f;
|
|
1030
|
+
|
|
1031
|
+
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
|
1032
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1033
|
+
|
|
1034
|
+
switch (hparams.n_layer) {
|
|
1035
|
+
case 30: type = LLM_TYPE_E2B; break;
|
|
1036
|
+
case 35: type = LLM_TYPE_E4B; break;
|
|
1037
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1038
|
+
}
|
|
1039
|
+
} break;
|
|
1020
1040
|
case LLM_ARCH_STARCODER2:
|
|
1021
1041
|
{
|
|
1022
1042
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
@@ -2950,6 +2970,62 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2950
2970
|
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
|
|
2951
2971
|
}
|
|
2952
2972
|
} break;
|
|
2973
|
+
case LLM_ARCH_GEMMA3N:
|
|
2974
|
+
{
|
|
2975
|
+
const int64_t n_altup = hparams.n_altup;
|
|
2976
|
+
const int64_t laurel_rank = hparams.laurel_rank;
|
|
2977
|
+
const int64_t n_embd_altup = hparams.n_embd_altup;
|
|
2978
|
+
|
|
2979
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
2980
|
+
// if output is NULL, init from the input tok embed
|
|
2981
|
+
if (output == NULL) {
|
|
2982
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
2983
|
+
}
|
|
2984
|
+
|
|
2985
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2986
|
+
tok_embd_per_layer = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_altup * n_layer, n_vocab}, 0);
|
|
2987
|
+
|
|
2988
|
+
altup_proj = create_tensor(tn(LLM_TENSOR_ALTUP_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
|
|
2989
|
+
altup_unembd_proj = create_tensor(tn(LLM_TENSOR_ALTUP_UNEMBD_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
|
|
2990
|
+
per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight"), {n_embd, n_embd_altup * n_layer}, 0);
|
|
2991
|
+
per_layer_proj_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM, "weight"), {n_embd_altup}, 0);
|
|
2992
|
+
|
|
2993
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2994
|
+
|
|
2995
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2996
|
+
auto & layer = layers[i];
|
|
2997
|
+
|
|
2998
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2999
|
+
|
|
3000
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
3001
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
3002
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
|
3003
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
|
3004
|
+
|
|
3005
|
+
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
3006
|
+
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
3007
|
+
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
|
|
3008
|
+
|
|
3009
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
3010
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
3011
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
3012
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
3013
|
+
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
|
|
3014
|
+
|
|
3015
|
+
// altup & laurel
|
|
3016
|
+
layer.per_layer_inp_gate = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE, "weight", i), {n_embd, n_embd_altup}, 0);
|
|
3017
|
+
layer.per_layer_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ, "weight", i), {n_embd_altup, n_embd}, 0);
|
|
3018
|
+
layer.per_layer_post_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
|
|
3019
|
+
layer.altup_correct_coef = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_COEF, "weight", i), {n_altup, n_altup}, 0);
|
|
3020
|
+
layer.altup_correct_scale = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_SCALE, "weight", i), {n_embd}, 0);
|
|
3021
|
+
layer.altup_predict_coef = create_tensor(tn(LLM_TENSOR_ALTUP_PREDICT_COEF, "weight", i), {n_altup, n_altup * n_altup}, 0);
|
|
3022
|
+
layer.altup_router = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER, "weight", i), {n_embd, n_altup}, 0);
|
|
3023
|
+
layer.altup_router_norm = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER_NORM, "weight", i), {n_embd}, 0);
|
|
3024
|
+
layer.laurel_l = create_tensor(tn(LLM_TENSOR_LAUREL_L, "weight", i), {n_embd, laurel_rank}, 0);
|
|
3025
|
+
layer.laurel_r = create_tensor(tn(LLM_TENSOR_LAUREL_R, "weight", i), {laurel_rank, n_embd}, 0);
|
|
3026
|
+
layer.laurel_post_norm = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM, "weight", i), {n_embd}, 0);
|
|
3027
|
+
}
|
|
3028
|
+
} break;
|
|
2953
3029
|
case LLM_ARCH_STARCODER2:
|
|
2954
3030
|
{
|
|
2955
3031
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
@@ -4707,6 +4783,8 @@ struct llm_build_llama : public llm_graph_context {
|
|
|
4707
4783
|
|
|
4708
4784
|
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
4709
4785
|
|
|
4786
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
4787
|
+
|
|
4710
4788
|
for (int il = 0; il < n_layer; ++il) {
|
|
4711
4789
|
ggml_tensor * inpSA = inpL;
|
|
4712
4790
|
|
|
@@ -4769,9 +4847,7 @@ struct llm_build_llama : public llm_graph_context {
|
|
|
4769
4847
|
cb(cur, "attn_out", il);
|
|
4770
4848
|
}
|
|
4771
4849
|
|
|
4772
|
-
if (il == n_layer - 1) {
|
|
4773
|
-
// skip computing output for unused tokens
|
|
4774
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
4850
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
4775
4851
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
4776
4852
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
4777
4853
|
}
|
|
@@ -4867,6 +4943,8 @@ struct llm_build_llama_iswa : public llm_graph_context {
|
|
|
4867
4943
|
|
|
4868
4944
|
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
4869
4945
|
|
|
4946
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
4947
|
+
|
|
4870
4948
|
for (int il = 0; il < n_layer; ++il) {
|
|
4871
4949
|
ggml_tensor * inpSA = inpL;
|
|
4872
4950
|
|
|
@@ -4943,9 +5021,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
|
|
|
4943
5021
|
cb(cur, "attn_out", il);
|
|
4944
5022
|
}
|
|
4945
5023
|
|
|
4946
|
-
if (il == n_layer - 1) {
|
|
4947
|
-
// skip computing output for unused tokens
|
|
4948
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5024
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
4949
5025
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
4950
5026
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
4951
5027
|
}
|
|
@@ -5045,6 +5121,9 @@ struct llm_build_deci : public llm_graph_context {
|
|
|
5045
5121
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
5046
5122
|
|
|
5047
5123
|
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
5124
|
+
|
|
5125
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5126
|
+
|
|
5048
5127
|
for (int il = 0; il < n_layer; ++il) {
|
|
5049
5128
|
ggml_tensor * inpSA = inpL;
|
|
5050
5129
|
const int64_t n_head_kv = hparams.n_head_kv(il);
|
|
@@ -5118,9 +5197,7 @@ struct llm_build_deci : public llm_graph_context {
|
|
|
5118
5197
|
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
5119
5198
|
}
|
|
5120
5199
|
|
|
5121
|
-
if (il == n_layer - 1) {
|
|
5122
|
-
// skip computing output for unused tokens
|
|
5123
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5200
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
5124
5201
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
5125
5202
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
5126
5203
|
}
|
|
@@ -5199,6 +5276,8 @@ struct llm_build_baichuan : public llm_graph_context {
|
|
|
5199
5276
|
|
|
5200
5277
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
5201
5278
|
|
|
5279
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5280
|
+
|
|
5202
5281
|
for (int il = 0; il < n_layer; ++il) {
|
|
5203
5282
|
ggml_tensor * inpSA = inpL;
|
|
5204
5283
|
|
|
@@ -5250,9 +5329,7 @@ struct llm_build_baichuan : public llm_graph_context {
|
|
|
5250
5329
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5251
5330
|
}
|
|
5252
5331
|
|
|
5253
|
-
if (il == n_layer - 1) {
|
|
5254
|
-
// skip computing output for unused tokens
|
|
5255
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5332
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
5256
5333
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
5257
5334
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
5258
5335
|
}
|
|
@@ -5321,6 +5398,8 @@ struct llm_build_xverse : public llm_graph_context {
|
|
|
5321
5398
|
|
|
5322
5399
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
5323
5400
|
|
|
5401
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5402
|
+
|
|
5324
5403
|
for (int il = 0; il < n_layer; ++il) {
|
|
5325
5404
|
ggml_tensor * inpSA = inpL;
|
|
5326
5405
|
|
|
@@ -5365,9 +5444,7 @@ struct llm_build_xverse : public llm_graph_context {
|
|
|
5365
5444
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5366
5445
|
}
|
|
5367
5446
|
|
|
5368
|
-
if (il == n_layer - 1) {
|
|
5369
|
-
// skip computing output for unused tokens
|
|
5370
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5447
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
5371
5448
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
5372
5449
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
5373
5450
|
}
|
|
@@ -5435,6 +5512,8 @@ struct llm_build_falcon : public llm_graph_context {
|
|
|
5435
5512
|
|
|
5436
5513
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
5437
5514
|
|
|
5515
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5516
|
+
|
|
5438
5517
|
for (int il = 0; il < n_layer; ++il) {
|
|
5439
5518
|
ggml_tensor * attn_norm;
|
|
5440
5519
|
|
|
@@ -5490,9 +5569,7 @@ struct llm_build_falcon : public llm_graph_context {
|
|
|
5490
5569
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5491
5570
|
}
|
|
5492
5571
|
|
|
5493
|
-
if (il == n_layer - 1) {
|
|
5494
|
-
// skip computing output for unused tokens
|
|
5495
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5572
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
5496
5573
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
5497
5574
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
5498
5575
|
attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
|
|
@@ -5561,6 +5638,8 @@ struct llm_build_grok : public llm_graph_context {
|
|
|
5561
5638
|
|
|
5562
5639
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
5563
5640
|
|
|
5641
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5642
|
+
|
|
5564
5643
|
for (int il = 0; il < n_layer; ++il) {
|
|
5565
5644
|
ggml_tensor * inpSA = inpL;
|
|
5566
5645
|
|
|
@@ -5620,9 +5699,7 @@ struct llm_build_grok : public llm_graph_context {
|
|
|
5620
5699
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
5621
5700
|
}
|
|
5622
5701
|
|
|
5623
|
-
if (il == n_layer - 1) {
|
|
5624
|
-
// skip computing output for unused tokens
|
|
5625
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5702
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
5626
5703
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
5627
5704
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
5628
5705
|
}
|
|
@@ -5721,6 +5798,8 @@ struct llm_build_dbrx : public llm_graph_context {
|
|
|
5721
5798
|
|
|
5722
5799
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
5723
5800
|
|
|
5801
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5802
|
+
|
|
5724
5803
|
for (int il = 0; il < n_layer; ++il) {
|
|
5725
5804
|
ggml_tensor * inpSA = inpL;
|
|
5726
5805
|
|
|
@@ -5771,9 +5850,7 @@ struct llm_build_dbrx : public llm_graph_context {
|
|
|
5771
5850
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5772
5851
|
}
|
|
5773
5852
|
|
|
5774
|
-
if (il == n_layer - 1) {
|
|
5775
|
-
// skip computing output for unused tokens
|
|
5776
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5853
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
5777
5854
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
5778
5855
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
5779
5856
|
}
|
|
@@ -5853,6 +5930,8 @@ struct llm_build_starcoder : public llm_graph_context {
|
|
|
5853
5930
|
inpL = ggml_add(ctx0, inpL, pos);
|
|
5854
5931
|
cb(inpL, "inpL", -1);
|
|
5855
5932
|
|
|
5933
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5934
|
+
|
|
5856
5935
|
for (int il = 0; il < n_layer; ++il) {
|
|
5857
5936
|
cur = build_norm(inpL,
|
|
5858
5937
|
model.layers[il].attn_norm,
|
|
@@ -5885,9 +5964,7 @@ struct llm_build_starcoder : public llm_graph_context {
|
|
|
5885
5964
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5886
5965
|
}
|
|
5887
5966
|
|
|
5888
|
-
if (il == n_layer - 1) {
|
|
5889
|
-
// skip computing output for unused tokens
|
|
5890
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
5967
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
5891
5968
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
5892
5969
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
5893
5970
|
}
|
|
@@ -5952,6 +6029,8 @@ struct llm_build_refact : public llm_graph_context {
|
|
|
5952
6029
|
|
|
5953
6030
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
5954
6031
|
|
|
6032
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6033
|
+
|
|
5955
6034
|
for (int il = 0; il < n_layer; ++il) {
|
|
5956
6035
|
ggml_tensor * inpSA = inpL;
|
|
5957
6036
|
|
|
@@ -5984,9 +6063,7 @@ struct llm_build_refact : public llm_graph_context {
|
|
|
5984
6063
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5985
6064
|
}
|
|
5986
6065
|
|
|
5987
|
-
if (il == n_layer - 1) {
|
|
5988
|
-
// skip computing output for unused tokens
|
|
5989
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6066
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
5990
6067
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
5991
6068
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
5992
6069
|
}
|
|
@@ -6072,78 +6149,79 @@ struct llm_build_bert : public llm_graph_context {
|
|
|
6072
6149
|
|
|
6073
6150
|
auto * inp_attn = build_attn_inp_no_cache();
|
|
6074
6151
|
|
|
6075
|
-
|
|
6152
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6153
|
+
|
|
6076
6154
|
for (int il = 0; il < n_layer; ++il) {
|
|
6077
6155
|
ggml_tensor * cur = inpL;
|
|
6078
6156
|
|
|
6079
|
-
|
|
6080
|
-
|
|
6081
|
-
|
|
6157
|
+
{
|
|
6158
|
+
ggml_tensor * Qcur;
|
|
6159
|
+
ggml_tensor * Kcur;
|
|
6160
|
+
ggml_tensor * Vcur;
|
|
6082
6161
|
|
|
6083
|
-
|
|
6084
|
-
|
|
6085
|
-
|
|
6086
|
-
|
|
6162
|
+
// self-attention
|
|
6163
|
+
if (model.layers[il].wqkv) {
|
|
6164
|
+
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
|
6165
|
+
cb(cur, "wqkv", il);
|
|
6087
6166
|
|
|
6088
|
-
|
|
6089
|
-
|
|
6090
|
-
|
|
6091
|
-
|
|
6167
|
+
if (model.layers[il].bqkv) {
|
|
6168
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
6169
|
+
cb(cur, "bqkv", il);
|
|
6170
|
+
}
|
|
6092
6171
|
|
|
6093
|
-
|
|
6094
|
-
|
|
6095
|
-
|
|
6096
|
-
|
|
6097
|
-
|
|
6098
|
-
|
|
6099
|
-
|
|
6100
|
-
|
|
6172
|
+
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
|
6173
|
+
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
|
6174
|
+
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
6175
|
+
} else {
|
|
6176
|
+
Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
|
|
6177
|
+
Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
|
|
6178
|
+
Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
|
|
6179
|
+
}
|
|
6101
6180
|
|
|
6102
|
-
|
|
6103
|
-
|
|
6104
|
-
|
|
6105
|
-
|
|
6106
|
-
|
|
6107
|
-
|
|
6181
|
+
if (model.layers[il].attn_q_norm) {
|
|
6182
|
+
Qcur = build_norm(Qcur,
|
|
6183
|
+
model.layers[il].attn_q_norm,
|
|
6184
|
+
model.layers[il].attn_q_norm_b,
|
|
6185
|
+
LLM_NORM, il);
|
|
6186
|
+
}
|
|
6108
6187
|
|
|
6109
|
-
|
|
6110
|
-
|
|
6111
|
-
|
|
6112
|
-
|
|
6113
|
-
|
|
6114
|
-
|
|
6188
|
+
if (model.layers[il].attn_k_norm) {
|
|
6189
|
+
Kcur = build_norm(Kcur,
|
|
6190
|
+
model.layers[il].attn_k_norm,
|
|
6191
|
+
model.layers[il].attn_k_norm_b,
|
|
6192
|
+
LLM_NORM, il);
|
|
6193
|
+
}
|
|
6115
6194
|
|
|
6116
|
-
|
|
6117
|
-
|
|
6118
|
-
|
|
6195
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
6196
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
6197
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
6119
6198
|
|
|
6120
|
-
|
|
6121
|
-
|
|
6122
|
-
|
|
6123
|
-
|
|
6124
|
-
|
|
6125
|
-
|
|
6126
|
-
|
|
6199
|
+
// RoPE
|
|
6200
|
+
if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
|
|
6201
|
+
Qcur = ggml_rope_ext(
|
|
6202
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
6203
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
6204
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6205
|
+
);
|
|
6127
6206
|
|
|
6128
|
-
|
|
6129
|
-
|
|
6130
|
-
|
|
6131
|
-
|
|
6132
|
-
|
|
6133
|
-
|
|
6207
|
+
Kcur = ggml_rope_ext(
|
|
6208
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
6209
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
6210
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6211
|
+
);
|
|
6212
|
+
}
|
|
6134
6213
|
|
|
6135
|
-
|
|
6136
|
-
|
|
6137
|
-
|
|
6214
|
+
cb(Qcur, "Qcur", il);
|
|
6215
|
+
cb(Kcur, "Kcur", il);
|
|
6216
|
+
cb(Vcur, "Vcur", il);
|
|
6138
6217
|
|
|
6139
|
-
|
|
6140
|
-
|
|
6141
|
-
|
|
6142
|
-
|
|
6218
|
+
cur = build_attn(inp_attn, gf,
|
|
6219
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
6220
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6221
|
+
cb(cur, "kqv_out", il);
|
|
6222
|
+
}
|
|
6143
6223
|
|
|
6144
|
-
if (il == n_layer - 1 &&
|
|
6145
|
-
// skip computing output for unused tokens
|
|
6146
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6224
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
6147
6225
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
6148
6226
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
6149
6227
|
}
|
|
@@ -6240,56 +6318,57 @@ struct llm_build_neo_bert : public llm_graph_context {
|
|
|
6240
6318
|
|
|
6241
6319
|
auto * inp_attn = build_attn_inp_no_cache();
|
|
6242
6320
|
|
|
6243
|
-
|
|
6321
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6322
|
+
|
|
6244
6323
|
for (int il = 0; il < n_layer; ++il) {
|
|
6245
6324
|
ggml_tensor * cur = inpL;
|
|
6246
6325
|
|
|
6247
|
-
ggml_tensor * Qcur;
|
|
6248
|
-
ggml_tensor * Kcur;
|
|
6249
|
-
ggml_tensor * Vcur;
|
|
6250
|
-
|
|
6251
6326
|
// pre-norm
|
|
6252
6327
|
cur = build_norm(inpL,
|
|
6253
6328
|
model.layers[il].attn_norm, NULL,
|
|
6254
6329
|
LLM_NORM_RMS, il);
|
|
6255
6330
|
|
|
6256
|
-
|
|
6257
|
-
|
|
6258
|
-
|
|
6259
|
-
|
|
6260
|
-
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
|
6261
|
-
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
|
6262
|
-
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
6263
|
-
|
|
6264
|
-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
6265
|
-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
6266
|
-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
6267
|
-
|
|
6268
|
-
// RoPE
|
|
6269
|
-
Qcur = ggml_rope_ext(
|
|
6270
|
-
ctx0, Qcur, inp_pos, nullptr,
|
|
6271
|
-
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
6272
|
-
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6273
|
-
);
|
|
6331
|
+
{
|
|
6332
|
+
ggml_tensor * Qcur;
|
|
6333
|
+
ggml_tensor * Kcur;
|
|
6334
|
+
ggml_tensor * Vcur;
|
|
6274
6335
|
|
|
6275
|
-
|
|
6276
|
-
|
|
6277
|
-
|
|
6278
|
-
|
|
6279
|
-
|
|
6336
|
+
// self-attention
|
|
6337
|
+
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
|
6338
|
+
cb(cur, "wqkv", il);
|
|
6339
|
+
|
|
6340
|
+
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
|
6341
|
+
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
|
6342
|
+
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
6343
|
+
|
|
6344
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
6345
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
6346
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
6347
|
+
|
|
6348
|
+
// RoPE
|
|
6349
|
+
Qcur = ggml_rope_ext(
|
|
6350
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
6351
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
6352
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6353
|
+
);
|
|
6354
|
+
|
|
6355
|
+
Kcur = ggml_rope_ext(
|
|
6356
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
6357
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
6358
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6359
|
+
);
|
|
6280
6360
|
|
|
6281
|
-
|
|
6282
|
-
|
|
6283
|
-
|
|
6361
|
+
cb(Qcur, "Qcur", il);
|
|
6362
|
+
cb(Kcur, "Kcur", il);
|
|
6363
|
+
cb(Vcur, "Vcur", il);
|
|
6284
6364
|
|
|
6285
|
-
|
|
6286
|
-
|
|
6287
|
-
|
|
6288
|
-
|
|
6365
|
+
cur = build_attn(inp_attn, gf,
|
|
6366
|
+
model.layers[il].wo, nullptr,
|
|
6367
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6368
|
+
cb(cur, "kqv_out", il);
|
|
6369
|
+
}
|
|
6289
6370
|
|
|
6290
|
-
if (il == n_layer - 1 &&
|
|
6291
|
-
// skip computing output for unused tokens
|
|
6292
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6371
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
6293
6372
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
6294
6373
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
6295
6374
|
}
|
|
@@ -6354,6 +6433,8 @@ struct llm_build_bloom : public llm_graph_context {
|
|
|
6354
6433
|
LLM_NORM, -1);
|
|
6355
6434
|
cb(inpL, "inp_norm", -1);
|
|
6356
6435
|
|
|
6436
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6437
|
+
|
|
6357
6438
|
for (int il = 0; il < n_layer; ++il) {
|
|
6358
6439
|
cur = build_norm(inpL,
|
|
6359
6440
|
model.layers[il].attn_norm,
|
|
@@ -6386,9 +6467,7 @@ struct llm_build_bloom : public llm_graph_context {
|
|
|
6386
6467
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6387
6468
|
}
|
|
6388
6469
|
|
|
6389
|
-
if (il == n_layer - 1) {
|
|
6390
|
-
// skip computing output for unused tokens
|
|
6391
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6470
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
6392
6471
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
6393
6472
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
6394
6473
|
}
|
|
@@ -6465,6 +6544,8 @@ struct llm_build_mpt : public llm_graph_context {
|
|
|
6465
6544
|
cb(inpL, "inpL", -1);
|
|
6466
6545
|
}
|
|
6467
6546
|
|
|
6547
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6548
|
+
|
|
6468
6549
|
for (int il = 0; il < n_layer; ++il) {
|
|
6469
6550
|
ggml_tensor * attn_norm;
|
|
6470
6551
|
|
|
@@ -6527,9 +6608,7 @@ struct llm_build_mpt : public llm_graph_context {
|
|
|
6527
6608
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6528
6609
|
}
|
|
6529
6610
|
|
|
6530
|
-
if (il == n_layer - 1) {
|
|
6531
|
-
// skip computing output for unused tokens
|
|
6532
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6611
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
6533
6612
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
6534
6613
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
6535
6614
|
}
|
|
@@ -6598,6 +6677,8 @@ struct llm_build_stablelm : public llm_graph_context {
|
|
|
6598
6677
|
|
|
6599
6678
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
6600
6679
|
|
|
6680
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6681
|
+
|
|
6601
6682
|
for (int il = 0; il < n_layer; ++il) {
|
|
6602
6683
|
// norm
|
|
6603
6684
|
cur = build_norm(inpL,
|
|
@@ -6673,9 +6754,7 @@ struct llm_build_stablelm : public llm_graph_context {
|
|
|
6673
6754
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6674
6755
|
}
|
|
6675
6756
|
|
|
6676
|
-
if (il == n_layer - 1) {
|
|
6677
|
-
// skip computing output for unused tokens
|
|
6678
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6757
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
6679
6758
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
6680
6759
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
6681
6760
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
@@ -6750,6 +6829,8 @@ struct llm_build_qwen : public llm_graph_context {
|
|
|
6750
6829
|
|
|
6751
6830
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
6752
6831
|
|
|
6832
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6833
|
+
|
|
6753
6834
|
for (int il = 0; il < n_layer; ++il) {
|
|
6754
6835
|
ggml_tensor * inpSA = inpL;
|
|
6755
6836
|
|
|
@@ -6796,9 +6877,7 @@ struct llm_build_qwen : public llm_graph_context {
|
|
|
6796
6877
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6797
6878
|
}
|
|
6798
6879
|
|
|
6799
|
-
if (il == n_layer - 1) {
|
|
6800
|
-
// skip computing output for unused tokens
|
|
6801
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6880
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
6802
6881
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
6803
6882
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
6804
6883
|
}
|
|
@@ -6867,6 +6946,8 @@ struct llm_build_qwen2 : public llm_graph_context {
|
|
|
6867
6946
|
|
|
6868
6947
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
6869
6948
|
|
|
6949
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6950
|
+
|
|
6870
6951
|
for (int il = 0; il < n_layer; ++il) {
|
|
6871
6952
|
ggml_tensor * inpSA = inpL;
|
|
6872
6953
|
|
|
@@ -6916,9 +6997,7 @@ struct llm_build_qwen2 : public llm_graph_context {
|
|
|
6916
6997
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6917
6998
|
}
|
|
6918
6999
|
|
|
6919
|
-
if (il == n_layer - 1) {
|
|
6920
|
-
// skip computing output for unused tokens
|
|
6921
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7000
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
6922
7001
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
6923
7002
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
6924
7003
|
}
|
|
@@ -6988,6 +7067,8 @@ struct llm_build_qwen2vl : public llm_graph_context {
|
|
|
6988
7067
|
int sections[4];
|
|
6989
7068
|
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
|
6990
7069
|
|
|
7070
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7071
|
+
|
|
6991
7072
|
for (int il = 0; il < n_layer; ++il) {
|
|
6992
7073
|
ggml_tensor * inpSA = inpL;
|
|
6993
7074
|
|
|
@@ -7037,9 +7118,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
|
|
|
7037
7118
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7038
7119
|
}
|
|
7039
7120
|
|
|
7040
|
-
if (il == n_layer - 1) {
|
|
7041
|
-
// skip computing output for unused tokens
|
|
7042
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7121
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
7043
7122
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
7044
7123
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
7045
7124
|
}
|
|
@@ -7106,6 +7185,8 @@ struct llm_build_qwen2moe : public llm_graph_context {
|
|
|
7106
7185
|
|
|
7107
7186
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
7108
7187
|
|
|
7188
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7189
|
+
|
|
7109
7190
|
for (int il = 0; il < n_layer; ++il) {
|
|
7110
7191
|
ggml_tensor * inpSA = inpL;
|
|
7111
7192
|
|
|
@@ -7164,9 +7245,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
|
|
|
7164
7245
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7165
7246
|
}
|
|
7166
7247
|
|
|
7167
|
-
if (il == n_layer - 1) {
|
|
7168
|
-
// skip computing output for unused tokens
|
|
7169
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7248
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
7170
7249
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
7171
7250
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
7172
7251
|
}
|
|
@@ -7265,6 +7344,8 @@ struct llm_build_qwen3 : public llm_graph_context {
|
|
|
7265
7344
|
|
|
7266
7345
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
7267
7346
|
|
|
7347
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7348
|
+
|
|
7268
7349
|
for (int il = 0; il < n_layer; ++il) {
|
|
7269
7350
|
ggml_tensor * inpSA = inpL;
|
|
7270
7351
|
|
|
@@ -7317,9 +7398,7 @@ struct llm_build_qwen3 : public llm_graph_context {
|
|
|
7317
7398
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7318
7399
|
}
|
|
7319
7400
|
|
|
7320
|
-
if (il == n_layer - 1) {
|
|
7321
|
-
// skip computing output for unused tokens
|
|
7322
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7401
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
7323
7402
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
7324
7403
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
7325
7404
|
}
|
|
@@ -7386,6 +7465,8 @@ struct llm_build_qwen3moe : public llm_graph_context {
|
|
|
7386
7465
|
|
|
7387
7466
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
7388
7467
|
|
|
7468
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7469
|
+
|
|
7389
7470
|
for (int il = 0; il < n_layer; ++il) {
|
|
7390
7471
|
ggml_tensor * inpSA = inpL;
|
|
7391
7472
|
|
|
@@ -7438,9 +7519,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
|
|
|
7438
7519
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7439
7520
|
}
|
|
7440
7521
|
|
|
7441
|
-
if (il == n_layer - 1) {
|
|
7442
|
-
// skip computing output for unused tokens
|
|
7443
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7522
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
7444
7523
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
7445
7524
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
7446
7525
|
}
|
|
@@ -7516,6 +7595,8 @@ struct llm_build_phi2 : public llm_graph_context {
|
|
|
7516
7595
|
|
|
7517
7596
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
7518
7597
|
|
|
7598
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7599
|
+
|
|
7519
7600
|
for (int il = 0; il < n_layer; ++il) {
|
|
7520
7601
|
attn_norm_output = build_norm(inpL,
|
|
7521
7602
|
model.layers[il].attn_norm,
|
|
@@ -7578,9 +7659,7 @@ struct llm_build_phi2 : public llm_graph_context {
|
|
|
7578
7659
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
7579
7660
|
}
|
|
7580
7661
|
|
|
7581
|
-
if (il == n_layer - 1) {
|
|
7582
|
-
// skip computing output for unused tokens
|
|
7583
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7662
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
7584
7663
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
7585
7664
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
7586
7665
|
attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
|
|
@@ -7652,6 +7731,8 @@ struct llm_build_phi3 : public llm_graph_context {
|
|
|
7652
7731
|
inp_attn = build_attn_inp_kv_unified();
|
|
7653
7732
|
}
|
|
7654
7733
|
|
|
7734
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7735
|
+
|
|
7655
7736
|
for (int il = 0; il < n_layer; ++il) {
|
|
7656
7737
|
auto * residual = inpL;
|
|
7657
7738
|
|
|
@@ -7715,9 +7796,7 @@ struct llm_build_phi3 : public llm_graph_context {
|
|
|
7715
7796
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
7716
7797
|
}
|
|
7717
7798
|
|
|
7718
|
-
if (il == n_layer - 1) {
|
|
7719
|
-
// skip computing output for unused tokens
|
|
7720
|
-
ggml_tensor* inp_out_ids = build_inp_out_ids();
|
|
7799
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
7721
7800
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
7722
7801
|
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
|
|
7723
7802
|
}
|
|
@@ -7803,15 +7882,16 @@ struct llm_build_plamo : public llm_graph_context {
|
|
|
7803
7882
|
|
|
7804
7883
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
7805
7884
|
|
|
7806
|
-
|
|
7885
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7807
7886
|
|
|
7887
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
7808
7888
|
// norm
|
|
7809
7889
|
cur = build_norm(inpL,
|
|
7810
7890
|
model.layers[il].attn_norm, NULL,
|
|
7811
7891
|
LLM_NORM_RMS, il);
|
|
7812
7892
|
cb(cur, "attn_norm", il);
|
|
7813
7893
|
|
|
7814
|
-
ggml_tensor *
|
|
7894
|
+
ggml_tensor * sa_inp = cur;
|
|
7815
7895
|
|
|
7816
7896
|
// self-attention
|
|
7817
7897
|
{
|
|
@@ -7849,18 +7929,17 @@ struct llm_build_plamo : public llm_graph_context {
|
|
|
7849
7929
|
model.layers[il].wo, NULL,
|
|
7850
7930
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7851
7931
|
}
|
|
7852
|
-
ggml_tensor * sa_out = cur;
|
|
7853
7932
|
|
|
7854
|
-
|
|
7855
|
-
|
|
7856
|
-
if (il == n_layer - 1) {
|
|
7857
|
-
// skip computing output for unused tokens
|
|
7858
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7933
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
7859
7934
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
7860
|
-
|
|
7935
|
+
sa_inp = ggml_get_rows(ctx0, sa_inp, inp_out_ids);
|
|
7861
7936
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
7862
7937
|
}
|
|
7863
7938
|
|
|
7939
|
+
ggml_tensor * sa_out = cur;
|
|
7940
|
+
|
|
7941
|
+
cur = sa_inp;
|
|
7942
|
+
|
|
7864
7943
|
// feed-forward network
|
|
7865
7944
|
{
|
|
7866
7945
|
cur = build_ffn(cur,
|
|
@@ -7925,6 +8004,8 @@ struct llm_build_gpt2 : public llm_graph_context {
|
|
|
7925
8004
|
inpL = ggml_add(ctx0, inpL, pos);
|
|
7926
8005
|
cb(inpL, "inpL", -1);
|
|
7927
8006
|
|
|
8007
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8008
|
+
|
|
7928
8009
|
for (int il = 0; il < n_layer; ++il) {
|
|
7929
8010
|
cur = build_norm(inpL,
|
|
7930
8011
|
model.layers[il].attn_norm,
|
|
@@ -7957,9 +8038,7 @@ struct llm_build_gpt2 : public llm_graph_context {
|
|
|
7957
8038
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7958
8039
|
}
|
|
7959
8040
|
|
|
7960
|
-
if (il == n_layer - 1) {
|
|
7961
|
-
// skip computing output for unused tokens
|
|
7962
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8041
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
7963
8042
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
7964
8043
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
7965
8044
|
}
|
|
@@ -8029,6 +8108,8 @@ struct llm_build_codeshell : public llm_graph_context {
|
|
|
8029
8108
|
|
|
8030
8109
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
8031
8110
|
|
|
8111
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8112
|
+
|
|
8032
8113
|
for (int il = 0; il < n_layer; ++il) {
|
|
8033
8114
|
cur = build_norm(inpL,
|
|
8034
8115
|
model.layers[il].attn_norm,
|
|
@@ -8073,9 +8154,7 @@ struct llm_build_codeshell : public llm_graph_context {
|
|
|
8073
8154
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8074
8155
|
}
|
|
8075
8156
|
|
|
8076
|
-
if (il == n_layer - 1) {
|
|
8077
|
-
// skip computing output for unused tokens
|
|
8078
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8157
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
8079
8158
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
8080
8159
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
8081
8160
|
}
|
|
@@ -8129,133 +8208,6 @@ struct llm_build_codeshell : public llm_graph_context {
|
|
|
8129
8208
|
|
|
8130
8209
|
struct llm_build_orion : public llm_graph_context {
|
|
8131
8210
|
llm_build_orion(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
8132
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
8133
|
-
|
|
8134
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
8135
|
-
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
8136
|
-
|
|
8137
|
-
ggml_tensor * cur;
|
|
8138
|
-
ggml_tensor * inpL;
|
|
8139
|
-
|
|
8140
|
-
inpL = build_inp_embd(model.tok_embd);
|
|
8141
|
-
|
|
8142
|
-
// inp_pos - contains the positions
|
|
8143
|
-
ggml_tensor * inp_pos = build_inp_pos();
|
|
8144
|
-
|
|
8145
|
-
auto * inp_attn = build_attn_inp_kv_unified();
|
|
8146
|
-
|
|
8147
|
-
for (int il = 0; il < n_layer; ++il) {
|
|
8148
|
-
ggml_tensor * inpSA = inpL;
|
|
8149
|
-
|
|
8150
|
-
// norm
|
|
8151
|
-
cur = build_norm(inpL,
|
|
8152
|
-
model.layers[il].attn_norm, model.layers[il].attn_norm_b,
|
|
8153
|
-
LLM_NORM, il);
|
|
8154
|
-
cb(cur, "attn_norm", il);
|
|
8155
|
-
|
|
8156
|
-
// self-attention
|
|
8157
|
-
{
|
|
8158
|
-
// compute Q and K and RoPE them
|
|
8159
|
-
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
8160
|
-
cb(Qcur, "Qcur", il);
|
|
8161
|
-
// if (model.layers[il].bq) {
|
|
8162
|
-
// Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
8163
|
-
// cb(Qcur, "Qcur", il);
|
|
8164
|
-
// }
|
|
8165
|
-
|
|
8166
|
-
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
8167
|
-
cb(Kcur, "Kcur", il);
|
|
8168
|
-
// if (model.layers[il].bk) {
|
|
8169
|
-
// Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
8170
|
-
// cb(Kcur, "Kcur", il);
|
|
8171
|
-
// }
|
|
8172
|
-
|
|
8173
|
-
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
8174
|
-
cb(Vcur, "Vcur", il);
|
|
8175
|
-
// if (model.layers[il].bv) {
|
|
8176
|
-
// Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
8177
|
-
// cb(Vcur, "Vcur", il);
|
|
8178
|
-
// }
|
|
8179
|
-
|
|
8180
|
-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
8181
|
-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
8182
|
-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
8183
|
-
|
|
8184
|
-
Qcur = ggml_rope_ext(
|
|
8185
|
-
ctx0, Qcur, inp_pos, nullptr,
|
|
8186
|
-
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8187
|
-
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8188
|
-
);
|
|
8189
|
-
|
|
8190
|
-
Kcur = ggml_rope_ext(
|
|
8191
|
-
ctx0, Kcur, inp_pos, nullptr,
|
|
8192
|
-
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8193
|
-
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8194
|
-
);
|
|
8195
|
-
|
|
8196
|
-
cb(Qcur, "Qcur", il);
|
|
8197
|
-
cb(Kcur, "Kcur", il);
|
|
8198
|
-
cb(Vcur, "Vcur", il);
|
|
8199
|
-
|
|
8200
|
-
cur = build_attn(inp_attn, gf,
|
|
8201
|
-
model.layers[il].wo, NULL,
|
|
8202
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8203
|
-
}
|
|
8204
|
-
|
|
8205
|
-
if (il == n_layer - 1) {
|
|
8206
|
-
// skip computing output for unused tokens
|
|
8207
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8208
|
-
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
8209
|
-
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
8210
|
-
}
|
|
8211
|
-
|
|
8212
|
-
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
8213
|
-
cb(ffn_inp, "ffn_inp", il);
|
|
8214
|
-
|
|
8215
|
-
// feed-forward network
|
|
8216
|
-
cur = build_norm(ffn_inp,
|
|
8217
|
-
model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
|
|
8218
|
-
LLM_NORM, il);
|
|
8219
|
-
cb(cur, "ffn_norm", il);
|
|
8220
|
-
|
|
8221
|
-
cur = build_ffn(cur,
|
|
8222
|
-
model.layers[il].ffn_up, NULL, NULL,
|
|
8223
|
-
model.layers[il].ffn_gate, NULL, NULL,
|
|
8224
|
-
model.layers[il].ffn_down, NULL, NULL,
|
|
8225
|
-
NULL,
|
|
8226
|
-
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
8227
|
-
cb(cur, "ffn_out", il);
|
|
8228
|
-
|
|
8229
|
-
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
8230
|
-
|
|
8231
|
-
cur = build_cvec(cur, il);
|
|
8232
|
-
cb(cur, "l_out", il);
|
|
8233
|
-
|
|
8234
|
-
// input for next layer
|
|
8235
|
-
inpL = cur;
|
|
8236
|
-
}
|
|
8237
|
-
|
|
8238
|
-
cur = inpL;
|
|
8239
|
-
|
|
8240
|
-
cur = build_norm(cur,
|
|
8241
|
-
model.output_norm, model.output_norm_b,
|
|
8242
|
-
LLM_NORM, -1);
|
|
8243
|
-
|
|
8244
|
-
cb(cur, "result_norm", -1);
|
|
8245
|
-
res->t_embd = cur;
|
|
8246
|
-
|
|
8247
|
-
// lm_head
|
|
8248
|
-
cur = build_lora_mm(model.output, cur);
|
|
8249
|
-
|
|
8250
|
-
cb(cur, "result_output", -1);
|
|
8251
|
-
res->t_logits = cur;
|
|
8252
|
-
|
|
8253
|
-
ggml_build_forward_expand(gf, cur);
|
|
8254
|
-
}
|
|
8255
|
-
};
|
|
8256
|
-
|
|
8257
|
-
struct llm_build_internlm2 : public llm_graph_context {
|
|
8258
|
-
llm_build_internlm2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
8259
8211
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
8260
8212
|
|
|
8261
8213
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -8271,13 +8223,15 @@ struct llm_build_internlm2 : public llm_graph_context {
|
|
|
8271
8223
|
|
|
8272
8224
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
8273
8225
|
|
|
8226
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8227
|
+
|
|
8274
8228
|
for (int il = 0; il < n_layer; ++il) {
|
|
8275
8229
|
ggml_tensor * inpSA = inpL;
|
|
8276
8230
|
|
|
8277
8231
|
// norm
|
|
8278
8232
|
cur = build_norm(inpL,
|
|
8279
|
-
model.layers[il].attn_norm,
|
|
8280
|
-
|
|
8233
|
+
model.layers[il].attn_norm, model.layers[il].attn_norm_b,
|
|
8234
|
+
LLM_NORM, il);
|
|
8281
8235
|
cb(cur, "attn_norm", il);
|
|
8282
8236
|
|
|
8283
8237
|
// self-attention
|
|
@@ -8285,24 +8239,24 @@ struct llm_build_internlm2 : public llm_graph_context {
|
|
|
8285
8239
|
// compute Q and K and RoPE them
|
|
8286
8240
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
8287
8241
|
cb(Qcur, "Qcur", il);
|
|
8288
|
-
if (model.layers[il].bq) {
|
|
8289
|
-
|
|
8290
|
-
|
|
8291
|
-
}
|
|
8242
|
+
// if (model.layers[il].bq) {
|
|
8243
|
+
// Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
8244
|
+
// cb(Qcur, "Qcur", il);
|
|
8245
|
+
// }
|
|
8292
8246
|
|
|
8293
8247
|
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
8294
8248
|
cb(Kcur, "Kcur", il);
|
|
8295
|
-
if (model.layers[il].bk) {
|
|
8296
|
-
|
|
8297
|
-
|
|
8298
|
-
}
|
|
8249
|
+
// if (model.layers[il].bk) {
|
|
8250
|
+
// Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
8251
|
+
// cb(Kcur, "Kcur", il);
|
|
8252
|
+
// }
|
|
8299
8253
|
|
|
8300
8254
|
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
8301
8255
|
cb(Vcur, "Vcur", il);
|
|
8302
|
-
if (model.layers[il].bv) {
|
|
8303
|
-
|
|
8304
|
-
|
|
8305
|
-
}
|
|
8256
|
+
// if (model.layers[il].bv) {
|
|
8257
|
+
// Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
8258
|
+
// cb(Vcur, "Vcur", il);
|
|
8259
|
+
// }
|
|
8306
8260
|
|
|
8307
8261
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
8308
8262
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
@@ -8325,13 +8279,11 @@ struct llm_build_internlm2 : public llm_graph_context {
|
|
|
8325
8279
|
cb(Vcur, "Vcur", il);
|
|
8326
8280
|
|
|
8327
8281
|
cur = build_attn(inp_attn, gf,
|
|
8328
|
-
model.layers[il].wo,
|
|
8282
|
+
model.layers[il].wo, NULL,
|
|
8329
8283
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8330
8284
|
}
|
|
8331
8285
|
|
|
8332
|
-
if (il == n_layer - 1) {
|
|
8333
|
-
// skip computing output for unused tokens
|
|
8334
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8286
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
8335
8287
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
8336
8288
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
8337
8289
|
}
|
|
@@ -8341,8 +8293,135 @@ struct llm_build_internlm2 : public llm_graph_context {
|
|
|
8341
8293
|
|
|
8342
8294
|
// feed-forward network
|
|
8343
8295
|
cur = build_norm(ffn_inp,
|
|
8344
|
-
model.layers[il].ffn_norm,
|
|
8345
|
-
|
|
8296
|
+
model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
|
|
8297
|
+
LLM_NORM, il);
|
|
8298
|
+
cb(cur, "ffn_norm", il);
|
|
8299
|
+
|
|
8300
|
+
cur = build_ffn(cur,
|
|
8301
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
8302
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
|
8303
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
8304
|
+
NULL,
|
|
8305
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
8306
|
+
cb(cur, "ffn_out", il);
|
|
8307
|
+
|
|
8308
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
8309
|
+
|
|
8310
|
+
cur = build_cvec(cur, il);
|
|
8311
|
+
cb(cur, "l_out", il);
|
|
8312
|
+
|
|
8313
|
+
// input for next layer
|
|
8314
|
+
inpL = cur;
|
|
8315
|
+
}
|
|
8316
|
+
|
|
8317
|
+
cur = inpL;
|
|
8318
|
+
|
|
8319
|
+
cur = build_norm(cur,
|
|
8320
|
+
model.output_norm, model.output_norm_b,
|
|
8321
|
+
LLM_NORM, -1);
|
|
8322
|
+
|
|
8323
|
+
cb(cur, "result_norm", -1);
|
|
8324
|
+
res->t_embd = cur;
|
|
8325
|
+
|
|
8326
|
+
// lm_head
|
|
8327
|
+
cur = build_lora_mm(model.output, cur);
|
|
8328
|
+
|
|
8329
|
+
cb(cur, "result_output", -1);
|
|
8330
|
+
res->t_logits = cur;
|
|
8331
|
+
|
|
8332
|
+
ggml_build_forward_expand(gf, cur);
|
|
8333
|
+
}
|
|
8334
|
+
};
|
|
8335
|
+
|
|
8336
|
+
struct llm_build_internlm2 : public llm_graph_context {
|
|
8337
|
+
llm_build_internlm2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
8338
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
8339
|
+
|
|
8340
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
8341
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
8342
|
+
|
|
8343
|
+
ggml_tensor * cur;
|
|
8344
|
+
ggml_tensor * inpL;
|
|
8345
|
+
|
|
8346
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
8347
|
+
|
|
8348
|
+
// inp_pos - contains the positions
|
|
8349
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
8350
|
+
|
|
8351
|
+
auto * inp_attn = build_attn_inp_kv_unified();
|
|
8352
|
+
|
|
8353
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8354
|
+
|
|
8355
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
8356
|
+
ggml_tensor * inpSA = inpL;
|
|
8357
|
+
|
|
8358
|
+
// norm
|
|
8359
|
+
cur = build_norm(inpL,
|
|
8360
|
+
model.layers[il].attn_norm, NULL,
|
|
8361
|
+
LLM_NORM_RMS, il);
|
|
8362
|
+
cb(cur, "attn_norm", il);
|
|
8363
|
+
|
|
8364
|
+
// self-attention
|
|
8365
|
+
{
|
|
8366
|
+
// compute Q and K and RoPE them
|
|
8367
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
8368
|
+
cb(Qcur, "Qcur", il);
|
|
8369
|
+
if (model.layers[il].bq) {
|
|
8370
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
8371
|
+
cb(Qcur, "Qcur", il);
|
|
8372
|
+
}
|
|
8373
|
+
|
|
8374
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
8375
|
+
cb(Kcur, "Kcur", il);
|
|
8376
|
+
if (model.layers[il].bk) {
|
|
8377
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
8378
|
+
cb(Kcur, "Kcur", il);
|
|
8379
|
+
}
|
|
8380
|
+
|
|
8381
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
8382
|
+
cb(Vcur, "Vcur", il);
|
|
8383
|
+
if (model.layers[il].bv) {
|
|
8384
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
8385
|
+
cb(Vcur, "Vcur", il);
|
|
8386
|
+
}
|
|
8387
|
+
|
|
8388
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
8389
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
8390
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
8391
|
+
|
|
8392
|
+
Qcur = ggml_rope_ext(
|
|
8393
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
8394
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8395
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8396
|
+
);
|
|
8397
|
+
|
|
8398
|
+
Kcur = ggml_rope_ext(
|
|
8399
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
8400
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8401
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8402
|
+
);
|
|
8403
|
+
|
|
8404
|
+
cb(Qcur, "Qcur", il);
|
|
8405
|
+
cb(Kcur, "Kcur", il);
|
|
8406
|
+
cb(Vcur, "Vcur", il);
|
|
8407
|
+
|
|
8408
|
+
cur = build_attn(inp_attn, gf,
|
|
8409
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
8410
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8411
|
+
}
|
|
8412
|
+
|
|
8413
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
8414
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
8415
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
8416
|
+
}
|
|
8417
|
+
|
|
8418
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
8419
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
8420
|
+
|
|
8421
|
+
// feed-forward network
|
|
8422
|
+
cur = build_norm(ffn_inp,
|
|
8423
|
+
model.layers[il].ffn_norm, NULL,
|
|
8424
|
+
LLM_NORM_RMS, il);
|
|
8346
8425
|
cb(cur, "ffn_norm", il);
|
|
8347
8426
|
|
|
8348
8427
|
cur = build_ffn(cur,
|
|
@@ -8407,6 +8486,8 @@ struct llm_build_minicpm3 : public llm_graph_context {
|
|
|
8407
8486
|
|
|
8408
8487
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
8409
8488
|
|
|
8489
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8490
|
+
|
|
8410
8491
|
for (int il = 0; il < n_layer; ++il) {
|
|
8411
8492
|
ggml_tensor * inpSA = inpL;
|
|
8412
8493
|
|
|
@@ -8526,15 +8607,13 @@ struct llm_build_minicpm3 : public llm_graph_context {
|
|
|
8526
8607
|
q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
|
|
8527
8608
|
}
|
|
8528
8609
|
|
|
8529
|
-
if (il == n_layer - 1) {
|
|
8530
|
-
// skip computing output for unused tokens
|
|
8531
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8610
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
8532
8611
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
8533
8612
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
8534
8613
|
}
|
|
8535
8614
|
|
|
8536
8615
|
// scale_res - scale the hidden states for residual connection
|
|
8537
|
-
const float scale_res = scale_depth/sqrtf(float(n_layer));
|
|
8616
|
+
const float scale_res = scale_depth/sqrtf(float(n_layer)); // TODO: is this correct?
|
|
8538
8617
|
cur = ggml_scale(ctx0, cur, scale_res);
|
|
8539
8618
|
cb(cur, "hidden_scaled", il);
|
|
8540
8619
|
|
|
@@ -8611,6 +8690,8 @@ struct llm_build_gemma : public llm_graph_context {
|
|
|
8611
8690
|
|
|
8612
8691
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
8613
8692
|
|
|
8693
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8694
|
+
|
|
8614
8695
|
for (int il = 0; il < n_layer; ++il) {
|
|
8615
8696
|
// norm
|
|
8616
8697
|
cur = build_norm(inpL,
|
|
@@ -8656,9 +8737,7 @@ struct llm_build_gemma : public llm_graph_context {
|
|
|
8656
8737
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
8657
8738
|
}
|
|
8658
8739
|
|
|
8659
|
-
if (il == n_layer - 1) {
|
|
8660
|
-
// skip computing output for unused tokens
|
|
8661
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8740
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
8662
8741
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
8663
8742
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
8664
8743
|
}
|
|
@@ -8727,6 +8806,8 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
|
|
|
8727
8806
|
|
|
8728
8807
|
auto * inp_attn = build_attn_inp_kv_unified_iswa();
|
|
8729
8808
|
|
|
8809
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8810
|
+
|
|
8730
8811
|
for (int il = 0; il < n_layer; ++il) {
|
|
8731
8812
|
// norm
|
|
8732
8813
|
cur = build_norm(inpL,
|
|
@@ -8771,18 +8852,16 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
|
|
|
8771
8852
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
8772
8853
|
}
|
|
8773
8854
|
|
|
8855
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
8856
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
8857
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
8858
|
+
}
|
|
8859
|
+
|
|
8774
8860
|
cur = build_norm(cur,
|
|
8775
8861
|
model.layers[il].attn_post_norm, NULL,
|
|
8776
8862
|
LLM_NORM_RMS, il);
|
|
8777
8863
|
cb(cur, "attn_post_norm", il);
|
|
8778
8864
|
|
|
8779
|
-
if (il == n_layer - 1) {
|
|
8780
|
-
// skip computing output for unused tokens
|
|
8781
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8782
|
-
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
8783
|
-
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
8784
|
-
}
|
|
8785
|
-
|
|
8786
8865
|
ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
|
|
8787
8866
|
cb(sa_out, "sa_out", il);
|
|
8788
8867
|
|
|
@@ -8861,6 +8940,8 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
|
|
|
8861
8940
|
// TODO: is causal == true correct? might need some changes
|
|
8862
8941
|
auto * inp_attn = build_attn_inp_kv_unified_iswa();
|
|
8863
8942
|
|
|
8943
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8944
|
+
|
|
8864
8945
|
for (int il = 0; il < n_layer; ++il) {
|
|
8865
8946
|
const float freq_base_l = model.get_rope_freq_base (cparams, il);
|
|
8866
8947
|
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
|
|
@@ -8913,18 +8994,16 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
|
|
|
8913
8994
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
8914
8995
|
}
|
|
8915
8996
|
|
|
8997
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
8998
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
8999
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
9000
|
+
}
|
|
9001
|
+
|
|
8916
9002
|
cur = build_norm(cur,
|
|
8917
9003
|
model.layers[il].attn_post_norm, NULL,
|
|
8918
9004
|
LLM_NORM_RMS, il);
|
|
8919
9005
|
cb(cur, "attn_post_norm", il);
|
|
8920
9006
|
|
|
8921
|
-
if (il == n_layer - 1) {
|
|
8922
|
-
// skip computing output for unused tokens
|
|
8923
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8924
|
-
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
8925
|
-
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
8926
|
-
}
|
|
8927
|
-
|
|
8928
9007
|
ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
|
|
8929
9008
|
cb(sa_out, "sa_out", il);
|
|
8930
9009
|
|
|
@@ -8977,6 +9056,442 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
|
|
|
8977
9056
|
}
|
|
8978
9057
|
};
|
|
8979
9058
|
|
|
9059
|
+
struct llm_build_gemma3n_iswa : public llm_graph_context {
|
|
9060
|
+
const llama_model & model;
|
|
9061
|
+
ggml_cgraph * gf;
|
|
9062
|
+
|
|
9063
|
+
const int64_t n_embd_head;
|
|
9064
|
+
const int64_t n_embd_altup;
|
|
9065
|
+
const int64_t n_altup;
|
|
9066
|
+
const int i_altup_act;
|
|
9067
|
+
const int n_layer_kv = 20; // number of layers having KV [KV_REUSE]
|
|
9068
|
+
const int n_layer_sparsity = 10; // number of layers using activation sparsity
|
|
9069
|
+
const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
|
|
9070
|
+
|
|
9071
|
+
ggml_tensor * one; // containing single element 1.0f
|
|
9072
|
+
|
|
9073
|
+
llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf)
|
|
9074
|
+
: llm_graph_context(params),
|
|
9075
|
+
model(model),
|
|
9076
|
+
gf(gf),
|
|
9077
|
+
n_embd_head(model.hparams.n_embd_head_k),
|
|
9078
|
+
n_embd_altup(model.hparams.n_embd_altup),
|
|
9079
|
+
n_altup(model.hparams.n_altup),
|
|
9080
|
+
i_altup_act(model.hparams.i_altup_act) {
|
|
9081
|
+
ggml_tensor * cur;
|
|
9082
|
+
ggml_tensor * inpL;
|
|
9083
|
+
|
|
9084
|
+
// TODO: remove this when ggml_scale_add is implemented
|
|
9085
|
+
one = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
|
9086
|
+
{
|
|
9087
|
+
auto inp = std::make_unique<llm_graph_input_one>();
|
|
9088
|
+
inp->one = one;
|
|
9089
|
+
res->add_input(std::move(inp));
|
|
9090
|
+
}
|
|
9091
|
+
|
|
9092
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
9093
|
+
|
|
9094
|
+
// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
|
|
9095
|
+
if (ubatch.token) {
|
|
9096
|
+
inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
|
|
9097
|
+
cb(inpL, "inp_scaled", -1);
|
|
9098
|
+
}
|
|
9099
|
+
|
|
9100
|
+
// inp_pos - contains the positions
|
|
9101
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
9102
|
+
|
|
9103
|
+
// TODO: is causal == true correct? might need some changes
|
|
9104
|
+
auto * inp_attn = build_attn_inp_kv_unified_iswa();
|
|
9105
|
+
|
|
9106
|
+
// inp_per_layer shape: [n_embd_altup, n_tokens, n_layer]
|
|
9107
|
+
ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs());
|
|
9108
|
+
|
|
9109
|
+
// inpL now has only 1 altup, project it to the rest of the altups
|
|
9110
|
+
// these "added" altups will be concat to the last dim of inpL
|
|
9111
|
+
{
|
|
9112
|
+
ggml_tensor * target_magnitude = calc_magnitude(inpL);
|
|
9113
|
+
ggml_tensor * inp_repeated = ggml_repeat_4d(ctx0, inpL, n_embd, n_tokens, n_altup - 1, 1);
|
|
9114
|
+
ggml_tensor * altup_added = ggml_mul_mat(ctx0, model.altup_proj, inp_repeated); // shape: [n_embd, n_tokens, n_altup - 1]
|
|
9115
|
+
ggml_tensor * new_magnitude = calc_magnitude(altup_added);
|
|
9116
|
+
altup_added = ggml_div(ctx0,
|
|
9117
|
+
ggml_mul(ctx0, altup_added, target_magnitude),
|
|
9118
|
+
new_magnitude);
|
|
9119
|
+
inpL = ggml_concat(ctx0, inpL, altup_added, 2); // shape: [n_embd, n_tokens, n_altup]
|
|
9120
|
+
cb(inpL, "inp_stacked", -1);
|
|
9121
|
+
}
|
|
9122
|
+
|
|
9123
|
+
// inpL now has shape: [n_embd, n_tokens, n_altup]
|
|
9124
|
+
// inp_per_layer now has shape: [n_embd_altup, n_tokens, n_layer]
|
|
9125
|
+
|
|
9126
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
9127
|
+
// this block is written to closely resemble Gemma3p5DecoderLayer in the python code
|
|
9128
|
+
const bool has_kv = (il < n_layer_kv);
|
|
9129
|
+
|
|
9130
|
+
const float freq_base_l = model.get_rope_freq_base (cparams, il);
|
|
9131
|
+
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
|
|
9132
|
+
|
|
9133
|
+
ggml_tensor * cur = inpL; // [n_embd, n_tokens, n_altup]
|
|
9134
|
+
ggml_tensor * predictions = altup_predict(cur, il); // [n_embd, n_tokens, n_altup]
|
|
9135
|
+
|
|
9136
|
+
// predicted value will go through self-attention and laurel
|
|
9137
|
+
ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act); // [n_embd, n_tokens]
|
|
9138
|
+
cur = active_prediction;
|
|
9139
|
+
cb(cur, "active_prediction", il);
|
|
9140
|
+
|
|
9141
|
+
// norm
|
|
9142
|
+
cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
|
9143
|
+
cb(cur, "attn_norm", il);
|
|
9144
|
+
|
|
9145
|
+
// laurel
|
|
9146
|
+
ggml_tensor * laurel_out = laurel(cur, il); // [n_embd, n_tokens]
|
|
9147
|
+
|
|
9148
|
+
// self-attention
|
|
9149
|
+
if (has_kv) {
|
|
9150
|
+
// compute Q and K and RoPE them
|
|
9151
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
9152
|
+
cb(Qcur, "Qcur", il);
|
|
9153
|
+
|
|
9154
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
9155
|
+
cb(Kcur, "Kcur", il);
|
|
9156
|
+
|
|
9157
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
9158
|
+
cb(Vcur, "Vcur", il);
|
|
9159
|
+
|
|
9160
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
9161
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
9162
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
9163
|
+
|
|
9164
|
+
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
|
9165
|
+
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
|
9166
|
+
Vcur = ggml_rms_norm(ctx0, Vcur, hparams.f_norm_rms_eps);
|
|
9167
|
+
|
|
9168
|
+
cb(Qcur, "Qcur_normed", il);
|
|
9169
|
+
cb(Kcur, "Kcur_normed", il);
|
|
9170
|
+
cb(Vcur, "Vcur_normed", il);
|
|
9171
|
+
|
|
9172
|
+
Qcur = ggml_rope_ext(
|
|
9173
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
9174
|
+
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
|
|
9175
|
+
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
9176
|
+
|
|
9177
|
+
Kcur = ggml_rope_ext(
|
|
9178
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
9179
|
+
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
|
|
9180
|
+
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
9181
|
+
|
|
9182
|
+
cb(Qcur, "Qcur_pos", il);
|
|
9183
|
+
cb(Kcur, "Kcur_pos", il);
|
|
9184
|
+
|
|
9185
|
+
cur = build_attn(inp_attn, gf,
|
|
9186
|
+
model.layers[il].wo, NULL,
|
|
9187
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
|
|
9188
|
+
} else {
|
|
9189
|
+
// no KV layers
|
|
9190
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
9191
|
+
cb(Qcur, "Qcur", il);
|
|
9192
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
9193
|
+
|
|
9194
|
+
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
|
9195
|
+
cb(Qcur, "Qcur_normed", il);
|
|
9196
|
+
|
|
9197
|
+
Qcur = ggml_rope_ext(
|
|
9198
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
9199
|
+
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
|
|
9200
|
+
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
9201
|
+
cb(Qcur, "Qcur_pos", il);
|
|
9202
|
+
|
|
9203
|
+
cur = build_attn(inp_attn, gf,
|
|
9204
|
+
model.layers[il].wo, NULL,
|
|
9205
|
+
Qcur, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
|
|
9206
|
+
}
|
|
9207
|
+
|
|
9208
|
+
cur = build_norm(cur,
|
|
9209
|
+
model.layers[il].attn_post_norm, NULL,
|
|
9210
|
+
LLM_NORM_RMS, il);
|
|
9211
|
+
cb(cur, "attn_post_norm", il);
|
|
9212
|
+
|
|
9213
|
+
cur = ggml_add(ctx0, cur, active_prediction); // [n_embd, n_tokens]
|
|
9214
|
+
cb(cur, "attn_gated", il);
|
|
9215
|
+
|
|
9216
|
+
ggml_tensor * attn_laurel = ggml_scale(ctx0,
|
|
9217
|
+
ggml_add(ctx0, cur, laurel_out),
|
|
9218
|
+
1.0f / sqrtf(2.0f)); // [n_embd, n_tokens]
|
|
9219
|
+
cb(attn_laurel, "attn_laurel", il);
|
|
9220
|
+
|
|
9221
|
+
cur = build_norm(attn_laurel,
|
|
9222
|
+
model.layers[il].ffn_norm, NULL,
|
|
9223
|
+
LLM_NORM_RMS, il);
|
|
9224
|
+
cb(cur, "ffn_norm", il);
|
|
9225
|
+
|
|
9226
|
+
// feed-forward network
|
|
9227
|
+
{
|
|
9228
|
+
ggml_tensor * up_proj = build_lora_mm(model.layers[il].ffn_up, cur);
|
|
9229
|
+
ggml_tensor * gate_proj = build_lora_mm(model.layers[il].ffn_gate, cur);
|
|
9230
|
+
|
|
9231
|
+
if (il < n_layer_sparsity) {
|
|
9232
|
+
// apply activation sparsity
|
|
9233
|
+
gate_proj = gaussian_topk(gate_proj);
|
|
9234
|
+
}
|
|
9235
|
+
gate_proj = ggml_gelu(ctx0, gate_proj);
|
|
9236
|
+
|
|
9237
|
+
cur = ggml_mul(ctx0, up_proj, gate_proj);
|
|
9238
|
+
cur = build_lora_mm(model.layers[il].ffn_down, cur);
|
|
9239
|
+
cb(cur, "ffn_out", il);
|
|
9240
|
+
}
|
|
9241
|
+
|
|
9242
|
+
cur = build_norm(cur,
|
|
9243
|
+
model.layers[il].ffn_post_norm, NULL,
|
|
9244
|
+
LLM_NORM_RMS, -1);
|
|
9245
|
+
cb(cur, "ffn_post_norm", il);
|
|
9246
|
+
|
|
9247
|
+
ggml_tensor * attn_ffw_laurel_gated = ggml_add(ctx0, cur, attn_laurel); // [n_embd, n_tokens]
|
|
9248
|
+
cb(attn_ffw_laurel_gated, "attn_ffw_laurel_gated", il);
|
|
9249
|
+
|
|
9250
|
+
ggml_tensor * corrected = altup_correct(predictions, attn_ffw_laurel_gated, il); // [n_embd, n_tokens, n_altup]
|
|
9251
|
+
|
|
9252
|
+
ggml_tensor * first_prediction; // [n_embd, n_tokens]
|
|
9253
|
+
{
|
|
9254
|
+
first_prediction = view_2d_slice(corrected, i_altup_act); // [n_embd, n_tokens]
|
|
9255
|
+
first_prediction = ggml_mul(ctx0, first_prediction, model.layers[il].altup_correct_scale);
|
|
9256
|
+
first_prediction = build_lora_mm(model.layers[il].per_layer_inp_gate, first_prediction);
|
|
9257
|
+
first_prediction = ggml_gelu(ctx0, first_prediction); // [n_embd_altup, n_tokens]
|
|
9258
|
+
cb(first_prediction, "first_prediction_gated", il);
|
|
9259
|
+
ggml_tensor * inp_this_layer = view_2d_slice(inp_per_layer, il); // [n_embd_altup, n_tokens]
|
|
9260
|
+
first_prediction = ggml_mul(ctx0, first_prediction, inp_this_layer); // [n_embd_altup, n_tokens]
|
|
9261
|
+
cb(first_prediction, "first_prediction_scaled", il);
|
|
9262
|
+
|
|
9263
|
+
first_prediction = build_lora_mm(model.layers[il].per_layer_proj, first_prediction); // [n_embd, n_tokens]
|
|
9264
|
+
first_prediction = build_norm(first_prediction,
|
|
9265
|
+
model.layers[il].per_layer_post_norm, NULL,
|
|
9266
|
+
LLM_NORM_RMS, il);
|
|
9267
|
+
cb(first_prediction, "first_prediction_out", il);
|
|
9268
|
+
}
|
|
9269
|
+
|
|
9270
|
+
// equivalent to python code: corrected_predictions[1:] += first_prediction
|
|
9271
|
+
{
|
|
9272
|
+
ggml_tensor * slice_first = view_2d_slice(corrected, 0);
|
|
9273
|
+
ggml_tensor * slice_rest = ggml_view_3d(ctx0, corrected, n_embd, n_tokens, n_altup - 1,
|
|
9274
|
+
ggml_row_size(corrected->type, n_embd),
|
|
9275
|
+
ggml_row_size(corrected->type, n_embd*n_tokens),
|
|
9276
|
+
n_embd*n_tokens*ggml_element_size(corrected));
|
|
9277
|
+
ggml_tensor * tmp = ggml_add(ctx0, slice_rest, first_prediction); // [n_embd, n_tokens, n_altup - 1]
|
|
9278
|
+
corrected = ggml_concat(ctx0, slice_first, tmp, 2); // [n_embd, n_tokens, n_altup]
|
|
9279
|
+
}
|
|
9280
|
+
|
|
9281
|
+
cur = corrected; // [n_embd, n_tokens, n_altup]
|
|
9282
|
+
cur = build_cvec(cur, il);
|
|
9283
|
+
cb(cur, "l_out", il);
|
|
9284
|
+
|
|
9285
|
+
// input for next layer
|
|
9286
|
+
inpL = cur;
|
|
9287
|
+
}
|
|
9288
|
+
|
|
9289
|
+
cur = inpL; // [n_embd, n_tokens, n_altup]
|
|
9290
|
+
|
|
9291
|
+
// cur now has multiple altup(s), we want to merge them back to 1 altup
|
|
9292
|
+
{
|
|
9293
|
+
ggml_tensor * target_magnitude = calc_magnitude(view_2d_slice(cur, i_altup_act)); // [n_embd, n_tokens]
|
|
9294
|
+
// do a view to skip the first slice (active altup)
|
|
9295
|
+
ggml_tensor * alt_slice = ggml_view_3d(ctx0, cur, n_embd, n_tokens, n_altup - 1,
|
|
9296
|
+
ggml_row_size(cur->type, n_embd),
|
|
9297
|
+
ggml_row_size(cur->type, n_embd*n_tokens),
|
|
9298
|
+
n_embd*n_tokens*ggml_element_size(cur));
|
|
9299
|
+
ggml_tensor * altup_unembd = ggml_mul_mat(ctx0, model.altup_unembd_proj, alt_slice); // shape: [n_embd, n_tokens, n_altup - 1]
|
|
9300
|
+
ggml_tensor * new_magnitude = calc_magnitude(altup_unembd);
|
|
9301
|
+
altup_unembd = ggml_div(ctx0,
|
|
9302
|
+
ggml_mul(ctx0, altup_unembd, target_magnitude),
|
|
9303
|
+
new_magnitude);
|
|
9304
|
+
cb(altup_unembd, "altup_unembd", -1);
|
|
9305
|
+
|
|
9306
|
+
// equivalent to torch.mean(hidden_states, dim=0)
|
|
9307
|
+
cur = view_2d_slice(cur, 0); // [n_embd, n_tokens]
|
|
9308
|
+
for (int i = 0; i < n_altup - 1; ++i) {
|
|
9309
|
+
cur = ggml_add(ctx0, cur, view_2d_slice(altup_unembd, i));
|
|
9310
|
+
}
|
|
9311
|
+
cur = ggml_scale(ctx0, cur, 1.0f / float(n_altup)); // [n_embd, n_tokens]
|
|
9312
|
+
cb(cur, "unembd_merged", -1);
|
|
9313
|
+
}
|
|
9314
|
+
|
|
9315
|
+
// cur now has shape: [n_embd, n_tokens]
|
|
9316
|
+
|
|
9317
|
+
// TODO: move this to right after the last KV layer
|
|
9318
|
+
{
|
|
9319
|
+
// skip computing output for unused tokens
|
|
9320
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
9321
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
9322
|
+
}
|
|
9323
|
+
|
|
9324
|
+
cur = build_norm(cur,
|
|
9325
|
+
model.output_norm, NULL,
|
|
9326
|
+
LLM_NORM_RMS, -1);
|
|
9327
|
+
|
|
9328
|
+
cb(cur, "result_norm", -1);
|
|
9329
|
+
res->t_embd = cur;
|
|
9330
|
+
|
|
9331
|
+
cur = build_lora_mm(model.output, cur);
|
|
9332
|
+
|
|
9333
|
+
{
|
|
9334
|
+
// final logit soft-capping
|
|
9335
|
+
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
|
|
9336
|
+
cur = ggml_tanh(ctx0, cur);
|
|
9337
|
+
cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
|
|
9338
|
+
}
|
|
9339
|
+
|
|
9340
|
+
cb(cur, "result_output", -1);
|
|
9341
|
+
res->t_logits = cur;
|
|
9342
|
+
|
|
9343
|
+
ggml_build_forward_expand(gf, cur);
|
|
9344
|
+
}
|
|
9345
|
+
|
|
9346
|
+
ggml_tensor * calc_magnitude(ggml_tensor * x) {
|
|
9347
|
+
return ggml_sqrt(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, x)));
|
|
9348
|
+
}
|
|
9349
|
+
|
|
9350
|
+
// get 2D slice view from a 3D tensor, the idx corresponds to the 3rd dim
|
|
9351
|
+
ggml_tensor * view_2d_slice(ggml_tensor * x, int idx) {
|
|
9352
|
+
GGML_ASSERT(idx < (int)x->ne[2]);
|
|
9353
|
+
return ggml_view_2d(ctx0, x, x->ne[0], x->ne[1],
|
|
9354
|
+
ggml_row_size(x->type, x->ne[0]),
|
|
9355
|
+
idx * x->ne[0] * x->ne[1] * ggml_element_size(x));
|
|
9356
|
+
}
|
|
9357
|
+
|
|
9358
|
+
// equivalent to get_per_layer_inputs() in python code
|
|
9359
|
+
// output shape: [n_embd_altup, n_layer, n_tokens]
|
|
9360
|
+
ggml_tensor * get_per_layer_inputs() {
|
|
9361
|
+
auto inp = std::make_unique<llm_graph_input_embd>();
|
|
9362
|
+
ggml_tensor * inp_per_layer;
|
|
9363
|
+
if (ubatch.token) {
|
|
9364
|
+
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
|
|
9365
|
+
ggml_set_input(inp->tokens);
|
|
9366
|
+
res->t_tokens = inp->tokens;
|
|
9367
|
+
inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens);
|
|
9368
|
+
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
|
|
9369
|
+
inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float)n_embd_altup));
|
|
9370
|
+
cb(inp_per_layer, "inp_per_layer_selected", -1);
|
|
9371
|
+
} else {
|
|
9372
|
+
GGML_ABORT("TODO: support embd input");
|
|
9373
|
+
}
|
|
9374
|
+
res->add_input(std::move(inp));
|
|
9375
|
+
return inp_per_layer;
|
|
9376
|
+
}
|
|
9377
|
+
|
|
9378
|
+
// equivalent to project_per_layer_inputs() in python code
|
|
9379
|
+
// this calculates the per-layer inputs, so the final tensor shape will have n_layer as the last dim
|
|
9380
|
+
// output shape: [n_embd_altup, n_tokens, n_layer]
|
|
9381
|
+
ggml_tensor * project_per_layer_inputs(ggml_tensor * inputs_embeds, ggml_tensor * inp_per_layer) {
|
|
9382
|
+
const float per_layer_projection_scale = 1.0f / sqrtf((float)n_embd);
|
|
9383
|
+
const float per_layer_input_scale = 1.0f / sqrtf(2.0f);
|
|
9384
|
+
|
|
9385
|
+
ggml_tensor * per_layer_proj = ggml_mul_mat(ctx0, model.per_layer_model_proj, inputs_embeds);
|
|
9386
|
+
per_layer_proj = ggml_scale(ctx0, per_layer_proj, per_layer_projection_scale);
|
|
9387
|
+
per_layer_proj = ggml_reshape_3d(ctx0, per_layer_proj, n_embd_altup, n_layer, n_tokens);
|
|
9388
|
+
per_layer_proj = build_norm(per_layer_proj,
|
|
9389
|
+
model.per_layer_proj_norm, NULL,
|
|
9390
|
+
LLM_NORM_RMS, -1); // [n_embd_altup, n_layer, n_tokens]
|
|
9391
|
+
cb(per_layer_proj, "per_layer_proj", -1);
|
|
9392
|
+
|
|
9393
|
+
inp_per_layer = ggml_add(ctx0, inp_per_layer, per_layer_proj);
|
|
9394
|
+
inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
|
|
9395
|
+
cb(inp_per_layer, "inp_per_layer", -1);
|
|
9396
|
+
|
|
9397
|
+
// permute to shape: [n_embd_altup, n_tokens, n_layer]
|
|
9398
|
+
inp_per_layer = ggml_cont(ctx0, ggml_permute(ctx0, inp_per_layer, 0, 2, 1, 3));
|
|
9399
|
+
return inp_per_layer;
|
|
9400
|
+
}
|
|
9401
|
+
|
|
9402
|
+
// input cur shape: [n_embd, n_tokens]
|
|
9403
|
+
// output shape: [n_embd, n_tokens]
|
|
9404
|
+
ggml_tensor * laurel(ggml_tensor * cur, int il) {
|
|
9405
|
+
ggml_tensor * tmp = cur;
|
|
9406
|
+
tmp = build_lora_mm(model.layers[il].laurel_l, tmp);
|
|
9407
|
+
tmp = build_lora_mm(model.layers[il].laurel_r, tmp);
|
|
9408
|
+
tmp = build_norm(tmp, model.layers[il].laurel_post_norm, NULL, LLM_NORM_RMS, il);
|
|
9409
|
+
tmp = ggml_add(ctx0, tmp, cur);
|
|
9410
|
+
cb(tmp, "laurel_out", il);
|
|
9411
|
+
return tmp;
|
|
9412
|
+
}
|
|
9413
|
+
|
|
9414
|
+
// input x shape: [n_embd, n_tokens]
|
|
9415
|
+
// output shape: [n_embd, n_tokens]
|
|
9416
|
+
ggml_tensor * gaussian_topk(ggml_tensor * x) {
|
|
9417
|
+
ggml_tensor * mean = ggml_mean(ctx0, x);
|
|
9418
|
+
ggml_tensor * std = ggml_sqrt(ctx0, ggml_scale(ctx0,
|
|
9419
|
+
ggml_sum_rows(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x, mean))),
|
|
9420
|
+
1.0f / (float)(x->ne[0] - 1)
|
|
9421
|
+
));
|
|
9422
|
+
ggml_tensor * cutoff_x = ggml_add(ctx0, mean, ggml_scale(ctx0, std, f_sparsity_std_mul));
|
|
9423
|
+
return ggml_relu(ctx0, ggml_sub(ctx0, x, cutoff_x));
|
|
9424
|
+
}
|
|
9425
|
+
|
|
9426
|
+
//
|
|
9427
|
+
// altup functions
|
|
9428
|
+
//
|
|
9429
|
+
|
|
9430
|
+
// equivalent to compute_router_modalities() in python code
|
|
9431
|
+
// input x shape: [n_embd, n_tokens]
|
|
9432
|
+
// output shape: [n_altup, n_tokens]
|
|
9433
|
+
ggml_tensor * altup_compute_router_modalities(ggml_tensor * x, int il) {
|
|
9434
|
+
ggml_tensor * router_inputs = build_norm(x,
|
|
9435
|
+
model.layers[il].altup_router_norm, NULL,
|
|
9436
|
+
LLM_NORM_RMS, il);
|
|
9437
|
+
|
|
9438
|
+
// router_input_scale
|
|
9439
|
+
router_inputs = ggml_scale(ctx0, router_inputs, 1.0f / (float)n_embd);
|
|
9440
|
+
|
|
9441
|
+
ggml_tensor * output = ggml_mul_mat(ctx0, model.layers[il].altup_router, router_inputs);
|
|
9442
|
+
return ggml_tanh(ctx0, output); // [n_altup, n_tokens]
|
|
9443
|
+
}
|
|
9444
|
+
|
|
9445
|
+
// input cur shape: [n_embd, n_tokens, n_altup]
|
|
9446
|
+
// output shape: [n_embd, n_tokens, n_altup]
|
|
9447
|
+
ggml_tensor * altup_predict(ggml_tensor * cur, int il) {
|
|
9448
|
+
ggml_tensor * activated = view_2d_slice(cur, i_altup_act); // [n_embd, n_tokens]
|
|
9449
|
+
ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens]
|
|
9450
|
+
cb(modalities, "modalities", il);
|
|
9451
|
+
|
|
9452
|
+
ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_predict_coef, modalities);
|
|
9453
|
+
cb(all_coefs, "all_coefs", il);
|
|
9454
|
+
// first dim now has n_altup^2 elements; reshape it to 2D (so we end up with a 3D tensor)
|
|
9455
|
+
all_coefs = ggml_reshape_3d(ctx0, all_coefs, n_altup, n_altup, n_tokens);
|
|
9456
|
+
|
|
9457
|
+
// permute to [n_altup, n_embd, n_tokens]
|
|
9458
|
+
ggml_tensor * cur_permuted = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
|
|
9459
|
+
ggml_tensor * predictions = ggml_mul_mat(ctx0, cur_permuted, all_coefs); // [n_altup, n_embd, n_tokens]
|
|
9460
|
+
|
|
9461
|
+
// final shape must be the same as cur: [n_embd, n_tokens, n_altup]
|
|
9462
|
+
predictions = ggml_cont(ctx0, ggml_permute(ctx0, predictions, 0, 2, 1, 3));
|
|
9463
|
+
predictions = ggml_add(ctx0, predictions, cur);
|
|
9464
|
+
cb(predictions, "predictions", il);
|
|
9465
|
+
|
|
9466
|
+
return predictions;
|
|
9467
|
+
}
|
|
9468
|
+
|
|
9469
|
+
// input predictions shape: [n_embd, n_tokens, n_altup]
|
|
9470
|
+
// input activated shape: [n_embd, n_tokens]
|
|
9471
|
+
// output shape: [n_embd, n_tokens, n_altup]
|
|
9472
|
+
ggml_tensor * altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il) {
|
|
9473
|
+
ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens]
|
|
9474
|
+
cb(modalities, "modalities", il);
|
|
9475
|
+
|
|
9476
|
+
ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act);
|
|
9477
|
+
ggml_tensor * innovation = ggml_sub(ctx0, activated, active_prediction); // [n_embd, n_tokens]
|
|
9478
|
+
cb(innovation, "innovation", il);
|
|
9479
|
+
|
|
9480
|
+
ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens]
|
|
9481
|
+
all_coefs = ggml_add(ctx0, all_coefs, one);
|
|
9482
|
+
cb(all_coefs, "all_coefs", il);
|
|
9483
|
+
all_coefs = ggml_cont(ctx0, ggml_transpose(ctx0, all_coefs)); // [n_tokens, n_altup]
|
|
9484
|
+
all_coefs = ggml_reshape_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]
|
|
9485
|
+
|
|
9486
|
+
innovation = ggml_repeat_4d(ctx0, innovation, n_embd, n_tokens, n_altup, 1);
|
|
9487
|
+
ggml_tensor * corrected = ggml_mul(ctx0, innovation, all_coefs); // [n_embd, n_tokens, n_altup]
|
|
9488
|
+
corrected = ggml_add(ctx0, corrected, predictions); // [n_embd, n_tokens, n_altup]
|
|
9489
|
+
cb(corrected, "corrected", il);
|
|
9490
|
+
|
|
9491
|
+
return corrected;
|
|
9492
|
+
}
|
|
9493
|
+
};
|
|
9494
|
+
|
|
8980
9495
|
// TODO: move up next to build_starcoder
|
|
8981
9496
|
struct llm_build_starcoder2 : public llm_graph_context {
|
|
8982
9497
|
llm_build_starcoder2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
@@ -8995,6 +9510,8 @@ struct llm_build_starcoder2 : public llm_graph_context {
|
|
|
8995
9510
|
|
|
8996
9511
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
8997
9512
|
|
|
9513
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
9514
|
+
|
|
8998
9515
|
for (int il = 0; il < n_layer; ++il) {
|
|
8999
9516
|
ggml_tensor * inpSA = inpL;
|
|
9000
9517
|
|
|
@@ -9053,9 +9570,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
|
|
|
9053
9570
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9054
9571
|
}
|
|
9055
9572
|
|
|
9056
|
-
if (il == n_layer - 1) {
|
|
9057
|
-
// skip computing output for unused tokens
|
|
9058
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
9573
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
9059
9574
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
9060
9575
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
9061
9576
|
}
|
|
@@ -9118,6 +9633,8 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
9118
9633
|
|
|
9119
9634
|
auto * rs_inp = build_rs_inp();
|
|
9120
9635
|
|
|
9636
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
9637
|
+
|
|
9121
9638
|
for (int il = 0; il < n_layer; ++il) {
|
|
9122
9639
|
// norm
|
|
9123
9640
|
cur = build_norm(inpL,
|
|
@@ -9127,9 +9644,7 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
9127
9644
|
|
|
9128
9645
|
cur = build_mamba_layer(rs_inp, gf, cur, ubatch, il);
|
|
9129
9646
|
|
|
9130
|
-
if (il == n_layer - 1) {
|
|
9131
|
-
// skip computing output for unused tokens
|
|
9132
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
9647
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
9133
9648
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
9134
9649
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
9135
9650
|
}
|
|
@@ -9168,9 +9683,9 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
9168
9683
|
ggml_tensor * cur,
|
|
9169
9684
|
const llama_ubatch & ubatch,
|
|
9170
9685
|
int il) const {
|
|
9171
|
-
const auto *
|
|
9686
|
+
const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
|
|
9172
9687
|
|
|
9173
|
-
const auto kv_head =
|
|
9688
|
+
const auto kv_head = mctx_cur->get_head();
|
|
9174
9689
|
|
|
9175
9690
|
const int64_t d_conv = hparams.ssm_d_conv;
|
|
9176
9691
|
const int64_t d_inner = hparams.ssm_d_inner;
|
|
@@ -9188,8 +9703,8 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
9188
9703
|
GGML_ASSERT(ubatch.equal_seqs);
|
|
9189
9704
|
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
|
|
9190
9705
|
|
|
9191
|
-
ggml_tensor * conv_states_all =
|
|
9192
|
-
ggml_tensor * ssm_states_all =
|
|
9706
|
+
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
|
|
9707
|
+
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
|
|
9193
9708
|
|
|
9194
9709
|
// (ab)using the KV cache to store the states
|
|
9195
9710
|
ggml_tensor * conv = build_rs(
|
|
@@ -9311,13 +9826,15 @@ struct llm_build_command_r : public llm_graph_context {
|
|
|
9311
9826
|
|
|
9312
9827
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
9313
9828
|
|
|
9314
|
-
|
|
9829
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
9315
9830
|
|
|
9831
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
9316
9832
|
// norm
|
|
9317
9833
|
cur = build_norm(inpL,
|
|
9318
9834
|
model.layers[il].attn_norm, NULL,
|
|
9319
9835
|
LLM_NORM, il);
|
|
9320
9836
|
cb(cur, "attn_norm", il);
|
|
9837
|
+
|
|
9321
9838
|
ggml_tensor * ffn_inp = cur;
|
|
9322
9839
|
|
|
9323
9840
|
// self-attention
|
|
@@ -9385,9 +9902,7 @@ struct llm_build_command_r : public llm_graph_context {
|
|
|
9385
9902
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9386
9903
|
}
|
|
9387
9904
|
|
|
9388
|
-
if (il == n_layer - 1) {
|
|
9389
|
-
// skip computing output for unused tokens
|
|
9390
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
9905
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
9391
9906
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
9392
9907
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
9393
9908
|
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
|
|
@@ -9458,6 +9973,8 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
|
|
|
9458
9973
|
|
|
9459
9974
|
auto * inp_attn = build_attn_inp_kv_unified_iswa();
|
|
9460
9975
|
|
|
9976
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
9977
|
+
|
|
9461
9978
|
for (int il = 0; il < n_layer; ++il) {
|
|
9462
9979
|
const bool is_swa = hparams.is_swa(il);
|
|
9463
9980
|
|
|
@@ -9520,9 +10037,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
|
|
|
9520
10037
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9521
10038
|
}
|
|
9522
10039
|
|
|
9523
|
-
if (il == n_layer - 1) {
|
|
9524
|
-
// skip computing output for unused tokens
|
|
9525
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10040
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
9526
10041
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
9527
10042
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
9528
10043
|
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
|
|
@@ -9593,6 +10108,8 @@ struct llm_build_olmo : public llm_graph_context {
|
|
|
9593
10108
|
|
|
9594
10109
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
9595
10110
|
|
|
10111
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10112
|
+
|
|
9596
10113
|
for (int il = 0; il < n_layer; ++il) {
|
|
9597
10114
|
ggml_tensor * inpSA = inpL;
|
|
9598
10115
|
|
|
@@ -9651,9 +10168,7 @@ struct llm_build_olmo : public llm_graph_context {
|
|
|
9651
10168
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9652
10169
|
}
|
|
9653
10170
|
|
|
9654
|
-
if (il == n_layer - 1) {
|
|
9655
|
-
// skip computing output for unused tokens
|
|
9656
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10171
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
9657
10172
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
9658
10173
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
9659
10174
|
}
|
|
@@ -9721,6 +10236,8 @@ struct llm_build_olmo2 : public llm_graph_context {
|
|
|
9721
10236
|
|
|
9722
10237
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
9723
10238
|
|
|
10239
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10240
|
+
|
|
9724
10241
|
for (int il = 0; il < n_layer; ++il) {
|
|
9725
10242
|
ggml_tensor * inpSA = inpL;
|
|
9726
10243
|
|
|
@@ -9771,18 +10288,16 @@ struct llm_build_olmo2 : public llm_graph_context {
|
|
|
9771
10288
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9772
10289
|
}
|
|
9773
10290
|
|
|
10291
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
10292
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
10293
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
10294
|
+
}
|
|
10295
|
+
|
|
9774
10296
|
cur = build_norm(cur,
|
|
9775
10297
|
model.layers[il].attn_post_norm, NULL,
|
|
9776
10298
|
LLM_NORM_RMS, il);
|
|
9777
10299
|
cb(cur, "attn_post_norm", il);
|
|
9778
10300
|
|
|
9779
|
-
if (il == n_layer - 1) {
|
|
9780
|
-
// skip computing output for unused tokens
|
|
9781
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
9782
|
-
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
9783
|
-
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
9784
|
-
}
|
|
9785
|
-
|
|
9786
10301
|
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
9787
10302
|
cb(ffn_inp, "ffn_inp", il);
|
|
9788
10303
|
|
|
@@ -9850,6 +10365,8 @@ struct llm_build_olmoe : public llm_graph_context {
|
|
|
9850
10365
|
|
|
9851
10366
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
9852
10367
|
|
|
10368
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10369
|
+
|
|
9853
10370
|
for (int il = 0; il < n_layer; ++il) {
|
|
9854
10371
|
ggml_tensor * inpSA = inpL;
|
|
9855
10372
|
|
|
@@ -9904,9 +10421,7 @@ struct llm_build_olmoe : public llm_graph_context {
|
|
|
9904
10421
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9905
10422
|
}
|
|
9906
10423
|
|
|
9907
|
-
if (il == n_layer - 1) {
|
|
9908
|
-
// skip computing output for unused tokens
|
|
9909
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10424
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
9910
10425
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
9911
10426
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
9912
10427
|
}
|
|
@@ -9976,6 +10491,8 @@ struct llm_build_openelm : public llm_graph_context {
|
|
|
9976
10491
|
|
|
9977
10492
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
9978
10493
|
|
|
10494
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10495
|
+
|
|
9979
10496
|
for (int il = 0; il < n_layer; ++il) {
|
|
9980
10497
|
const int64_t n_head = hparams.n_head(il);
|
|
9981
10498
|
const int64_t n_head_kv = hparams.n_head_kv(il);
|
|
@@ -10037,11 +10554,9 @@ struct llm_build_openelm : public llm_graph_context {
|
|
|
10037
10554
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
10038
10555
|
}
|
|
10039
10556
|
|
|
10040
|
-
if (il == n_layer - 1) {
|
|
10041
|
-
// skip computing output for unused tokens
|
|
10042
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10557
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
10043
10558
|
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
|
|
10044
|
-
cur
|
|
10559
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
10045
10560
|
}
|
|
10046
10561
|
|
|
10047
10562
|
ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
|
|
@@ -10107,6 +10622,8 @@ struct llm_build_gptneox : public llm_graph_context {
|
|
|
10107
10622
|
|
|
10108
10623
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
10109
10624
|
|
|
10625
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10626
|
+
|
|
10110
10627
|
for (int il = 0; il < n_layer; ++il) {
|
|
10111
10628
|
cur = build_norm(inpL,
|
|
10112
10629
|
model.layers[il].attn_norm,
|
|
@@ -10151,9 +10668,7 @@ struct llm_build_gptneox : public llm_graph_context {
|
|
|
10151
10668
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
10152
10669
|
}
|
|
10153
10670
|
|
|
10154
|
-
if (il == n_layer - 1) {
|
|
10155
|
-
// skip computing output for unused tokens
|
|
10156
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10671
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
10157
10672
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
10158
10673
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
10159
10674
|
}
|
|
@@ -10255,6 +10770,8 @@ struct llm_build_arctic : public llm_graph_context {
|
|
|
10255
10770
|
|
|
10256
10771
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
10257
10772
|
|
|
10773
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10774
|
+
|
|
10258
10775
|
for (int il = 0; il < n_layer; ++il) {
|
|
10259
10776
|
ggml_tensor * inpSA = inpL;
|
|
10260
10777
|
|
|
@@ -10301,9 +10818,7 @@ struct llm_build_arctic : public llm_graph_context {
|
|
|
10301
10818
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
10302
10819
|
}
|
|
10303
10820
|
|
|
10304
|
-
if (il == n_layer - 1) {
|
|
10305
|
-
// skip computing output for unused tokens
|
|
10306
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10821
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
10307
10822
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
10308
10823
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
10309
10824
|
}
|
|
@@ -10395,6 +10910,8 @@ struct llm_build_deepseek : public llm_graph_context {
|
|
|
10395
10910
|
|
|
10396
10911
|
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
10397
10912
|
|
|
10913
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10914
|
+
|
|
10398
10915
|
for (int il = 0; il < n_layer; ++il) {
|
|
10399
10916
|
ggml_tensor * inpSA = inpL;
|
|
10400
10917
|
|
|
@@ -10456,14 +10973,11 @@ struct llm_build_deepseek : public llm_graph_context {
|
|
|
10456
10973
|
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
10457
10974
|
}
|
|
10458
10975
|
|
|
10459
|
-
if (il == n_layer - 1) {
|
|
10460
|
-
// skip computing output for unused tokens
|
|
10461
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10976
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
10462
10977
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
10463
10978
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
10464
10979
|
}
|
|
10465
10980
|
|
|
10466
|
-
|
|
10467
10981
|
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
10468
10982
|
cb(ffn_inp, "ffn_inp", il);
|
|
10469
10983
|
|
|
@@ -10571,6 +11085,8 @@ struct llm_build_deepseek2 : public llm_graph_context {
|
|
|
10571
11085
|
|
|
10572
11086
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
10573
11087
|
|
|
11088
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11089
|
+
|
|
10574
11090
|
for (int il = 0; il < n_layer; ++il) {
|
|
10575
11091
|
ggml_tensor * inpSA = inpL;
|
|
10576
11092
|
|
|
@@ -10720,9 +11236,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
|
|
|
10720
11236
|
}
|
|
10721
11237
|
}
|
|
10722
11238
|
|
|
10723
|
-
if (il == n_layer - 1) {
|
|
10724
|
-
// skip computing output for unused tokens
|
|
10725
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11239
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
10726
11240
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
10727
11241
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
10728
11242
|
}
|
|
@@ -10818,6 +11332,8 @@ struct llm_build_bitnet : public llm_graph_context {
|
|
|
10818
11332
|
|
|
10819
11333
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
10820
11334
|
|
|
11335
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11336
|
+
|
|
10821
11337
|
for (int il = 0; il < n_layer; ++il) {
|
|
10822
11338
|
ggml_tensor * inpSA = inpL;
|
|
10823
11339
|
|
|
@@ -10900,9 +11416,7 @@ struct llm_build_bitnet : public llm_graph_context {
|
|
|
10900
11416
|
cb(cur, "attn_o_out", il);
|
|
10901
11417
|
}
|
|
10902
11418
|
|
|
10903
|
-
if (il == n_layer - 1) {
|
|
10904
|
-
// skip computing output for unused tokens
|
|
10905
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11419
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
10906
11420
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
10907
11421
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
10908
11422
|
}
|
|
@@ -10977,6 +11491,8 @@ struct llm_build_t5_enc : public llm_graph_context {
|
|
|
10977
11491
|
|
|
10978
11492
|
auto * inp_attn = build_attn_inp_no_cache();
|
|
10979
11493
|
|
|
11494
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11495
|
+
|
|
10980
11496
|
for (int il = 0; il < n_layer; ++il) {
|
|
10981
11497
|
ggml_tensor * inpSA = inpL;
|
|
10982
11498
|
|
|
@@ -11010,9 +11526,7 @@ struct llm_build_t5_enc : public llm_graph_context {
|
|
|
11010
11526
|
cb(cur, "kqv_out", il);
|
|
11011
11527
|
}
|
|
11012
11528
|
|
|
11013
|
-
if (il == n_layer - 1) {
|
|
11014
|
-
// skip computing output for unused tokens
|
|
11015
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11529
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
11016
11530
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
11017
11531
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
11018
11532
|
}
|
|
@@ -11083,6 +11597,8 @@ struct llm_build_t5_dec : public llm_graph_context {
|
|
|
11083
11597
|
auto * inp_attn_self = build_attn_inp_kv_unified();
|
|
11084
11598
|
auto * inp_attn_cross = build_attn_inp_cross();
|
|
11085
11599
|
|
|
11600
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11601
|
+
|
|
11086
11602
|
for (int il = 0; il < n_layer; ++il) {
|
|
11087
11603
|
ggml_tensor * inpSA = inpL;
|
|
11088
11604
|
|
|
@@ -11174,11 +11690,8 @@ struct llm_build_t5_dec : public llm_graph_context {
|
|
|
11174
11690
|
//cb(cur, "kqv_out", il);
|
|
11175
11691
|
}
|
|
11176
11692
|
|
|
11177
|
-
if (il == n_layer - 1) {
|
|
11178
|
-
// skip computing output for unused tokens
|
|
11179
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11693
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
11180
11694
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
11181
|
-
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
11182
11695
|
inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
|
|
11183
11696
|
}
|
|
11184
11697
|
|
|
@@ -11248,6 +11761,8 @@ struct llm_build_jais : public llm_graph_context {
|
|
|
11248
11761
|
|
|
11249
11762
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
11250
11763
|
|
|
11764
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11765
|
+
|
|
11251
11766
|
for (int il = 0; il < n_layer; ++il) {
|
|
11252
11767
|
cur = build_norm(inpL,
|
|
11253
11768
|
model.layers[il].attn_norm,
|
|
@@ -11280,9 +11795,7 @@ struct llm_build_jais : public llm_graph_context {
|
|
|
11280
11795
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
|
|
11281
11796
|
}
|
|
11282
11797
|
|
|
11283
|
-
if (il == n_layer - 1) {
|
|
11284
|
-
// skip computing output for unused tokens
|
|
11285
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11798
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
11286
11799
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
11287
11800
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
11288
11801
|
}
|
|
@@ -11346,6 +11859,8 @@ struct llm_build_chatglm : public llm_graph_context {
|
|
|
11346
11859
|
|
|
11347
11860
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
11348
11861
|
|
|
11862
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11863
|
+
|
|
11349
11864
|
for (int il = 0; il < n_layer; ++il) {
|
|
11350
11865
|
ggml_tensor * inpSA = inpL;
|
|
11351
11866
|
|
|
@@ -11412,9 +11927,7 @@ struct llm_build_chatglm : public llm_graph_context {
|
|
|
11412
11927
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11413
11928
|
}
|
|
11414
11929
|
|
|
11415
|
-
if (il == n_layer - 1) {
|
|
11416
|
-
// skip computing output for unused tokens
|
|
11417
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11930
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
11418
11931
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
11419
11932
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
11420
11933
|
}
|
|
@@ -11479,6 +11992,8 @@ struct llm_build_glm4 : public llm_graph_context {
|
|
|
11479
11992
|
|
|
11480
11993
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
11481
11994
|
|
|
11995
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11996
|
+
|
|
11482
11997
|
for (int il = 0; il < n_layer; ++il) {
|
|
11483
11998
|
ggml_tensor * inpSA = inpL;
|
|
11484
11999
|
|
|
@@ -11545,9 +12060,7 @@ struct llm_build_glm4 : public llm_graph_context {
|
|
|
11545
12060
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11546
12061
|
}
|
|
11547
12062
|
|
|
11548
|
-
if (il == n_layer - 1) {
|
|
11549
|
-
// skip computing output for unused tokens
|
|
11550
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
12063
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
11551
12064
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
11552
12065
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
11553
12066
|
}
|
|
@@ -11630,6 +12143,8 @@ struct llm_build_nemotron : public llm_graph_context {
|
|
|
11630
12143
|
|
|
11631
12144
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
11632
12145
|
|
|
12146
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
12147
|
+
|
|
11633
12148
|
for (int il = 0; il < n_layer; ++il) {
|
|
11634
12149
|
ggml_tensor * inpSA = inpL;
|
|
11635
12150
|
|
|
@@ -11689,9 +12204,7 @@ struct llm_build_nemotron : public llm_graph_context {
|
|
|
11689
12204
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11690
12205
|
}
|
|
11691
12206
|
|
|
11692
|
-
if (il == n_layer - 1) {
|
|
11693
|
-
// skip computing output for unused tokens
|
|
11694
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
12207
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
11695
12208
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
11696
12209
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
11697
12210
|
}
|
|
@@ -11759,6 +12272,8 @@ struct llm_build_exaone : public llm_graph_context {
|
|
|
11759
12272
|
|
|
11760
12273
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
11761
12274
|
|
|
12275
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
12276
|
+
|
|
11762
12277
|
for (int il = 0; il < n_layer; ++il) {
|
|
11763
12278
|
ggml_tensor * inpSA = inpL;
|
|
11764
12279
|
|
|
@@ -11820,9 +12335,7 @@ struct llm_build_exaone : public llm_graph_context {
|
|
|
11820
12335
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11821
12336
|
}
|
|
11822
12337
|
|
|
11823
|
-
if (il == n_layer - 1) {
|
|
11824
|
-
// skip computing output for unused tokens
|
|
11825
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
12338
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
11826
12339
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
11827
12340
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
11828
12341
|
}
|
|
@@ -11915,7 +12428,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
|
|
|
11915
12428
|
ggml_tensor * x_prev,
|
|
11916
12429
|
const llama_ubatch & ubatch,
|
|
11917
12430
|
int il) const {
|
|
11918
|
-
const auto *
|
|
12431
|
+
const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
|
|
11919
12432
|
|
|
11920
12433
|
const auto n_tokens = ubatch.n_tokens;
|
|
11921
12434
|
const auto n_seqs = ubatch.n_seqs;
|
|
@@ -11925,7 +12438,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
|
|
|
11925
12438
|
const auto n_head = n_embd / head_size;
|
|
11926
12439
|
const auto n_head_kv = hparams.n_head_kv(il);
|
|
11927
12440
|
|
|
11928
|
-
const auto kv_head =
|
|
12441
|
+
const auto kv_head = mctx_cur->get_head();
|
|
11929
12442
|
|
|
11930
12443
|
const auto & layer = model.layers[il];
|
|
11931
12444
|
|
|
@@ -12037,7 +12550,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
|
|
|
12037
12550
|
}
|
|
12038
12551
|
|
|
12039
12552
|
ggml_tensor * wkv_state = build_rs(
|
|
12040
|
-
inp, gf,
|
|
12553
|
+
inp, gf, mctx_cur->get_s_l(il),
|
|
12041
12554
|
hparams.n_embd_s(), n_seqs);
|
|
12042
12555
|
|
|
12043
12556
|
ggml_tensor * wkv_output;
|
|
@@ -12056,9 +12569,9 @@ struct llm_build_rwkv6_base : public llm_graph_context {
|
|
|
12056
12569
|
wkv_state,
|
|
12057
12570
|
ggml_view_1d(
|
|
12058
12571
|
ctx0,
|
|
12059
|
-
|
|
12572
|
+
mctx_cur->get_s_l(il),
|
|
12060
12573
|
hparams.n_embd_s() * n_seqs,
|
|
12061
|
-
hparams.n_embd_s() * kv_head * ggml_element_size(
|
|
12574
|
+
hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il))
|
|
12062
12575
|
)
|
|
12063
12576
|
)
|
|
12064
12577
|
);
|
|
@@ -12098,6 +12611,8 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
|
|
|
12098
12611
|
const auto n_seq_tokens = ubatch.n_seq_tokens;
|
|
12099
12612
|
const auto n_seqs = ubatch.n_seqs;
|
|
12100
12613
|
|
|
12614
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
12615
|
+
|
|
12101
12616
|
for (int il = 0; il < n_layer; ++il) {
|
|
12102
12617
|
const llama_layer * layer = &model.layers[il];
|
|
12103
12618
|
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
|
|
@@ -12139,13 +12654,16 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
|
|
|
12139
12654
|
);
|
|
12140
12655
|
ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
|
|
12141
12656
|
|
|
12142
|
-
|
|
12143
|
-
|
|
12144
|
-
|
|
12145
|
-
|
|
12146
|
-
|
|
12147
|
-
|
|
12148
|
-
|
|
12657
|
+
ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
|
|
12658
|
+
ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
|
|
12659
|
+
x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
|
|
12660
|
+
cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
|
|
12661
|
+
|
|
12662
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
12663
|
+
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
|
|
12664
|
+
ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
|
|
12665
|
+
x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
|
|
12666
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
12149
12667
|
}
|
|
12150
12668
|
|
|
12151
12669
|
cur = build_rwkv6_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6);
|
|
@@ -12193,6 +12711,8 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
|
|
|
12193
12711
|
const auto n_seq_tokens = ubatch.n_seq_tokens;
|
|
12194
12712
|
const auto n_seqs = ubatch.n_seqs;
|
|
12195
12713
|
|
|
12714
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
12715
|
+
|
|
12196
12716
|
for (int il = 0; il < n_layer; ++il) {
|
|
12197
12717
|
const llama_layer * layer = &model.layers[il];
|
|
12198
12718
|
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
|
|
@@ -12217,11 +12737,12 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
|
|
|
12217
12737
|
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
|
12218
12738
|
cb(ffn_inp, "ffn_inp", il);
|
|
12219
12739
|
|
|
12220
|
-
|
|
12221
|
-
|
|
12222
|
-
|
|
12223
|
-
|
|
12224
|
-
|
|
12740
|
+
cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
|
|
12741
|
+
ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
|
|
12742
|
+
|
|
12743
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
12744
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
12745
|
+
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
|
|
12225
12746
|
}
|
|
12226
12747
|
|
|
12227
12748
|
// feed-forward network
|
|
@@ -12304,7 +12825,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
|
|
12304
12825
|
ggml_tensor *& first_layer_value,
|
|
12305
12826
|
const llama_ubatch & ubatch,
|
|
12306
12827
|
int il) const {
|
|
12307
|
-
const auto *
|
|
12828
|
+
const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
|
|
12308
12829
|
|
|
12309
12830
|
const auto n_tokens = ubatch.n_tokens;
|
|
12310
12831
|
const auto n_seqs = ubatch.n_seqs;
|
|
@@ -12313,7 +12834,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
|
|
12313
12834
|
const auto head_count = n_embd / head_size;
|
|
12314
12835
|
const auto n_seq_tokens = ubatch.n_seq_tokens;
|
|
12315
12836
|
|
|
12316
|
-
const auto kv_head =
|
|
12837
|
+
const auto kv_head = mctx_cur->get_head();
|
|
12317
12838
|
|
|
12318
12839
|
const auto & layer = model.layers[il];
|
|
12319
12840
|
|
|
@@ -12384,7 +12905,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
|
|
12384
12905
|
a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
|
|
12385
12906
|
|
|
12386
12907
|
ggml_tensor * wkv_state = build_rs(
|
|
12387
|
-
inp, gf,
|
|
12908
|
+
inp, gf, mctx_cur->get_s_l(il),
|
|
12388
12909
|
hparams.n_embd_s(), n_seqs);
|
|
12389
12910
|
|
|
12390
12911
|
ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
|
|
@@ -12398,9 +12919,9 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
|
|
12398
12919
|
wkv_state,
|
|
12399
12920
|
ggml_view_1d(
|
|
12400
12921
|
ctx0,
|
|
12401
|
-
|
|
12922
|
+
mctx_cur->get_s_l(il),
|
|
12402
12923
|
hparams.n_embd_s() * n_seqs,
|
|
12403
|
-
hparams.n_embd_s() * kv_head * ggml_element_size(
|
|
12924
|
+
hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il))
|
|
12404
12925
|
)
|
|
12405
12926
|
)
|
|
12406
12927
|
);
|
|
@@ -12447,6 +12968,8 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
|
|
|
12447
12968
|
const auto n_seq_tokens = ubatch.n_seq_tokens;
|
|
12448
12969
|
const auto n_seqs = ubatch.n_seqs;
|
|
12449
12970
|
|
|
12971
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
12972
|
+
|
|
12450
12973
|
for (int il = 0; il < n_layer; ++il) {
|
|
12451
12974
|
const llama_layer * layer = &model.layers[il];
|
|
12452
12975
|
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
|
|
@@ -12488,12 +13011,14 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
|
|
|
12488
13011
|
);
|
|
12489
13012
|
ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
|
|
12490
13013
|
|
|
12491
|
-
|
|
12492
|
-
|
|
12493
|
-
|
|
12494
|
-
|
|
12495
|
-
|
|
12496
|
-
|
|
13014
|
+
ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
|
|
13015
|
+
ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
|
|
13016
|
+
x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
|
|
13017
|
+
|
|
13018
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
13019
|
+
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
|
|
13020
|
+
ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
|
|
13021
|
+
x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
|
|
12497
13022
|
}
|
|
12498
13023
|
|
|
12499
13024
|
cur = build_rwkv7_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV7);
|
|
@@ -12538,6 +13063,8 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
|
|
|
12538
13063
|
const auto n_seq_tokens = ubatch.n_seq_tokens;
|
|
12539
13064
|
const auto n_seqs = ubatch.n_seqs;
|
|
12540
13065
|
|
|
13066
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13067
|
+
|
|
12541
13068
|
for (int il = 0; il < n_layer; ++il) {
|
|
12542
13069
|
const llama_layer * layer = &model.layers[il];
|
|
12543
13070
|
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
|
|
@@ -12562,11 +13089,12 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
|
|
|
12562
13089
|
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
|
12563
13090
|
cb(ffn_inp, "ffn_inp", il);
|
|
12564
13091
|
|
|
12565
|
-
|
|
12566
|
-
|
|
12567
|
-
|
|
12568
|
-
|
|
12569
|
-
|
|
13092
|
+
cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
|
|
13093
|
+
ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
|
|
13094
|
+
|
|
13095
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
13096
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
13097
|
+
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
|
|
12570
13098
|
}
|
|
12571
13099
|
|
|
12572
13100
|
// feed-forward network
|
|
@@ -12635,6 +13163,9 @@ struct llm_build_granite : public llm_graph_context {
|
|
|
12635
13163
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
12636
13164
|
|
|
12637
13165
|
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
13166
|
+
|
|
13167
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13168
|
+
|
|
12638
13169
|
for (int il = 0; il < n_layer; ++il) {
|
|
12639
13170
|
ggml_tensor * inpSA = inpL;
|
|
12640
13171
|
|
|
@@ -12697,9 +13228,7 @@ struct llm_build_granite : public llm_graph_context {
|
|
|
12697
13228
|
cb(cur, "attn_out", il);
|
|
12698
13229
|
}
|
|
12699
13230
|
|
|
12700
|
-
if (il == n_layer - 1) {
|
|
12701
|
-
// skip computing output for unused tokens
|
|
12702
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13231
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
12703
13232
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
12704
13233
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
12705
13234
|
}
|
|
@@ -12818,6 +13347,8 @@ struct llm_build_chameleon : public llm_graph_context {
|
|
|
12818
13347
|
|
|
12819
13348
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
12820
13349
|
|
|
13350
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13351
|
+
|
|
12821
13352
|
for (int il = 0; il < n_layer; ++il) {
|
|
12822
13353
|
ggml_tensor * inpSA = inpL;
|
|
12823
13354
|
|
|
@@ -12894,21 +13425,19 @@ struct llm_build_chameleon : public llm_graph_context {
|
|
|
12894
13425
|
cur = build_attn(inp_attn, gf,
|
|
12895
13426
|
model.layers[il].wo, nullptr,
|
|
12896
13427
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
12897
|
-
|
|
12898
|
-
if (hparams.swin_norm) {
|
|
12899
|
-
cur = build_norm(cur,
|
|
12900
|
-
model.layers[il].attn_norm, NULL,
|
|
12901
|
-
LLM_NORM_RMS, il);
|
|
12902
|
-
}
|
|
12903
13428
|
}
|
|
12904
13429
|
|
|
12905
|
-
if (il == n_layer - 1) {
|
|
12906
|
-
// skip computing output for unused tokens
|
|
12907
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13430
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
12908
13431
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
12909
13432
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
12910
13433
|
}
|
|
12911
13434
|
|
|
13435
|
+
if (hparams.swin_norm) {
|
|
13436
|
+
cur = build_norm(cur,
|
|
13437
|
+
model.layers[il].attn_norm, NULL,
|
|
13438
|
+
LLM_NORM_RMS, il);
|
|
13439
|
+
}
|
|
13440
|
+
|
|
12912
13441
|
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
12913
13442
|
cb(ffn_inp, "ffn_inp", il);
|
|
12914
13443
|
|
|
@@ -13149,6 +13678,8 @@ struct llm_build_plm : public llm_graph_context {
|
|
|
13149
13678
|
|
|
13150
13679
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
13151
13680
|
|
|
13681
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13682
|
+
|
|
13152
13683
|
for (int il = 0; il < n_layer; ++il) {
|
|
13153
13684
|
ggml_tensor * inpSA = inpL;
|
|
13154
13685
|
|
|
@@ -13252,9 +13783,7 @@ struct llm_build_plm : public llm_graph_context {
|
|
|
13252
13783
|
q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
|
|
13253
13784
|
}
|
|
13254
13785
|
|
|
13255
|
-
if (il == n_layer - 1) {
|
|
13256
|
-
// skip computing output for unused tokens
|
|
13257
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13786
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
13258
13787
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
13259
13788
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
13260
13789
|
}
|
|
@@ -13314,6 +13843,8 @@ struct llm_build_bailingmoe : public llm_graph_context {
|
|
|
13314
13843
|
|
|
13315
13844
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
13316
13845
|
|
|
13846
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13847
|
+
|
|
13317
13848
|
for (int il = 0; il < n_layer; ++il) {
|
|
13318
13849
|
ggml_tensor * inpSA = inpL;
|
|
13319
13850
|
|
|
@@ -13375,9 +13906,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
|
|
|
13375
13906
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
|
|
13376
13907
|
}
|
|
13377
13908
|
|
|
13378
|
-
if (il == n_layer - 1) {
|
|
13379
|
-
// skip computing output for unused tokens
|
|
13380
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13909
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
13381
13910
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
13382
13911
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
13383
13912
|
}
|
|
@@ -13463,6 +13992,8 @@ struct llm_build_dots1 : public llm_graph_context {
|
|
|
13463
13992
|
|
|
13464
13993
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
13465
13994
|
|
|
13995
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13996
|
+
|
|
13466
13997
|
for (int il = 0; il < n_layer; ++il) {
|
|
13467
13998
|
ggml_tensor * inpSA = inpL;
|
|
13468
13999
|
|
|
@@ -13515,9 +14046,7 @@ struct llm_build_dots1 : public llm_graph_context {
|
|
|
13515
14046
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
13516
14047
|
}
|
|
13517
14048
|
|
|
13518
|
-
if (il == n_layer - 1) {
|
|
13519
|
-
// skip computing output for unused tokens
|
|
13520
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
14049
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
13521
14050
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
13522
14051
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
13523
14052
|
}
|
|
@@ -13615,6 +14144,8 @@ struct llm_build_arcee : public llm_graph_context {
|
|
|
13615
14144
|
|
|
13616
14145
|
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
13617
14146
|
|
|
14147
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
14148
|
+
|
|
13618
14149
|
for (int il = 0; il < n_layer; ++il) {
|
|
13619
14150
|
ggml_tensor * inpSA = inpL;
|
|
13620
14151
|
|
|
@@ -13677,9 +14208,7 @@ struct llm_build_arcee : public llm_graph_context {
|
|
|
13677
14208
|
cb(cur, "attn_out", il);
|
|
13678
14209
|
}
|
|
13679
14210
|
|
|
13680
|
-
if (il == n_layer - 1) {
|
|
13681
|
-
// skip computing output for unused tokens
|
|
13682
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
14211
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
13683
14212
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
13684
14213
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
13685
14214
|
}
|
|
@@ -13957,6 +14486,10 @@ llm_graph_result_ptr llama_model::build_graph(
|
|
|
13957
14486
|
{
|
|
13958
14487
|
llm = std::make_unique<llm_build_gemma3_iswa>(*this, params, gf);
|
|
13959
14488
|
} break;
|
|
14489
|
+
case LLM_ARCH_GEMMA3N:
|
|
14490
|
+
{
|
|
14491
|
+
llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params, gf);
|
|
14492
|
+
} break;
|
|
13960
14493
|
case LLM_ARCH_STARCODER2:
|
|
13961
14494
|
{
|
|
13962
14495
|
llm = std::make_unique<llm_build_starcoder2>(*this, params, gf);
|
|
@@ -14278,6 +14811,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
|
14278
14811
|
case LLM_ARCH_GEMMA:
|
|
14279
14812
|
case LLM_ARCH_GEMMA2:
|
|
14280
14813
|
case LLM_ARCH_GEMMA3:
|
|
14814
|
+
case LLM_ARCH_GEMMA3N:
|
|
14281
14815
|
case LLM_ARCH_STARCODER2:
|
|
14282
14816
|
case LLM_ARCH_OPENELM:
|
|
14283
14817
|
case LLM_ARCH_GPTNEOX:
|
|
@@ -14360,7 +14894,7 @@ const char * llama_model_chat_template(const llama_model * model, const char * n
|
|
|
14360
14894
|
// do not extend this list unless absolutely necessary
|
|
14361
14895
|
// Mistral-Small-2503 does not have built-in chat template
|
|
14362
14896
|
llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
|
|
14363
|
-
if (pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
|
|
14897
|
+
if (!name && pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
|
|
14364
14898
|
return "mistral-v7-tekken";
|
|
14365
14899
|
}
|
|
14366
14900
|
|