cui-llama.rn 1.6.1 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +6 -0
- package/android/src/main/java/com/rnllama/LlamaContext.java +38 -5
- package/android/src/main/java/com/rnllama/RNLlama.java +139 -4
- package/android/src/main/jni.cpp +153 -14
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +24 -4
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +22 -2
- package/cpp/chat.cpp +128 -106
- package/cpp/chat.h +2 -0
- package/cpp/common.cpp +41 -76
- package/cpp/common.h +23 -19
- package/cpp/ggml-backend.cpp +9 -5
- package/cpp/ggml-backend.h +4 -4
- package/cpp/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
- package/cpp/ggml-cpu/ggml-cpu-quants.c +306 -6
- package/cpp/ggml-cpu/ggml-cpu.c +5 -13
- package/cpp/ggml-cpu/ggml-cpu.cpp +29 -16
- package/cpp/ggml-cpu/ops.cpp +107 -13
- package/cpp/ggml-cpu/vec.cpp +0 -6
- package/cpp/ggml-cpu/vec.h +16 -0
- package/cpp/ggml-llama-sim.metallib +0 -0
- package/cpp/ggml-llama.metallib +0 -0
- package/cpp/ggml-metal-impl.h +36 -11
- package/cpp/ggml-metal.m +321 -132
- package/cpp/ggml-opt.cpp +373 -190
- package/cpp/ggml-opt.h +49 -28
- package/cpp/ggml-quants.c +0 -6
- package/cpp/ggml.c +93 -38
- package/cpp/ggml.h +21 -7
- package/cpp/gguf.cpp +33 -33
- package/cpp/llama-adapter.cpp +6 -0
- package/cpp/llama-arch.cpp +3 -0
- package/cpp/llama-batch.cpp +3 -1
- package/cpp/llama-chat.cpp +8 -6
- package/cpp/llama-chat.h +1 -0
- package/cpp/llama-context.cpp +349 -135
- package/cpp/llama-context.h +30 -3
- package/cpp/llama-cparams.h +1 -0
- package/cpp/llama-graph.cpp +150 -234
- package/cpp/llama-graph.h +52 -7
- package/cpp/llama-hparams.cpp +17 -1
- package/cpp/llama-hparams.h +34 -5
- package/cpp/llama-kv-cache.cpp +662 -321
- package/cpp/llama-kv-cache.h +203 -93
- package/cpp/llama-memory.h +3 -2
- package/cpp/llama-model-loader.cpp +24 -15
- package/cpp/llama-model-saver.cpp +281 -0
- package/cpp/llama-model-saver.h +37 -0
- package/cpp/llama-model.cpp +536 -132
- package/cpp/llama-model.h +7 -1
- package/cpp/llama-sampling.cpp +18 -6
- package/cpp/llama-vocab.cpp +46 -8
- package/cpp/llama-vocab.h +6 -0
- package/cpp/llama.cpp +14 -0
- package/cpp/llama.h +72 -131
- package/cpp/minja/chat-template.hpp +9 -5
- package/cpp/minja/minja.hpp +69 -36
- package/cpp/rn-llama.cpp +611 -47
- package/cpp/rn-llama.h +33 -3
- package/cpp/sampling.cpp +57 -50
- package/cpp/tools/mtmd/clip-impl.h +462 -0
- package/cpp/tools/mtmd/clip.cpp +4024 -0
- package/cpp/tools/mtmd/clip.h +101 -0
- package/cpp/tools/mtmd/miniaudio.h +93468 -0
- package/cpp/tools/mtmd/mtmd-audio.cpp +855 -0
- package/cpp/tools/mtmd/mtmd-audio.h +62 -0
- package/cpp/tools/mtmd/mtmd-helper.cpp +297 -0
- package/cpp/tools/mtmd/mtmd.cpp +942 -0
- package/cpp/tools/mtmd/mtmd.h +362 -0
- package/cpp/tools/mtmd/stb_image.h +7988 -0
- package/ios/CMakeLists.txt +7 -0
- package/ios/RNLlama.mm +77 -3
- package/ios/RNLlamaContext.h +5 -1
- package/ios/RNLlamaContext.mm +105 -10
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +23 -19
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +21 -7
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +30 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +52 -7
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +34 -5
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +203 -93
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +3 -2
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +7 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +72 -131
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +33 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Info.plist +0 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +23 -19
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +21 -7
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +30 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +52 -7
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +34 -5
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +203 -93
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +3 -2
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +7 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +72 -131
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +33 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +23 -19
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +21 -7
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +30 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +52 -7
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +34 -5
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +203 -93
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +3 -2
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +7 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +72 -131
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +33 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Info.plist +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +23 -19
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +21 -7
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +30 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +52 -7
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +34 -5
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +203 -93
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +3 -2
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +7 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +72 -131
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +33 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/jest/mock.js +33 -7
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/commonjs/index.js +153 -21
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/module/index.js +152 -20
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +50 -4
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +72 -6
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +67 -4
- package/src/index.ts +212 -38
- package/lib/commonjs/chat.js +0 -37
- package/lib/commonjs/chat.js.map +0 -1
- package/lib/module/chat.js +0 -33
- package/lib/module/chat.js.map +0 -1
- package/lib/typescript/chat.d.ts +0 -10
- package/lib/typescript/chat.d.ts.map +0 -1
- package/src/chat.ts +0 -44
package/cpp/llama-model.cpp
CHANGED
@@ -80,6 +80,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_236B: return "236B";
         case LLM_TYPE_290B: return "290B";
         case LLM_TYPE_314B: return "314B";
+        case LLM_TYPE_405B: return "405B";
         case LLM_TYPE_671B: return "671B";
         case LLM_TYPE_SMALL: return "0.1B";
         case LLM_TYPE_MEDIUM: return "0.4B";
@@ -116,6 +117,10 @@ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_
     { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
 };
 
+std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
+    return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
+}
+
 static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
     for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
         if (kv.second == name) {
@@ -298,6 +303,10 @@ static buft_list_t make_cpu_buft_list(const std::vector<lm_ggml_backend_dev_t> &
     // add extra buffer types, only if no GPU device is present
     // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
     auto * cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (cpu_dev == nullptr) {
+        throw std::runtime_error(format("%s: no CPU backend found", __func__));
+    }
+
     auto * cpu_reg = lm_ggml_backend_dev_backend_reg(cpu_dev);
     auto lm_ggml_backend_dev_get_extra_bufts_fn = (lm_ggml_backend_dev_get_extra_bufts_t)
         lm_ggml_backend_reg_get_proc_address(cpu_reg, "lm_ggml_backend_dev_get_extra_bufts");
@@ -454,11 +463,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         LM_GGML_ASSERT(hparams.n_expert_used == 0);
     }
 
-    // zero-out the array hparams
     std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
     std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
     std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
 
+    std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
+
+    std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
+
     ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
     ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
 
@@ -562,9 +574,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                 ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
-
-                hparams.
-                hparams.n_swa
+
+                hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
+                hparams.n_swa = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
+                hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
 
                 switch (hparams.n_expert) {
                     case 16: type = LLM_TYPE_17B_16E; break;
@@ -582,6 +595,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 switch (hparams.n_layer) {
                     case 32: type = LLM_TYPE_7B; break;
                     case 80: type = LLM_TYPE_70B; break;
+                    case 162: type = LLM_TYPE_405B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -842,22 +856,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
 
-
-
-
-
-
-
-                //
-                hparams.
-
-
-
-                hparams.n_swa = 131072;
-                }
-                bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
-                if (!found_swa && hparams.n_swa == 0) {
-                    throw std::runtime_error("invalid value for sliding_window");
+                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+
+                if (found_swa && hparams.n_swa > 0) {
+                    LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
+                            __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
+
+                    // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+
+                    hparams.n_swa = 0;
+                    hparams.set_swa_pattern(1);
                 }
             } break;
         case LLM_ARCH_PHIMOE:
@@ -927,8 +936,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_GEMMA2:
             {
+                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                 hparams.n_swa = 4096; // default value of gemma 2
-                hparams.
+                hparams.set_swa_pattern(2);
                 hparams.attn_soft_cap = true;
 
                 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
@@ -945,7 +955,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_GEMMA3:
             {
-                hparams.
+                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                hparams.set_swa_pattern(6);
 
                 hparams.rope_freq_base_train_swa = 10000.0f;
                 hparams.rope_freq_scale_train_swa = 1.0f;
@@ -1029,7 +1040,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_COHERE2:
             {
-                hparams.
+                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                hparams.set_swa_pattern(4);
 
                 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                 ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
@@ -1379,6 +1391,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     // Add additional layer/vocab/etc checks here for other model sizes
                     default: type = LLM_TYPE_UNKNOWN;
                 }
+
+                // For Granite MoE Shared
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
             } break;
         case LLM_ARCH_CHAMELEON:
             {
@@ -1482,6 +1497,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     }
 
     lm_ggml_backend_dev_t cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (cpu_dev == nullptr) {
+        throw std::runtime_error(format("%s: no CPU backend found", __func__));
+    }
     const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
     const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
@@ -1649,8 +1667,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
             std::regex pattern(overrides->pattern);
             if (std::regex_search(tensor_name, pattern)) {
-                LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), lm_ggml_backend_buft_name(overrides->buft));
                 buft = overrides->buft;
+                LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
+                        tensor_name.c_str(),
+                        lm_ggml_nbytes(t_meta) / 1024 / 1024, lm_ggml_type_name(t_meta->type),
+                        lm_ggml_backend_buft_name(buft));
                 break;
             }
         }
@@ -1667,6 +1688,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         auto * buft_dev = lm_ggml_backend_buft_get_device(buft);
         if (ml.use_mmap && buft_dev && buft == lm_ggml_backend_dev_host_buffer_type(buft_dev)) {
             auto * cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
+            if (!cpu_dev) {
+                throw std::runtime_error("no CPU backend found");
+            }
             buft = lm_ggml_backend_dev_buffer_type(cpu_dev);
         }
 
@@ -1753,6 +1777,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
                     layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
                     layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+
+                    // For Granite MoE Shared
+                    if (hparams.n_ff_shexp > 0) {
+                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                        layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+                    }
                 }
             }
         } break;
@@ -1848,7 +1879,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
 
-
+                if (n_ff > 0) {
+                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                }
 
                 if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
                     layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
@@ -1858,9 +1891,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                 }
 
-
-
-
+                if (n_ff > 0) {
+                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                }
 
                 // optional MLP bias
                 layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
@@ -3504,7 +3539,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                 // output
                 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                // if output is NULL, init from the input tok embed
+                if (output == NULL) {
+                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                }
 
                 for (int i = 0; i < n_layer; ++i) {
                     auto & layer = layers[i];
@@ -4109,6 +4148,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         if (!dev) {
             // FIXME: workaround for CPU backend buft having a NULL device
             dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
+            if (!dev) {
+                throw std::runtime_error(format("%s: no CPU backend found", __func__));
+            }
         }
         lm_ggml_backend_dev_props props;
         lm_ggml_backend_dev_get_props(dev, &props);
@@ -4238,7 +4280,7 @@ uint64_t llama_model::n_elements() const {
 }
 
 void llama_model::print_info() const {
-    const
+    const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
 
     auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
         bool is_var = false;
@@ -4281,7 +4323,7 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
         LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
         LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
-        LLAMA_LOG_INFO("%s:
+        LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
         LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
         LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
         LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
@@ -4299,7 +4341,7 @@ void llama_model::print_info() const {
     LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
     LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
     LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
-    LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
+    LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
    LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
     LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
@@ -4355,10 +4397,13 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
     }
 
-    if (arch == LLM_ARCH_MINICPM ||
+    if (arch == LLM_ARCH_MINICPM ||
+        arch == LLM_ARCH_GRANITE ||
+        arch == LLM_ARCH_GRANITE_MOE) {
         LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
         LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
         LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
+        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
 
     if (arch == LLM_ARCH_BAILINGMOE) {
@@ -4446,7 +4491,17 @@ const lm_ggml_tensor * llama_model::get_tensor(const char * name) const {
     return it->second;
 }
 
-
+float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
+    return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
+}
+
+float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
+    return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
+}
+
+lm_ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
+    const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
+
     // choose long/short freq factors based on the context size
     if (layers[il].rope_freqs != nullptr) {
         return layers[il].rope_freqs;
@@ -4474,21 +4529,174 @@ struct llm_build_llama : public llm_graph_context {
         // inp_pos - contains the positions
         lm_ggml_tensor * inp_pos = build_inp_pos();
 
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+        for (int il = 0; il < n_layer; ++il) {
+            lm_ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+                // compute Q and K and RoPE them
+                lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                lm_ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                lm_ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = lm_ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = lm_ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = lm_ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network (non-MoE)
+            if (model.layers[il].ffn_gate_inp == nullptr) {
+
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+            } else {
+                // MoE branch
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_moe_ffn(cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        nullptr,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, true,
+                        false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                        il);
+                cb(cur, "ffn_moe_out", il);
+            }
+
+            cur = lm_ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        lm_ggml_build_forward_expand(gf, cur);
+    }
+};
+
+struct llm_build_llama_iswa : public llm_graph_context {
+    llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        lm_ggml_tensor * cur;
+        lm_ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        lm_ggml_tensor * inp_pos = build_inp_pos();
+
         // temperature tuning
         lm_ggml_tensor * inp_attn_scale = nullptr;
-
-            inp_attn_scale = build_inp_attn_scale();
-        }
+        inp_attn_scale = build_inp_attn_scale();
 
-        auto * inp_attn =
+        auto * inp_attn = build_attn_inp_kv_unified_iswa();
 
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
         for (int il = 0; il < n_layer; ++il) {
             lm_ggml_tensor * inpSA = inpL;
 
-            bool use_rope =
-                ? (il + 1) % hparams.n_no_rope_layer_step != 0
-                : true;
+            const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
 
             // norm
             cur = build_norm(inpL,
@@ -4499,7 +4707,7 @@ struct llm_build_llama : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                lm_ggml_tensor * rope_factors = model.get_rope_factors(
+                lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
                 // compute Q and K and RoPE them
                 lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4547,7 +4755,7 @@ struct llm_build_llama : public llm_graph_context {
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);
 
-                if (
+                if (use_rope && hparams.use_kq_norm) {
                     // Llama4TextL2Norm
                     Qcur = lm_ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
                     Kcur = lm_ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
@@ -4568,17 +4776,11 @@ struct llm_build_llama : public llm_graph_context {
                 inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
-            }
-
             lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
             // feed-forward network (non-MoE)
             if (model.layers[il].ffn_gate_inp == nullptr) {
-
                 cur = build_norm(ffn_inp,
                         model.layers[il].ffn_norm, NULL,
                         LLM_NORM_RMS, il);
@@ -4591,9 +4793,7 @@ struct llm_build_llama : public llm_graph_context {
                         NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, il);
                 cb(cur, "ffn_out", il);
-
-            } else if (arch == LLM_ARCH_LLAMA4) {
-                // llama4 MoE
+            } else {
                 lm_ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
                         model.layers[il].ffn_norm, NULL,
                         LLM_NORM_RMS, il);
@@ -4622,31 +4822,6 @@ struct llm_build_llama : public llm_graph_context {
 
                 cur = lm_ggml_add(ctx0, moe_out, shexp_out);
                 cb(cur, "ffn_moe_out_merged", il);
-
-            } else {
-                // MoE branch
-                cur = build_norm(ffn_inp,
-                        model.layers[il].ffn_norm, NULL,
-                        LLM_NORM_RMS, il);
-                cb(cur, "ffn_norm", il);
-
-                cur = build_moe_ffn(cur,
-                        model.layers[il].ffn_gate_inp,
-                        model.layers[il].ffn_up_exps,
-                        model.layers[il].ffn_gate_exps,
-                        model.layers[il].ffn_down_exps,
-                        nullptr,
-                        n_expert, n_expert_used,
-                        LLM_FFN_SILU, true,
-                        false, 0.0,
-                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
-                        il);
-                cb(cur, "ffn_moe_out", il);
-            }
-
-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
             }
 
             cur = lm_ggml_add(ctx0, cur, ffn_inp);
@@ -4671,11 +4846,6 @@ struct llm_build_llama : public llm_graph_context {
         // lm_head
         cur = build_lora_mm(model.output, cur);
 
-        // For Granite architecture
-        if (hparams.f_logit_scale) {
-            cur = lm_ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
-        }
-
         cb(cur, "result_output", -1);
         res->t_logits = cur;
 
@@ -4705,6 +4875,7 @@ struct llm_build_deci : public llm_graph_context {
             lm_ggml_tensor * inpSA = inpL;
             const int64_t n_head_kv = hparams.n_head_kv(il);
             const int64_t n_head = hparams.n_head(il);
+            const int64_t n_ff = hparams.n_ff(il);
 
             if (n_head == 0) {
                 // attention-free layer of Llama-3_1-Nemotron-51B
@@ -4724,7 +4895,7 @@ struct llm_build_deci : public llm_graph_context {
             } else if (n_head > 0) {
                 // self-attention
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                lm_ggml_tensor * rope_factors = model.get_rope_factors(
+                lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
                 // compute Q and K and RoPE them
                 lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4780,9 +4951,9 @@ struct llm_build_deci : public llm_graph_context {
                 inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
-            //
-            if (
-
+            // FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
+            if (n_ff == 0) {
+                continue;
             }
 
             // modified to support attention-free layer of Llama-3_1-Nemotron-51B
@@ -4808,11 +4979,6 @@ struct llm_build_deci : public llm_graph_context {
                 cb(cur, "ffn_out", il);
             }
 
-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
-            }
-
             cur = lm_ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
 
@@ -4835,11 +5001,6 @@ struct llm_build_deci : public llm_graph_context {
         // lm_head
         cur = build_lora_mm(model.output, cur);
 
-        // For Granite architecture
-        if (hparams.f_logit_scale) {
-            cur = lm_ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
-        }
-
         cb(cur, "result_output", -1);
         res->t_logits = cur;
 
@@ -7183,6 +7344,7 @@ struct llm_build_phi2 : public llm_graph_context {
     }
 };
 
+template<bool iswa>
 struct llm_build_phi3 : public llm_graph_context {
     llm_build_phi3(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -7198,7 +7360,14 @@ struct llm_build_phi3 : public llm_graph_context {
         // inp_pos - contains the positions
         lm_ggml_tensor * inp_pos = build_inp_pos();
 
-
+        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
+        inp_attn_type * inp_attn = nullptr;
+
+        if constexpr (iswa) {
+            inp_attn = build_attn_inp_kv_unified_iswa();
+        } else {
+            inp_attn = build_attn_inp_kv_unified();
+        }
 
         for (int il = 0; il < n_layer; ++il) {
             auto * residual = inpL;
@@ -7206,7 +7375,7 @@ struct llm_build_phi3 : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for 128k context
-                lm_ggml_tensor * rope_factors = model.get_rope_factors(
+                lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
                 lm_ggml_tensor* attn_norm_output = build_norm(inpL,
                         model.layers[il].attn_norm,
@@ -7958,7 +8127,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
         for (int il = 0; il < n_layer; ++il) {
             lm_ggml_tensor * inpSA = inpL;
 
-            lm_ggml_tensor * rope_factors = model.get_rope_factors(
+            lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
             // norm
             cur = build_norm(inpL,
@@ -8258,8 +8427,8 @@ struct llm_build_gemma : public llm_graph_context {
    }
 };
 
-struct
-
+struct llm_build_gemma2_iswa : public llm_graph_context {
+    llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_k;
 
         lm_ggml_tensor * cur;
@@ -8273,7 +8442,7 @@ struct llm_build_gemma2 : public llm_graph_context {
         // inp_pos - contains the positions
         lm_ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn =
+        auto * inp_attn = build_attn_inp_kv_unified_iswa();
 
         for (int il = 0; il < n_layer; ++il) {
             // norm
@@ -8395,8 +8564,8 @@ struct llm_build_gemma2 : public llm_graph_context {
    }
 };
 
-struct
-
+struct llm_build_gemma3_iswa : public llm_graph_context {
+    llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_k;
 
         lm_ggml_tensor * cur;
@@ -8414,13 +8583,11 @@ struct llm_build_gemma3 : public llm_graph_context {
         lm_ggml_tensor * inp_pos = build_inp_pos();
 
         // TODO: is causal == true correct? might need some changes
-        auto * inp_attn =
+        auto * inp_attn = build_attn_inp_kv_unified_iswa();
 
         for (int il = 0; il < n_layer; ++il) {
-            const
-
-            const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
-            const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
+            const float freq_base_l = model.get_rope_freq_base (cparams, il);
+            const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
 
             // norm
             cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
@@ -8997,8 +9164,8 @@ struct llm_build_command_r : public llm_graph_context {
    }
 };
 
-struct
-
+struct llm_build_cohere2_iswa : public llm_graph_context {
+    llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;
 
         LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -9013,7 +9180,7 @@ struct llm_build_cohere2 : public llm_graph_context {
         // inp_pos - contains the positions
         lm_ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn =
+        auto * inp_attn = build_attn_inp_kv_unified_iswa();
 
         for (int il = 0; il < n_layer; ++il) {
             const bool is_swa = hparams.is_swa(il);
@@ -9026,7 +9193,7 @@ struct llm_build_cohere2 : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for 128k context
-                lm_ggml_tensor * rope_factors = model.get_rope_factors(
+                lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
                 // compute Q and K and RoPE them
                 lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -9964,7 +10131,7 @@ struct llm_build_deepseek : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                lm_ggml_tensor * rope_factors = model.get_rope_factors(
+                lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
                 // compute Q and K and RoPE them
                 lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -11328,7 +11495,7 @@ struct llm_build_exaone : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                lm_ggml_tensor * rope_factors = model.get_rope_factors(
+                lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
                 // compute Q and K and RoPE them
                 lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -12178,6 +12345,194 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
     }
 };
 
+
+struct llm_build_granite : public llm_graph_context {
+    llm_build_granite(
+        const llama_model & model,
+        const llm_graph_params & params,
+        lm_ggml_cgraph * gf,
+        const bool use_rope = true)
+        : llm_graph_context(params) {
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        lm_ggml_tensor * cur;
+        lm_ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - built only if rope enabled
+        lm_ggml_tensor * inp_pos = nullptr;
+        if (use_rope) {
+            inp_pos = build_inp_pos();
+        }
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+        for (int il = 0; il < n_layer; ++il) {
+            lm_ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and (optionally) RoPE them
+                lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                lm_ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                lm_ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = lm_ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                if (use_rope) {
+                    lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+                    Qcur = lm_ggml_rope_ext(
+                            ctx0, Qcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+
+                    Kcur = lm_ggml_rope_ext(
+                            ctx0, Kcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+                }
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // For Granite architectures - scale residual
+            cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network (non-MoE)
+            if (model.layers[il].ffn_gate_inp == nullptr) {
+
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+
+            } else {
+                // MoE branch
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                lm_ggml_tensor * moe_out = build_moe_ffn(cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        nullptr,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, true,
+                        false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                        il);
+                cb(moe_out, "ffn_moe_out", il);
+
+                // For Granite MoE Shared
+                if (hparams.n_ff_shexp > 0) {
+                    lm_ggml_tensor * ffn_shexp = build_ffn(cur,
+                            model.layers[il].ffn_up_shexp, NULL, NULL,
+                            model.layers[il].ffn_gate_shexp, NULL, NULL,
+                            model.layers[il].ffn_down_shexp, NULL, NULL,
+                            NULL,
+                            LLM_FFN_SILU, LLM_FFN_PAR, il);
+                    cb(ffn_shexp, "ffn_shexp", il);
+
+                    cur = lm_ggml_add(ctx0, moe_out, ffn_shexp);
+                    cb(cur, "ffn_out", il);
+                } else {
+                    cur = moe_out;
+                }
+            }
+
+            // For Granite architectures - scale residual
+            cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            cur = lm_ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        // For Granite architectures - scale logits
+        cur = lm_ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        lm_ggml_build_forward_expand(gf, cur);
+    }
+};
+
 // ref: https://github.com/facebookresearch/chameleon
 // based on the original build_llama() function, changes:
 // * qk-norm
@@ -12709,7 +13064,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                lm_ggml_tensor * rope_factors = model.get_rope_factors(
+                lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
                 // compute Q and K and RoPE them
                 lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -12833,6 +13188,14 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
     llama_memory_i * res;
 
     switch (arch) {
+        case LLM_ARCH_BERT:
+        case LLM_ARCH_JINA_BERT_V2:
+        case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
+        case LLM_ARCH_WAVTOKENIZER_DEC:
+            {
+                res = nullptr;
+            } break;
         case LLM_ARCH_MAMBA:
         case LLM_ARCH_RWKV6:
         case LLM_ARCH_RWKV6QWEN2:
@@ -12844,7 +13207,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                         LM_GGML_TYPE_F32,
                         LM_GGML_TYPE_F32,
                         cparams.offload_kqv,
-                        std::max((uint32_t) 1, cparams.n_seq_max)
+                        std::max((uint32_t) 1, cparams.n_seq_max),
+                        cparams.n_seq_max);
             } break;
         default:
             {
@@ -12854,14 +13218,36 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
 
                 LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
 
-
-
-
-
-
-
-
-
+                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+                    LM_GGML_ASSERT(hparams.is_swa_any());
+
+                    res = new llama_kv_cache_unified_iswa(
+                            *this,
+                            params.type_k,
+                            params.type_v,
+                            !cparams.flash_attn,
+                            cparams.offload_kqv,
+                            params.swa_full,
+                            cparams.n_ctx,
+                            cparams.n_seq_max,
+                            cparams.n_batch,
+                            padding);
+                } else {
+                    LM_GGML_ASSERT(!hparams.is_swa_any());
+
+                    res = new llama_kv_cache_unified(
+                            *this,
+                            nullptr,
+                            params.type_k,
+                            params.type_v,
+                            !cparams.flash_attn,
+                            cparams.offload_kqv,
+                            cparams.n_ctx,
+                            cparams.n_seq_max,
+                            padding,
+                            hparams.n_swa,
+                            hparams.swa_type);
+                }
             }
     }
 
@@ -12876,13 +13262,14 @@ llm_graph_result_ptr llama_model::build_graph(
 
     switch (arch) {
         case LLM_ARCH_LLAMA:
-        case LLM_ARCH_LLAMA4:
         case LLM_ARCH_MINICPM:
-        case LLM_ARCH_GRANITE:
-        case LLM_ARCH_GRANITE_MOE:
             {
                 llm = std::make_unique<llm_build_llama>(*this, params, gf);
             } break;
+        case LLM_ARCH_LLAMA4:
+            {
+                llm = std::make_unique<llm_build_llama_iswa>(*this, params, gf);
+            } break;
         case LLM_ARCH_DECI:
             {
                 llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -12957,7 +13344,11 @@ llm_graph_result_ptr llama_model::build_graph(
         case LLM_ARCH_PHI3:
         case LLM_ARCH_PHIMOE:
             {
-
+                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+                    llm = std::make_unique<llm_build_phi3<true>> (*this, params, gf);
+                } else {
+                    llm = std::make_unique<llm_build_phi3<false>>(*this, params, gf);
+                }
             } break;
         case LLM_ARCH_PLAMO:
             {
@@ -12989,11 +13380,11 @@ llm_graph_result_ptr llama_model::build_graph(
             } break;
         case LLM_ARCH_GEMMA2:
             {
-                llm = std::make_unique<
+                llm = std::make_unique<llm_build_gemma2_iswa>(*this, params, gf);
             } break;
         case LLM_ARCH_GEMMA3:
             {
-                llm = std::make_unique<
+                llm = std::make_unique<llm_build_gemma3_iswa>(*this, params, gf);
             } break;
         case LLM_ARCH_STARCODER2:
             {
@@ -13013,7 +13404,7 @@ llm_graph_result_ptr llama_model::build_graph(
             } break;
         case LLM_ARCH_COHERE2:
             {
-                llm = std::make_unique<
+                llm = std::make_unique<llm_build_cohere2_iswa>(*this, params, gf);
             } break;
         case LLM_ARCH_DBRX:
             {
@@ -13110,6 +13501,11 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_arwkv7>(*this, params, gf);
             } break;
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
+            {
+                llm = std::make_unique<llm_build_granite>(*this, params, gf);
+            } break;
         case LLM_ARCH_CHAMELEON:
             {
                 llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
@@ -13361,6 +13757,14 @@ const char * llama_model_chat_template(const llama_model * model, const char * n
         : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
     const auto & it = model->lm_gguf_kv.find(key);
     if (it == model->lm_gguf_kv.end()) {
+        // one-off fix for very popular models (so we are not flooded with issues)
+        // do not extend this list unless absolutely necessary
+        // Mistral-Small-2503 does not have built-in chat template
+        llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
+        if (pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
+            return "mistral-v7-tekken";
+        }
+
         return nullptr;
     }
 