@fugood/llama.node 0.3.17 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +39 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +366 -19
- package/src/LlamaCompletionWorker.h +30 -10
- package/src/LlamaContext.cpp +213 -5
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +133 -24
- package/src/llama.cpp/.github/workflows/build.yml +41 -762
- package/src/llama.cpp/.github/workflows/docker.yml +5 -2
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +12 -12
- package/src/llama.cpp/CMakeLists.txt +5 -17
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +31 -3
- package/src/llama.cpp/common/arg.cpp +48 -29
- package/src/llama.cpp/common/chat.cpp +128 -106
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +37 -1
- package/src/llama.cpp/common/common.h +18 -9
- package/src/llama.cpp/common/llguidance.cpp +1 -0
- package/src/llama.cpp/common/minja/chat-template.hpp +9 -5
- package/src/llama.cpp/common/minja/minja.hpp +69 -36
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +57 -50
- package/src/llama.cpp/examples/CMakeLists.txt +2 -23
- package/src/llama.cpp/examples/embedding/embedding.cpp +2 -11
- package/src/llama.cpp/examples/parallel/parallel.cpp +86 -14
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +27 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml.h +10 -7
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +20 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +306 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +29 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +501 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +0 -6
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +36 -11
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +41 -27
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +121 -232
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +7 -15
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +338 -166
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -70
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +657 -193
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +20 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +123 -29
- package/src/llama.cpp/ggml/src/ggml.c +29 -20
- package/src/llama.cpp/ggml/src/gguf.cpp +33 -33
- package/src/llama.cpp/include/llama.h +52 -11
- package/src/llama.cpp/requirements/requirements-all.txt +3 -3
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-adapter.cpp +6 -0
- package/src/llama.cpp/src/llama-arch.cpp +3 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +17 -7
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +389 -501
- package/src/llama.cpp/src/llama-context.h +44 -32
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +20 -38
- package/src/llama.cpp/src/llama-graph.h +12 -8
- package/src/llama.cpp/src/llama-kv-cache.cpp +1503 -389
- package/src/llama.cpp/src/llama-kv-cache.h +271 -85
- package/src/llama.cpp/src/llama-memory.h +11 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +24 -15
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +316 -69
- package/src/llama.cpp/src/llama-model.h +8 -1
- package/src/llama.cpp/src/llama-quant.cpp +15 -13
- package/src/llama.cpp/src/llama-sampling.cpp +18 -6
- package/src/llama.cpp/src/llama-vocab.cpp +42 -4
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +14 -0
- package/src/llama.cpp/tests/CMakeLists.txt +10 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +107 -47
- package/src/llama.cpp/tests/test-chat-template.cpp +10 -11
- package/src/llama.cpp/tests/test-chat.cpp +3 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +2 -2
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +495 -348
- package/src/llama.cpp/{examples → tools}/main/main.cpp +6 -9
- package/src/llama.cpp/{examples/llava → tools/mtmd}/CMakeLists.txt +1 -35
- package/src/llama.cpp/{examples/llava → tools/mtmd}/clip-impl.h +25 -5
- package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp +1440 -1349
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp +70 -44
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd.cpp +251 -281
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +4 -2
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +13 -76
- package/src/llama.cpp/{examples → tools}/rpc/rpc-server.cpp +70 -74
- package/src/llama.cpp/{examples → tools}/run/run.cpp +18 -4
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/server.cpp +291 -76
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +377 -5
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.h +0 -135
- package/src/llama.cpp/examples/llava/llava.cpp +0 -586
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/mtmd.h +0 -168
- package/src/llama.cpp/examples/llava/qwen2vl-test.cpp +0 -636
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/deprecation-warning.cpp +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/rpc/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/httplib.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/tts.cpp +0 -0
--- package/src/llama.cpp/src/llama-model.cpp (0.3.17)
+++ package/src/llama.cpp/src/llama-model.cpp (0.4.1)
@@ -40,6 +40,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_335M: return "335M";
         case LLM_TYPE_410M: return "410M";
         case LLM_TYPE_450M: return "450M";
+        case LLM_TYPE_475M: return "475M";
         case LLM_TYPE_770M: return "770M";
         case LLM_TYPE_780M: return "780M";
         case LLM_TYPE_0_5B: return "0.5B";
@@ -79,6 +80,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_236B: return "236B";
         case LLM_TYPE_290B: return "290B";
         case LLM_TYPE_314B: return "314B";
+        case LLM_TYPE_405B: return "405B";
         case LLM_TYPE_671B: return "671B";
         case LLM_TYPE_SMALL: return "0.1B";
         case LLM_TYPE_MEDIUM: return "0.4B";
@@ -115,6 +117,10 @@ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_
     { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
 };

+std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
+    return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
+}
+
 static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
     for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
         if (kv.second == name) {
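Note on the new `llama_rope_scaling_type_name` helper: it is a thin wrapper over the existing `LLAMA_ROPE_SCALING_TYPES` map, and `std::map::at` throws `std::out_of_range` for an unmapped enum value. A minimal standalone sketch of the same pattern (enum values and names here are illustrative, not the library's actual table):

```cpp
#include <map>
#include <stdexcept>
#include <string>

// Hypothetical stand-in for llama_rope_scaling_type; values are illustrative.
enum rope_scaling_type { ROPE_NONE, ROPE_LINEAR, ROPE_YARN, ROPE_LONGROPE };

static const std::map<rope_scaling_type, const char *> ROPE_SCALING_NAMES = {
    { ROPE_NONE,     "none"     },
    { ROPE_LINEAR,   "linear"   },
    { ROPE_YARN,     "yarn"     },
    { ROPE_LONGROPE, "longrope" },
};

// Mirrors the shape of llama_rope_scaling_type_name: map::at throws on unknown
// keys, so callers either pass a valid enum or handle std::out_of_range.
std::string rope_scaling_type_name(rope_scaling_type t) {
    return ROPE_SCALING_NAMES.at(t);
}
```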
@@ -297,6 +303,10 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
     // add extra buffer types, only if no GPU device is present
     // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
     auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (cpu_dev == nullptr) {
+        throw std::runtime_error(format("%s: no CPU backend found", __func__));
+    }
+
     auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
     auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
         ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
@@ -581,6 +591,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 switch (hparams.n_layer) {
                     case 32: type = LLM_TYPE_7B; break;
                     case 80: type = LLM_TYPE_70B; break;
+                    case 162: type = LLM_TYPE_405B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -707,7 +718,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);

                 if (hparams.n_layer == 12 && hparams.n_embd == 768) {
-
+                    if (arch == LLM_ARCH_NOMIC_BERT) {
+                        type = LLM_TYPE_137M;
+                    } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
+                        type = LLM_TYPE_475M;
+                    }
                 }
             } break;
         case LLM_ARCH_BLOOM:
@@ -768,6 +783,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             // fall through
         case LLM_ARCH_QWEN2:
             {
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
@@ -1373,6 +1389,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     // Add additional layer/vocab/etc checks here for other model sizes
                     default: type = LLM_TYPE_UNKNOWN;
                 }
+
+                // For Granite MoE Shared
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
             } break;
         case LLM_ARCH_CHAMELEON:
             {
@@ -1476,6 +1495,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     }

     ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (cpu_dev == nullptr) {
+        throw std::runtime_error(format("%s: no CPU backend found", __func__));
+    }
     const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
     const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
@@ -1643,8 +1665,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
             std::regex pattern(overrides->pattern);
             if (std::regex_search(tensor_name, pattern)) {
-                LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft));
                 buft = overrides->buft;
+                LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
+                        tensor_name.c_str(),
+                        ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
+                        ggml_backend_buft_name(buft));
                 break;
             }
         }
@@ -1661,6 +1686,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         auto * buft_dev = ggml_backend_buft_get_device(buft);
         if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
             auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+            if (!cpu_dev) {
+                throw std::runtime_error("no CPU backend found");
+            }
             buft = ggml_backend_dev_buffer_type(cpu_dev);
         }

@@ -1747,6 +1775,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
                         layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
                         layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+
+                        // For Granite MoE Shared
+                        if (hparams.n_ff_shexp > 0) {
+                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                            layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+                        }
                     }
                 }
             } break;
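The new `ffn_{gate,up,down}_shexp` tensors back the shared expert in Granite MoE models; `n_ff_shexp` is read from the `LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH` key added above. In the graph built later in this file, the shared expert is a dense SwiGLU FFN whose output is added to the routed expert mixture. A scalar/vector sketch of that combination, with `std::vector` standing in for ggml tensors (names here are illustrative):

```cpp
#include <cmath>
#include <vector>

// SiLU activation used by the gate branch (LLM_FFN_SILU).
static inline float silu(float x) { return x / (1.0f + std::exp(-x)); }

// One hidden unit of the shared expert's parallel SwiGLU FFN: silu(gate) * up.
// (The full block then projects back down with ffn_down_shexp.)
static inline float shexp_hidden(float gate_dot, float up_dot) {
    return silu(gate_dot) * up_dot;
}

// Final combination done in the Granite MoE FFN block: the routed expert mixture
// (moe_out) and the dense shared expert output (ffn_shexp) are summed element-wise.
// Both vectors hold n_embd entries for one token.
std::vector<float> granite_moe_ffn_out(const std::vector<float> & moe_out,
                                       const std::vector<float> & ffn_shexp) {
    std::vector<float> out(moe_out.size());
    for (size_t i = 0; i < out.size(); ++i) {
        out[i] = moe_out[i] + ffn_shexp[i];
    }
    return out;
}
```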
@@ -1842,7 +1877,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                     layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

-
+                    if (n_ff > 0) {
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                    }

                     if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
                         layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
@@ -1852,9 +1889,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                     }

-
-
-
+                    if (n_ff > 0) {
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                    }

                     // optional MLP bias
                     layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
@@ -3498,7 +3537,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

                 // output
                 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                // if output is NULL, init from the input tok embed
+                if (output == NULL) {
+                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                }

                 for (int i = 0; i < n_layer; ++i) {
                     auto & layer = layers[i];
@@ -4103,6 +4146,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         if (!dev) {
             // FIXME: workaround for CPU backend buft having a NULL device
             dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+            if (!dev) {
+                throw std::runtime_error(format("%s: no CPU backend found", __func__));
+            }
         }
         ggml_backend_dev_props props;
         ggml_backend_dev_get_props(dev, &props);
@@ -4232,7 +4278,7 @@ uint64_t llama_model::n_elements() const {
 }

 void llama_model::print_info() const {
-    const
+    const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);

     auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
         bool is_var = false;
@@ -4293,7 +4339,7 @@ void llama_model::print_info() const {
     LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
     LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
     LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
-    LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
+    LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
     LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
     LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
@@ -4349,10 +4395,13 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
     }

-    if (arch == LLM_ARCH_MINICPM ||
+    if (arch == LLM_ARCH_MINICPM ||
+        arch == LLM_ARCH_GRANITE ||
+        arch == LLM_ARCH_GRANITE_MOE) {
         LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
         LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
         LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
+        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }

     if (arch == LLM_ARCH_BAILINGMOE) {
@@ -4440,6 +4489,19 @@ const ggml_tensor * llama_model::get_tensor(const char * name) const {
     return it->second;
 }

+ggml_tensor * llama_model::get_rope_factors(uint32_t n_ctx_per_seq, int il) const {
+    // choose long/short freq factors based on the context size
+    if (layers[il].rope_freqs != nullptr) {
+        return layers[il].rope_freqs;
+    }
+
+    if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
+        return layers[il].rope_long;
+    }
+
+    return layers[il].rope_short;
+}
+
 struct llm_build_llama : public llm_graph_context {
     llm_build_llama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
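This release promotes the long/short RoPE frequency-factor selection from a per-call lambda (previously installed as a KV-cache callback in `create_memory`, see the hunk further down) to a `llama_model::get_rope_factors` method, which the graph builders below now call directly. A minimal sketch of the same selection logic, with placeholder types standing in for `ggml_tensor` and the layer struct:

```cpp
#include <cstdint>
#include <vector>

// Placeholder stand-ins for ggml_tensor and the per-layer rope tensors; illustrative only.
struct tensor { const char * name; };

struct layer_rope {
    tensor * rope_freqs = nullptr; // llama3-style freq factors (may be null)
    tensor * rope_long  = nullptr; // longrope factors for extended context
    tensor * rope_short = nullptr; // longrope factors for the original context window
};

// Same decision order as llama_model::get_rope_factors: explicit freq factors win,
// otherwise pick long vs. short factors by comparing the per-sequence context size
// against the original training context (n_ctx_orig_yarn).
tensor * get_rope_factors(const std::vector<layer_rope> & layers,
                          uint32_t n_ctx_per_seq, uint32_t n_ctx_orig_yarn, int il) {
    if (layers[il].rope_freqs != nullptr) {
        return layers[il].rope_freqs;
    }
    if (n_ctx_per_seq > n_ctx_orig_yarn) {
        return layers[il].rope_long;
    }
    return layers[il].rope_short;
}
```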
@@ -4480,7 +4542,7 @@ struct llm_build_llama : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors =
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);

                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4549,11 +4611,6 @@ struct llm_build_llama : public llm_graph_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-            }
-
             ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);

@@ -4625,11 +4682,6 @@ struct llm_build_llama : public llm_graph_context {
                 cb(cur, "ffn_moe_out", il);
             }

-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-            }
-
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);

@@ -4652,11 +4704,6 @@ struct llm_build_llama : public llm_graph_context {
         // lm_head
         cur = build_lora_mm(model.output, cur);

-        // For Granite architecture
-        if (hparams.f_logit_scale) {
-            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
-        }
-
         cb(cur, "result_output", -1);
         res->t_logits = cur;

@@ -4686,6 +4733,7 @@ struct llm_build_deci : public llm_graph_context {
             ggml_tensor * inpSA = inpL;
             const int64_t n_head_kv = hparams.n_head_kv(il);
             const int64_t n_head = hparams.n_head(il);
+            const int64_t n_ff = hparams.n_ff(il);

             if (n_head == 0) {
                 // attention-free layer of Llama-3_1-Nemotron-51B
@@ -4705,7 +4753,7 @@
             } else if (n_head > 0) {
                 // self-attention
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors =
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);

                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4761,9 +4809,9 @@
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

-            //
-            if (
-
+            // FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
+            if (n_ff == 0) {
+                continue;
             }

             // modified to support attention-free layer of Llama-3_1-Nemotron-51B
@@ -4789,11 +4837,6 @@
                 cb(cur, "ffn_out", il);
             }

-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-            }
-
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);

@@ -4816,11 +4859,6 @@
         // lm_head
         cur = build_lora_mm(model.output, cur);

-        // For Granite architecture
-        if (hparams.f_logit_scale) {
-            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
-        }
-
         cb(cur, "result_output", -1);
         res->t_logits = cur;

@@ -7187,7 +7225,7 @@ struct llm_build_phi3 : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for 128k context
-                ggml_tensor * rope_factors =
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);

                 ggml_tensor* attn_norm_output = build_norm(inpL,
                     model.layers[il].attn_norm,
@@ -7939,7 +7977,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

-            ggml_tensor * rope_factors =
+            ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);

             // norm
             cur = build_norm(inpL,
@@ -8706,7 +8744,7 @@ struct llm_build_mamba : public llm_graph_context {
             ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
             int il) const {
-        const
+        const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);

         const auto kv_head = kv_self->head;

@@ -9007,7 +9045,7 @@ struct llm_build_cohere2 : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for 128k context
-                ggml_tensor * rope_factors =
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);

                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -9945,7 +9983,7 @@ struct llm_build_deepseek : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors =
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);

                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -11309,7 +11347,7 @@ struct llm_build_exaone : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors =
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);

                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -11454,7 +11492,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
             ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
             int il) const {
-        const
+        const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);

         const auto n_tokens = ubatch.n_tokens;
         const auto n_seqs = ubatch.n_seqs;
@@ -11850,7 +11888,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
             ggml_tensor *& first_layer_value,
             const llama_ubatch & ubatch,
             int il) const {
-        const
+        const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);

         const auto n_tokens = ubatch.n_tokens;
         const auto n_seqs = ubatch.n_seqs;
@@ -12159,6 +12197,194 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
     }
 };

+
+struct llm_build_granite : public llm_graph_context {
+    llm_build_granite(
+        const llama_model & model,
+        const llm_graph_params & params,
+        ggml_cgraph * gf,
+        const bool use_rope = true)
+        : llm_graph_context(params) {
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - built only if rope enabled
+        ggml_tensor * inp_pos = nullptr;
+        if (use_rope) {
+            inp_pos = build_inp_pos();
+        }
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and (optionally) RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                if (use_rope) {
+                    ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                    Qcur = ggml_rope_ext(
+                            ctx0, Qcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+
+                    Kcur = ggml_rope_ext(
+                            ctx0, Kcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+                }
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // For Granite architectures - scale residual
+            cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network (non-MoE)
+            if (model.layers[il].ffn_gate_inp == nullptr) {
+
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+
+            } else {
+                // MoE branch
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                ggml_tensor * moe_out = build_moe_ffn(cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        nullptr,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, true,
+                        false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                        il);
+                cb(moe_out, "ffn_moe_out", il);
+
+                // For Granite MoE Shared
+                if (hparams.n_ff_shexp > 0) {
+                    ggml_tensor * ffn_shexp = build_ffn(cur,
+                            model.layers[il].ffn_up_shexp, NULL, NULL,
+                            model.layers[il].ffn_gate_shexp, NULL, NULL,
+                            model.layers[il].ffn_down_shexp, NULL, NULL,
+                            NULL,
+                            LLM_FFN_SILU, LLM_FFN_PAR, il);
+                    cb(ffn_shexp, "ffn_shexp", il);
+
+                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                    cb(cur, "ffn_out", il);
+                } else {
+                    cur = moe_out;
+                }
+            }
+
+            // For Granite architectures - scale residual
+            cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        // For Granite architectures - scale logits
+        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 // ref: https://github.com/facebookresearch/chameleon
 // based on the original build_llama() function, changes:
 // * qk-norm
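The dedicated `llm_build_granite` graph replaces the conditional Granite branches that were removed from `llm_build_llama` and `llm_build_deci` above: the block output is scaled by `f_residual_scale` before each residual add, `f_attention_scale` replaces the usual KQ scale when set, and the final logits are divided by `f_logit_scale`. A scalar sketch of those three scalings (illustrative only; the real code operates on ggml tensors):

```cpp
#include <cmath>

// Hypothetical per-model scaling hparams, mirroring f_residual_scale,
// f_attention_scale and f_logit_scale in the Granite configs.
struct granite_scales {
    float f_residual_scale;
    float f_attention_scale;
    float f_logit_scale;
};

// Residual update: the block output is scaled before it is added back to the
// residual stream (done once after attention, once after the FFN).
inline float granite_residual_add(float residual, float block_out, const granite_scales & s) {
    return residual + s.f_residual_scale * block_out;
}

// Attention: f_attention_scale replaces the usual 1/sqrt(head_dim) KQ scale when non-zero.
inline float granite_kq_scale(int n_embd_head, const granite_scales & s) {
    return s.f_attention_scale == 0.0f ? 1.0f / std::sqrt((float) n_embd_head)
                                       : s.f_attention_scale;
}

// Logits: divided by f_logit_scale after the lm_head projection.
inline float granite_logit(float lm_head_out, const granite_scales & s) {
    return lm_head_out / s.f_logit_scale;
}
```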
@@ -12690,7 +12916,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors =
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);

                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -12810,36 +13036,46 @@ struct llm_build_bailingmoe : public llm_graph_context {
     }
 };

-llama_memory_i * llama_model::create_memory() const {
+llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
     llama_memory_i * res;

     switch (arch) {
+        case LLM_ARCH_BERT:
+        case LLM_ARCH_JINA_BERT_V2:
+        case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
+            {
+                res = nullptr;
+            } break;
         case LLM_ARCH_MAMBA:
         case LLM_ARCH_RWKV6:
         case LLM_ARCH_RWKV6QWEN2:
         case LLM_ARCH_RWKV7:
         case LLM_ARCH_ARWKV7:
             {
-                res = new
-
-
+                res = new llama_kv_cache_recurrent(
+                        *this,
+                        GGML_TYPE_F32,
+                        GGML_TYPE_F32,
+                        cparams.offload_kqv,
+                        std::max((uint32_t) 1, cparams.n_seq_max));
             } break;
         default:
             {
-
-                /*.get_rope_factors =*/ [this](uint32_t n_ctx_per_seq, int il) {
-                    // choose long/short freq factors based on the context size
-                    if (layers[il].rope_freqs != nullptr) {
-                        return layers[il].rope_freqs;
-                    }
+                const auto padding = llama_kv_cache_unified::get_padding(cparams);

-
-                    return layers[il].rope_long;
-                }
+                cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);

-
-
-
+                LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
+
+                res = new llama_kv_cache_unified(
+                        *this,
+                        params.type_k,
+                        params.type_v,
+                        !cparams.flash_attn,
+                        cparams.offload_kqv,
+                        cparams.n_ctx,
+                        padding);
             }
     }

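`create_memory` now takes the memory params plus the mutable context params, returns no KV cache for the encoder-only BERT-style architectures, keeps a recurrent cache for Mamba/RWKV, and otherwise pads `n_ctx` up to the unified cache's required granularity before constructing it. A small sketch of that round-up step, assuming an illustrative padding value (the actual value comes from `llama_kv_cache_unified::get_padding(cparams)`):

```cpp
#include <cstdint>
#include <cstdio>

// Round n up to the next multiple of `pad` (what a GGML_PAD-style macro does here).
constexpr uint32_t pad_to(uint32_t n, uint32_t pad) {
    return ((n + pad - 1) / pad) * pad;
}

int main() {
    // Illustrative values: a requested context of 4000 tokens and a padding of 256.
    const uint32_t requested_n_ctx = 4000;
    const uint32_t padding         = 256;

    const uint32_t n_ctx = pad_to(requested_n_ctx, padding);
    std::printf("n_ctx = %u (padded from %u)\n", n_ctx, requested_n_ctx); // n_ctx = 4096
    return 0;
}
```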
@@ -12856,8 +13092,6 @@ llm_graph_result_ptr llama_model::build_graph(
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_LLAMA4:
         case LLM_ARCH_MINICPM:
-        case LLM_ARCH_GRANITE:
-        case LLM_ARCH_GRANITE_MOE:
             {
                 llm = std::make_unique<llm_build_llama>(*this, params, gf);
             } break;
@@ -13088,6 +13322,11 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_arwkv7>(*this, params, gf);
             } break;
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
+            {
+                llm = std::make_unique<llm_build_granite>(*this, params, gf);
+            } break;
         case LLM_ARCH_CHAMELEON:
             {
                 llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
@@ -13221,8 +13460,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DECI:
         case LLM_ARCH_BAICHUAN:
         case LLM_ARCH_STARCODER:
-        case LLM_ARCH_PLAMO:
-        case LLM_ARCH_ORION:
         case LLM_ARCH_INTERNLM2:
         case LLM_ARCH_MINICPM:
         case LLM_ARCH_XVERSE:
@@ -13260,6 +13497,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_PHI2:
         case LLM_ARCH_PHI3:
         case LLM_ARCH_PHIMOE:
+        case LLM_ARCH_PLAMO:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_GEMMA2:
         case LLM_ARCH_GEMMA3:
@@ -13267,6 +13505,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_OPENELM:
         case LLM_ARCH_GPTNEOX:
         case LLM_ARCH_CODESHELL:
+        case LLM_ARCH_ORION:
         case LLM_ARCH_NEMOTRON:
         case LLM_ARCH_EXAONE:
         case LLM_ARCH_MINICPM3:
@@ -13339,6 +13578,14 @@ const char * llama_model_chat_template(const llama_model * model, const char * n
         : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
     const auto & it = model->gguf_kv.find(key);
     if (it == model->gguf_kv.end()) {
+        // one-off fix for very popular models (so we are not flooded with issues)
+        // do not extend this list unless absolutely necessary
+        // Mistral-Small-2503 does not have built-in chat template
+        llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
+        if (pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
+            return "mistral-v7-tekken";
+        }
+
         return nullptr;
     }

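The chat-template lookup now falls back to the built-in `mistral-v7-tekken` template for 40-layer models with the Tekken tokenizer pre-type when the GGUF metadata carries no template at all. The pattern is a plain key/value lookup with a guarded hardcoded default; a minimal sketch with hypothetical stand-in types and key names:

```cpp
#include <map>
#include <string>

// Hypothetical model metadata, standing in for the GGUF key/value store and vocab info.
struct model_meta {
    std::map<std::string, std::string> gguf_kv;
    std::string pre_type;  // tokenizer pre-type, e.g. "tekken"
    size_t      n_layers;
};

// Return the stored chat template if present; otherwise apply the one-off fallback
// for 40-layer Tekken-tokenizer models, and return nullptr for everything else.
const char * chat_template(const model_meta & m) {
    const auto it = m.gguf_kv.find("tokenizer.chat_template");
    if (it == m.gguf_kv.end()) {
        if (m.pre_type == "tekken" && m.n_layers == 40) {
            return "mistral-v7-tekken";
        }
        return nullptr;
    }
    return it->second.c_str();
}
```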