@fugood/llama.node 0.3.13 → 0.3.14
This diff shows the published contents of the two package versions as they appear in their respective public registries. It is provided for informational purposes only.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +1 -1
- package/src/LlamaContext.cpp +98 -76
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +60 -10
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +3 -3
- package/src/llama.cpp/common/arg.cpp +112 -11
- package/src/llama.cpp/common/chat.cpp +960 -266
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +27 -171
- package/src/llama.cpp/common/common.h +27 -67
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +45 -7
- package/src/llama.cpp/common/speculative.cpp +6 -5
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +45 -7
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -3
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
- package/src/llama.cpp/examples/main/main.cpp +73 -28
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +110 -67
- package/src/llama.cpp/examples/server/server.cpp +82 -87
- package/src/llama.cpp/examples/server/utils.hpp +94 -107
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +251 -142
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
- package/src/llama.cpp/ggml/include/ggml.h +5 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1396 -386
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1432 -151
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +220 -116
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +168 -721
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +146 -42
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
- package/src/llama.cpp/ggml/src/ggml.c +8 -3
- package/src/llama.cpp/include/llama.h +19 -5
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +21 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +182 -182
- package/src/llama.cpp/src/llama-grammar.h +12 -3
- package/src/llama.cpp/src/llama-kv-cache.h +1 -0
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-model.cpp +69 -5
- package/src/llama.cpp/src/llama-sampling.cpp +43 -10
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +147 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +166 -110
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +593 -395
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -55
- /package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
package/src/llama.cpp/src/llama-model.cpp:

@@ -9,6 +9,7 @@
 #include <algorithm>
 #include <cassert>
 #include <cstring>
+#include <cmath>
 #include <functional>
 #include <map>
 #include <sstream>
@@ -864,6 +865,23 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GEMMA3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 26: type = LLM_TYPE_1B; break;
+                    case 34: type = LLM_TYPE_4B; break;
+                    case 48: type = LLM_TYPE_12B; break;
+                    case 62: type = LLM_TYPE_27B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+
+                hparams.f_attention_scale = type == LLM_TYPE_27B
+                    ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
+                    : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
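
Restated as a formula (this simply transcribes the assignment in the hunk above, using the hyperparameter names from the code):

    f_{\text{attention\_scale}} =
        \begin{cases}
            1 / \sqrt{n_{\text{embd}} / n_{\text{head}}(0)} & \text{27B variant (n\_layer = 62)} \\
            1 / \sqrt{n_{\text{embd\_head\_k}}}             & \text{1B / 4B / 12B variants}
        \end{cases}
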
@@ -1424,6 +1442,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
         }

+        // skip unused tensors
+        if (info.op == GGML_OP_NONE) {
+            LLAMA_LOG_WARN("model has unused tensor %s -- ignoring\n", tn.str().c_str());
+            ml.n_created++;
+
+            return nullptr;
+        }
+
         // tensors with "bias" suffix are always used with GGML_OP_ADD
         ggml_op op;
         bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
@@ -2194,13 +2220,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 } break;
             case LLM_ARCH_PHI3:
                 {
-                    const int64_t n_embd_head = n_embd / n_head;
-
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);

                     // output
                     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
-                    output
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }

                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
@@ -2215,8 +2244,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
                         layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);

-                        layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {
-                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {
+                        layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                     }
                 } break;
             case LLM_ARCH_PHIMOE:
@@ -2443,6 +2472,35 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);

+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+                    }
+                } break;
+            case LLM_ARCH_GEMMA3:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
                         layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                         layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                         layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -3639,6 +3697,7 @@ void llama_model::print_info() const {
     LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
     LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
     LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
+    LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
     LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
     LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
     LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
@@ -3830,6 +3889,10 @@ int32_t llama_model_n_head(const struct llama_model * model) {
     return model->hparams.n_head();
 }

+int32_t llama_model_n_head_kv(const struct llama_model * model) {
+    return model->hparams.n_head_kv();
+}
+
 // deprecated
 int32_t llama_n_ctx_train(const struct llama_model * model) {
     return llama_model_n_ctx_train(model);
@@ -3908,6 +3971,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
         case LLM_ARCH_PHIMOE:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_GEMMA2:
+        case LLM_ARCH_GEMMA3:
         case LLM_ARCH_STARCODER2:
         case LLM_ARCH_OPENELM:
         case LLM_ARCH_GPTNEOX:
package/src/llama.cpp/src/llama-sampling.cpp:

@@ -1449,7 +1449,9 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
         const char ** trigger_words,
         size_t num_trigger_words,
         const llama_token * trigger_tokens,
-        size_t num_trigger_tokens
+        size_t num_trigger_tokens,
+        const char ** trigger_patterns,
+        size_t num_trigger_patterns);

 static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
     auto * ctx = (llama_sampler_grammar *) smpl->ctx;
@@ -1457,12 +1459,14 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
         return;
     }

-    std::vector<const char *>
-
-
+    std::vector<const char *> trigger_patterns_c;
+    trigger_patterns_c.reserve(ctx->grammar->trigger_patterns.size());
+    for (auto & trigger_pattern : ctx->grammar->trigger_patterns) {
+        trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
     }
+
     auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str(),
-            ctx->grammar->lazy,
+            ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
             ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());

     llama_grammar_free_impl(ctx->grammar);
@@ -1472,7 +1476,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
 static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) {
     const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;

-    auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0);
+    auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0, nullptr, 0);

     // copy the state
     {
@@ -1516,15 +1520,33 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
         const char ** trigger_words,
         size_t num_trigger_words,
         const llama_token * trigger_tokens,
-        size_t num_trigger_tokens
+        size_t num_trigger_tokens,
+        const char ** trigger_patterns,
+        size_t num_trigger_patterns) {
     auto * ctx = new llama_sampler_grammar;

     if (grammar_str != nullptr && grammar_str[0] != '\0') {
+        // TODO: remove trigger_words support.
+        if (trigger_words != nullptr && num_trigger_words > 0) {
+            GGML_ASSERT(trigger_patterns == nullptr && num_trigger_patterns == 0);
+            std::string trigger_pattern("[\\s\\S]*?(");
+            for (size_t i = 0; i < num_trigger_words; ++i) {
+                static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
+                if (i > 0) {
+                    trigger_pattern += "|";
+                }
+                trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
+            }
+            trigger_pattern += ")[\\s\\S]*";
+            auto trigger_pattern_c = trigger_pattern.c_str();
+            trigger_patterns = &trigger_pattern_c;
+            num_trigger_patterns = 1;
+        }
         *ctx = {
             /* .vocab = */ vocab,
             /* .grammar_str = */ grammar_str,
             /* .grammar_root = */ grammar_root,
-            /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy,
+            /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens),
         };
     } else {
         *ctx = {
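
For orientation, the legacy trigger_words path above is simply folded into a single regex pattern before reaching the grammar. A minimal standalone C++ sketch of that folding (not part of the package; the trigger words are made up for illustration):

// sketch: mirrors the trigger_words -> trigger_pattern fallback in the hunk above
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

int main() {
    // hypothetical trigger words that should lazily enable the grammar
    const std::vector<std::string> trigger_words = { "<tool_call>", "<function>" };

    // same escaping and wrapping as in llama_sampler_init_grammar_impl above
    const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
    std::string trigger_pattern("[\\s\\S]*?(");
    for (size_t i = 0; i < trigger_words.size(); ++i) {
        if (i > 0) {
            trigger_pattern += "|";
        }
        trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
    }
    trigger_pattern += ")[\\s\\S]*";

    // neither word contains regex metacharacters, so this prints:
    //   [\s\S]*?(<tool_call>|<function>)[\s\S]*
    printf("%s\n", trigger_pattern.c_str());
    return 0;
}
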
@@ -1545,7 +1567,7 @@ struct llama_sampler * llama_sampler_init_grammar(
         const struct llama_vocab * vocab,
         const char * grammar_str,
         const char * grammar_root) {
-    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0);
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0, nullptr, 0);
 }

 struct llama_sampler * llama_sampler_init_grammar_lazy(
@@ -1556,7 +1578,18 @@ struct llama_sampler * llama_sampler_init_grammar_lazy(
         size_t num_trigger_words,
         const llama_token * trigger_tokens,
         size_t num_trigger_tokens) {
-    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens);
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens, nullptr, 0);
+}
+
+struct llama_sampler * llama_sampler_init_grammar_lazy_patterns(
+        const struct llama_vocab * vocab,
+        const char * grammar_str,
+        const char * grammar_root,
+        const char ** trigger_patterns,
+        size_t num_trigger_patterns,
+        const llama_token * trigger_tokens,
+        size_t num_trigger_tokens) {
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, nullptr, 0, trigger_tokens, num_trigger_tokens, trigger_patterns, num_trigger_patterns);
 }

 // penalties
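
The new llama_sampler_init_grammar_lazy_patterns entry point above accepts regex trigger patterns directly instead of plain trigger words. A rough usage sketch, assuming the declaration is exported via include/llama.h (listed as changed above) and that llama_model_get_vocab is available; the grammar root and trigger pattern here are hypothetical:

// sketch: lazily-enabled grammar sampler driven by a regex trigger pattern
#include "llama.h"

struct llama_sampler * make_lazy_grammar_sampler(const struct llama_model * model,
                                                 const char * grammar_str /* GBNF text */) {
    const struct llama_vocab * vocab = llama_model_get_vocab(model);

    // hypothetical trigger: only constrain output once "<tool_call>" has appeared
    const char * trigger_patterns[] = { "[\\s\\S]*?(<tool_call>)[\\s\\S]*" };

    return llama_sampler_init_grammar_lazy_patterns(
            vocab, grammar_str, /* grammar_root */ "root",
            trigger_patterns, /* num_trigger_patterns */ 1,
            /* trigger_tokens */ nullptr, /* num_trigger_tokens */ 0);
}
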
package/src/llama.cpp/src/llama-vocab.cpp:

@@ -16,6 +16,7 @@
 #include <queue>
 #include <set>
 #include <unordered_map>
+#include <cctype>

 //
 // helpers
@@ -392,6 +393,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_GPT4O:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -1592,6 +1600,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             } else if (
                 tokenizer_pre == "megrez") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+            } else if (
+                tokenizer_pre == "gpt-4o") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
+                clean_spaces = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
package/src/llama.cpp/src/llama.cpp:

@@ -4978,6 +4978,149 @@ struct llm_build_context {
         return gf;
     }

+    struct ggml_cgraph * build_gemma3() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head_k = hparams.n_embd_head_k;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
+        if (ubatch.token) {
+            inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+            cb(inpL, "inp_scaled", -1);
+        }
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        // gemma3 requires different mask for layers using sliding window (SWA)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask(true);
+        struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(true);
+
+        // "5-to-1 interleaved attention"
+        // 5 layers of local attention followed by 1 layer of global attention
+        static const int sliding_window_pattern = 6;
+
+        for (int il = 0; il < n_layer; ++il) {
+            const bool is_sliding = (il + 1) % sliding_window_pattern;
+            const float freq_base_l = is_sliding ? 10000.0f : freq_base;
+            const float freq_scale_l = is_sliding ? 1.0f : freq_scale;
+            struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens);
+                Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                        model.layers[il].attn_q_norm,
+                        NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens);
+                Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                        model.layers[il].attn_k_norm,
+                        NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(Kcur, "Kcur_normed", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, hparams.f_attention_scale, cb, il);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].attn_post_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_post_norm", il);
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+            cb(sa_out, "sa_out", il);
+
+            cur = llm_build_norm(ctx0, sa_out, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up, NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].ffn_post_norm, NULL,
+                    LLM_NORM_RMS, cb, -1);
+            cb(cur, "ffn_post_norm", -1);
+
+            cur = ggml_add(ctx0, cur, sa_out);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }

     struct ggml_cgraph * build_starcoder2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
@@ -8298,6 +8441,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_gemma2();
             } break;
+        case LLM_ARCH_GEMMA3:
+            {
+                result = llm.build_gemma3();
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 result = llm.build_starcoder2();