@fugood/llama.node 0.3.9 → 0.3.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.js +2 -2
- package/lib/binding.ts +47 -8
- package/lib/index.js +21 -1
- package/lib/index.ts +31 -1
- package/package.json +12 -3
- package/src/LlamaCompletionWorker.cpp +33 -6
- package/src/LlamaCompletionWorker.h +3 -1
- package/src/LlamaContext.cpp +336 -28
- package/src/LlamaContext.h +2 -0
- package/src/common.hpp +19 -2
- package/src/llama.cpp/.github/workflows/build.yml +289 -107
- package/src/llama.cpp/.github/workflows/close-issue.yml +1 -1
- package/src/llama.cpp/.github/workflows/docker.yml +2 -1
- package/src/llama.cpp/.github/workflows/server.yml +25 -2
- package/src/llama.cpp/CMakeLists.txt +10 -19
- package/src/llama.cpp/cmake/build-info.cmake +1 -1
- package/src/llama.cpp/common/CMakeLists.txt +32 -0
- package/src/llama.cpp/common/arg.cpp +66 -16
- package/src/llama.cpp/common/chat-template.hpp +515 -0
- package/src/llama.cpp/common/chat.cpp +966 -0
- package/src/llama.cpp/common/chat.hpp +52 -0
- package/src/llama.cpp/common/common.cpp +159 -36
- package/src/llama.cpp/common/common.h +56 -14
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +46 -66
- package/src/llama.cpp/common/json-schema-to-grammar.h +15 -1
- package/src/llama.cpp/common/llguidance.cpp +270 -0
- package/src/llama.cpp/common/log.cpp +1 -10
- package/src/llama.cpp/common/log.h +10 -0
- package/src/llama.cpp/common/minja.hpp +2868 -0
- package/src/llama.cpp/common/sampling.cpp +22 -1
- package/src/llama.cpp/common/sampling.h +3 -0
- package/src/llama.cpp/docs/build.md +54 -9
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +12 -2
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +1 -1
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +59 -0
- package/src/llama.cpp/examples/llava/clip.cpp +133 -14
- package/src/llama.cpp/examples/llava/clip.h +2 -0
- package/src/llama.cpp/examples/llava/llava.cpp +22 -8
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +9 -1
- package/src/llama.cpp/examples/main/main.cpp +26 -25
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +136 -137
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +18 -4
- package/src/llama.cpp/examples/run/run.cpp +224 -69
- package/src/llama.cpp/examples/server/server.cpp +252 -81
- package/src/llama.cpp/examples/server/utils.hpp +73 -21
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +6 -4
- package/src/llama.cpp/examples/simple-cmake-pkg/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +78 -1
- package/src/llama.cpp/ggml/include/ggml.h +1 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +21 -4
- package/src/llama.cpp/ggml/src/ggml-alloc.c +1 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +91 -78
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +7 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +46 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +16 -1
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +28 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +5 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +33 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +1 -5
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +323 -121
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
- package/src/llama.cpp/ggml/src/ggml.c +23 -13
- package/src/llama.cpp/include/llama.h +14 -1
- package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +46 -0
- package/src/llama.cpp/src/CMakeLists.txt +1 -1
- package/src/llama.cpp/src/llama-arch.cpp +7 -2
- package/src/llama.cpp/src/llama-arch.h +3 -1
- package/src/llama.cpp/src/llama-chat.cpp +11 -2
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +86 -6
- package/src/llama.cpp/src/llama-grammar.h +22 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -1
- package/src/llama.cpp/src/llama-model.cpp +76 -6
- package/src/llama.cpp/src/llama-sampling.cpp +47 -4
- package/src/llama.cpp/src/llama-vocab.cpp +10 -4
- package/src/llama.cpp/src/llama.cpp +181 -123
- package/src/llama.cpp/tests/CMakeLists.txt +4 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +158 -57
- package/src/llama.cpp/tests/test-chat-template.cpp +154 -31
- package/src/llama.cpp/tests/test-chat.cpp +607 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +2 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +1140 -0
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +0 -32
--- a/package/src/llama.cpp/src/llama-model.cpp
+++ b/package/src/llama.cpp/src/llama-model.cpp
@@ -1093,8 +1093,20 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
-                    case 28: type = LLM_TYPE_6B; break;
-                    case 40: type = LLM_TYPE_9B; break;
+                    case 28: {
+                        if (hparams.n_head(0) == 16) {
+                            type = LLM_TYPE_1_5B;
+                        } else {
+                            type = LLM_TYPE_6B;
+                        }
+                    } break;
+                    case 40: {
+                        if (hparams.n_head(0) == 24) {
+                            type = LLM_TYPE_4B;
+                        } else {
+                            type = LLM_TYPE_9B;
+                        }
+                    } break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
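The head-count check above lets a single architecture entry distinguish the smaller edge-sized checkpoints from the original 6B/9B models that share the same layer count. A minimal standalone sketch of the same dispatch, with illustrative names that are not part of the package:

```cpp
#include <cstdint>

// Illustrative re-statement of the hunk above: which (n_layer, first-layer
// head count) combinations map to which model size label.
enum llm_type { LLM_TYPE_1_5B, LLM_TYPE_4B, LLM_TYPE_6B, LLM_TYPE_9B, LLM_TYPE_UNKNOWN };

static llm_type chatglm_type_from_hparams(uint32_t n_layer, uint32_t n_head0) {
    switch (n_layer) {
        case 28: return n_head0 == 16 ? LLM_TYPE_1_5B : LLM_TYPE_6B;
        case 40: return n_head0 == 24 ? LLM_TYPE_4B : LLM_TYPE_9B;
        default: return LLM_TYPE_UNKNOWN;
    }
}
```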
@@ -1263,6 +1275,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
     const bool use_mmap_buffer = true;
 
+    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, use_mmap_buffer ? "true" : "false");
+
     // build a list of buffer types for the CPU and GPU devices
     pimpl->cpu_buft_list = make_cpu_buft_list(devices);
     for (auto * dev : devices) {
@@ -1303,10 +1317,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
         if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
+            LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(cpu_dev));
             return {cpu_dev, &pimpl->cpu_buft_list};
         }
         const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
         auto * dev = devices.at(layer_gpu);
+        LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(dev));
         return {dev, &pimpl->gpu_buft_list.at(dev)};
     };
 
@@ -2203,6 +2219,50 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                 }
             } break;
+        case LLM_ARCH_PHIMOE:
+            {
+                const int64_t n_embd_head = n_embd / n_head;
+
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+                // output
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
+                output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), { n_vocab }, 0);
+
+                for (int i = 0; i < n_layer; ++i) {
+                    auto & layer = layers[i];
+
+                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+                    layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), { n_embd }, 0);
+
+                    layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    if (layer.wqkv == nullptr) {
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
+
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
+
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
+                    }
+                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
+                    layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);
+
+                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+                    layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), { n_embd }, 0);
+
+                    layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+                    layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+                    layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
+                    layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+
+                    layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                    layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                }
+            } break;
         case LLM_ARCH_PLAMO:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3022,9 +3082,17 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     auto & layer = layers[i];
 
                     layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                    layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                    layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-                    layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+                    if (layer.wqkv == nullptr) {
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    }
 
                     layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
@@ -3911,8 +3979,10 @@ uint64_t llama_model_size(const struct llama_model * model) {
     return model->size();
 }
 
-const char * llama_model_chat_template(const struct llama_model * model) {
-    const auto & it = model->gguf_kv.find(LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE));
+const char * llama_model_chat_template(const struct llama_model * model, const char * name) {
+    const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE_N)
+        : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
+    const auto & it = model->gguf_kv.find(key);
     if (it == model->gguf_kv.end()) {
         return nullptr;
     }
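`llama_model_chat_template` now takes a second `name` argument (the matching declaration lives in `include/llama.h`, also updated in this release), so callers can request a named template variant from the GGUF metadata instead of only the default one. A hedged usage sketch; the model path and the `tool_use` variant name are placeholders, not values shipped by this package:

```cpp
#include "llama.h"
#include <cstdio>

int main() {
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams); // placeholder path
    if (model == nullptr) {
        return 1;
    }

    // nullptr selects the default template (tokenizer.chat_template);
    // a name selects a named variant such as tokenizer.chat_template.tool_use.
    const char * tmpl_default  = llama_model_chat_template(model, /* name */ nullptr);
    const char * tmpl_tool_use = llama_model_chat_template(model, "tool_use");

    printf("default template:  %s\n", tmpl_default  ? "present" : "absent");
    printf("tool_use template: %s\n", tmpl_tool_use ? "present" : "absent");

    llama_model_free(model);
    return 0;
}
```

Either call returns `nullptr` when the corresponding key is absent, as the hunk above shows.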
--- a/package/src/llama.cpp/src/llama-sampling.cpp
+++ b/package/src/llama.cpp/src/llama-sampling.cpp
@@ -1433,13 +1433,30 @@ static void llama_sampler_grammar_apply(struct llama_sampler * smpl, llama_token
     }
 }
 
+// Fwd declare to break reset --> init_impl --> llama_sampler_grammar_i --> reset cycle.
+static struct llama_sampler * llama_sampler_init_grammar_impl(
+        const struct llama_vocab * vocab,
+        const char * grammar_str,
+        const char * grammar_root,
+        bool lazy,
+        const char ** trigger_words,
+        size_t num_trigger_words,
+        const llama_token * trigger_tokens,
+        size_t num_trigger_tokens);
+
 static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
     auto * ctx = (llama_sampler_grammar *) smpl->ctx;
     if (!ctx->grammar) {
         return;
     }
 
-    auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str());
+    std::vector<const char *> trigger_words;
+    for (auto & word : ctx->grammar->trigger_words) {
+        trigger_words.push_back(word.c_str());
+    }
+    auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str(),
+                                                 ctx->grammar->lazy, trigger_words.data(), trigger_words.size(),
+                                                 ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
 
     llama_grammar_free_impl(ctx->grammar);
     ctx->grammar = grammar_new;
@@ -1448,7 +1465,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
 static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) {
     const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;
 
-    auto * result = llama_sampler_init_grammar(ctx->vocab, nullptr, nullptr);
+    auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0);
 
     // copy the state
     {
@@ -1484,7 +1501,15 @@ static struct llama_sampler_i llama_sampler_grammar_i = {
     /* .free   = */ llama_sampler_grammar_free,
 };
 
-struct llama_sampler * llama_sampler_init_grammar(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root) {
+static struct llama_sampler * llama_sampler_init_grammar_impl(
+        const struct llama_vocab * vocab,
+        const char * grammar_str,
+        const char * grammar_root,
+        bool lazy,
+        const char ** trigger_words,
+        size_t num_trigger_words,
+        const llama_token * trigger_tokens,
+        size_t num_trigger_tokens) {
     auto * ctx = new llama_sampler_grammar;
 
     if (grammar_str != nullptr && grammar_str[0] != '\0') {
@@ -1492,7 +1517,7 @@ struct llama_sampler * llama_sampler_init_grammar(const struct llama_vocab * voc
         /* .vocab        = */ vocab,
         /* .grammar_str  = */ grammar_str,
         /* .grammar_root = */ grammar_root,
-        /* .grammar      = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root),
+        /* .grammar      = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens),
     };
 } else {
     *ctx = {
@@ -1509,6 +1534,24 @@ struct llama_sampler * llama_sampler_init_grammar(const struct llama_vocab * voc
     };
 }
 
+struct llama_sampler * llama_sampler_init_grammar(
+        const struct llama_vocab * vocab,
+        const char * grammar_str,
+        const char * grammar_root) {
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0);
+}
+
+struct llama_sampler * llama_sampler_init_grammar_lazy(
+        const struct llama_vocab * vocab,
+        const char * grammar_str,
+        const char * grammar_root,
+        const char ** trigger_words,
+        size_t num_trigger_words,
+        const llama_token * trigger_tokens,
+        size_t num_trigger_tokens) {
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens);
+}
+
 // penalties
 
 struct llama_sampler_penalties {
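`llama_sampler_init_grammar_lazy` is the new public entry point for lazy grammars: the sampler leaves the logits untouched until one of the trigger words (or trigger tokens) appears in the generated text, and only then starts enforcing the grammar. The tool-call support added alongside `common/chat.cpp` in this release builds on this mechanism. A sketch under stated assumptions: the GBNF grammar and the `<tool_call>` trigger below are made up for illustration, not taken from the package:

```cpp
#include "llama.h"

// Build a grammar sampler that stays dormant until "<tool_call>" is generated,
// then constrains the rest of the output to the (made-up) grammar below.
static llama_sampler * make_lazy_tool_call_sampler(const llama_vocab * vocab) {
    const char * grammar =
        "root ::= \"<tool_call>\" [^<]* \"</tool_call>\"";
    const char * trigger_words[] = { "<tool_call>" };

    return llama_sampler_init_grammar_lazy(
        vocab, grammar, "root",
        trigger_words, /* num_trigger_words */ 1,
        /* trigger_tokens */ nullptr, /* num_trigger_tokens */ 0);
}
```

The vocab would typically come from `llama_model_get_vocab()` on a loaded model, and the returned sampler is added to a sampler chain like any other sampler.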
--- a/package/src/llama.cpp/src/llama-vocab.cpp
+++ b/package/src/llama.cpp/src/llama-vocab.cpp
@@ -1245,8 +1245,13 @@ struct llama_vocab::impl {
 
     std::vector<llama_token> cache_special_tokens;
     std::vector<std::string> cache_token_to_piece; // llama_token_to_piece(special = true);
-
-    std::map<std::pair<std::string, std::string>, int> bpe_ranks;
+    struct pair_hash {
+        size_t operator()(const std::pair<std::string, std::string> & p) const {
+            return std::hash<std::string>{}(p.first) ^ //create some hash for pair
+                   (std::hash<std::string>{}(p.second) << 1);
+        }
+    };
+    std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> bpe_ranks;
 
     // set of all tokens that cause "end of generation"
     std::set<llama_token> special_eog_ids;
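`std::unordered_map` has no default hash for `std::pair` keys, so switching `bpe_ranks` from an ordered map to a hash map requires the small `pair_hash` combiner added above. A self-contained sketch of the same pattern, with illustrative merge pairs that are not from the package:

```cpp
#include <cstddef>
#include <string>
#include <unordered_map>
#include <utility>

// Same hashing scheme as the pair_hash added in llama-vocab.cpp:
// combine the two string hashes with XOR and a shift.
struct pair_hash {
    std::size_t operator()(const std::pair<std::string, std::string> & p) const {
        return std::hash<std::string>{}(p.first) ^
               (std::hash<std::string>{}(p.second) << 1);
    }
};

int main() {
    std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> bpe_ranks;
    bpe_ranks[{"a", "b"}]  = 0; // merge ("a", "b") has rank 0
    bpe_ranks[{"ab", "c"}] = 1; // merge ("ab", "c") has rank 1

    auto it = bpe_ranks.find({"a", "b"});
    return it != bpe_ranks.end() ? it->second : -1;
}
```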
@@ -1523,7 +1528,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
                 clean_spaces = false;
             } else if (
-                tokenizer_pre == "qwen2") {
+                tokenizer_pre == "qwen2" ||
+                tokenizer_pre == "deepseek-r1-qwen") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
                 clean_spaces = false;
             } else if (
@@ -1686,7 +1692,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
             linefeed_id = ids[0];
         } else {
-            const std::vector<int> ids = tokenize("\xC4\x8A", false); // U+010A
+            const std::vector<int> ids = tokenize("\n", false);
 
             //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
             if (ids.empty()) {