@fugood/llama.node 0.3.11 → 0.3.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -0
- package/lib/index.js +26 -20
- package/lib/index.ts +32 -28
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +14 -0
- package/src/LlamaContext.cpp +13 -4
- package/src/llama.cpp/.github/workflows/build.yml +35 -3
- package/src/llama.cpp/.github/workflows/docker.yml +2 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
- package/src/llama.cpp/common/CMakeLists.txt +20 -3
- package/src/llama.cpp/common/arg.cpp +180 -3
- package/src/llama.cpp/common/chat-template.hpp +21 -7
- package/src/llama.cpp/common/chat.cpp +220 -101
- package/src/llama.cpp/common/chat.hpp +3 -0
- package/src/llama.cpp/common/common.h +15 -7
- package/src/llama.cpp/common/llguidance.cpp +3 -3
- package/src/llama.cpp/common/log.cpp +1 -0
- package/src/llama.cpp/common/log.h +2 -1
- package/src/llama.cpp/common/minja.hpp +24 -9
- package/src/llama.cpp/common/sampling.cpp +52 -46
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +2 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
- package/src/llama.cpp/examples/run/run.cpp +5 -12
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/server/httplib.h +381 -292
- package/src/llama.cpp/examples/server/server.cpp +58 -47
- package/src/llama.cpp/examples/server/utils.hpp +7 -5
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
- package/src/llama.cpp/ggml/include/ggml.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +852 -268
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +200 -107
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +2 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +26 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +6 -7
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +812 -569
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +25 -1
- package/src/llama.cpp/ggml/src/ggml.c +1 -1
- package/src/llama.cpp/include/llama.h +14 -10
- package/src/llama.cpp/src/llama-grammar.cpp +1 -1
- package/src/llama.cpp/src/llama-grammar.h +1 -1
- package/src/llama.cpp/src/llama-impl.h +6 -6
- package/src/llama.cpp/src/llama-kv-cache.h +1 -1
- package/src/llama.cpp/src/llama-mmap.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +131 -57
- package/src/llama.cpp/src/llama.cpp +7 -5
- package/src/llama.cpp/src/unicode.cpp +9 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +5 -5
- package/src/llama.cpp/tests/test-chat.cpp +237 -69
- package/src/llama.cpp/tests/test-gguf.cpp +4 -4
- package/src/llama.cpp/tests/test-sampling.cpp +15 -0
package/src/llama.cpp/common/arg.cpp

@@ -365,6 +365,112 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
     print_options(specific_options);
 }

+static void common_params_print_completion(common_params_context & ctx_arg) {
+    std::vector<common_arg *> common_options;
+    std::vector<common_arg *> sparam_options;
+    std::vector<common_arg *> specific_options;
+
+    for (auto & opt : ctx_arg.options) {
+        if (opt.is_sparam) {
+            sparam_options.push_back(&opt);
+        } else if (opt.in_example(ctx_arg.ex)) {
+            specific_options.push_back(&opt);
+        } else {
+            common_options.push_back(&opt);
+        }
+    }
+
+    printf("_llama_completions() {\n");
+    printf("    local cur prev opts\n");
+    printf("    COMPREPLY=()\n");
+    printf("    cur=\"${COMP_WORDS[COMP_CWORD]}\"\n");
+    printf("    prev=\"${COMP_WORDS[COMP_CWORD-1]}\"\n\n");
+
+    printf("    opts=\"");
+    auto print_options = [](const std::vector<common_arg *> & options) {
+        for (const common_arg * opt : options) {
+            for (const char * arg : opt->args) {
+                printf("%s ", arg);
+            }
+        }
+    };
+
+    print_options(common_options);
+    print_options(sparam_options);
+    print_options(specific_options);
+    printf("\"\n\n");
+
+    printf("    case \"$prev\" in\n");
+    printf("        --model)\n");
+    printf("            COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("        --grammar-file)\n");
+    printf("            COMPREPLY=( $(compgen -f -X '!*.gbnf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("        --chat-template-file)\n");
+    printf("            COMPREPLY=( $(compgen -f -X '!*.jinja' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("        *)\n");
+    printf("            COMPREPLY=( $(compgen -W \"${opts}\" -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("    esac\n");
+    printf("}\n\n");
+
+    std::set<std::string> executables = {
+        "llama-batched",
+        "llama-batched-bench",
+        "llama-bench",
+        "llama-cli",
+        "llama-convert-llama2c-to-ggml",
+        "llama-cvector-generator",
+        "llama-embedding",
+        "llama-eval-callback",
+        "llama-export-lora",
+        "llama-gbnf-validator",
+        "llama-gen-docs",
+        "llama-gguf",
+        "llama-gguf-hash",
+        "llama-gguf-split",
+        "llama-gritlm",
+        "llama-imatrix",
+        "llama-infill",
+        "llama-llava-cli",
+        "llama-llava-clip-quantize-cli",
+        "llama-lookahead",
+        "llama-lookup",
+        "llama-lookup-create",
+        "llama-lookup-merge",
+        "llama-lookup-stats",
+        "llama-minicpmv-cli",
+        "llama-parallel",
+        "llama-passkey",
+        "llama-perplexity",
+        "llama-q8dot",
+        "llama-quantize",
+        "llama-quantize-stats",
+        "llama-qwen2vl-cli",
+        "llama-retrieval",
+        "llama-run",
+        "llama-save-load-state",
+        "llama-server",
+        "llama-simple",
+        "llama-simple-chat",
+        "llama-speculative",
+        "llama-speculative-simple",
+        "llama-tokenize",
+        "llama-tts",
+        "llama-vdot"
+    };
+
+    for (const auto& exe : executables) {
+        printf("complete -F _llama_completions %s\n", exe.c_str());
+    }
+}
+
 static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
     std::vector<ggml_backend_dev_t> devices;
     auto dev_names = string_split<std::string>(value, ',');
@@ -426,6 +532,10 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
             }
             exit(0);
         }
+        if (ctx_arg.params.completion) {
+            common_params_print_completion(ctx_arg);
+            exit(0);
+        }
     } catch (const std::invalid_argument & ex) {
         fprintf(stderr, "%s\n", ex.what());
         ctx_arg.params = params_org;
@@ -494,6 +604,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             exit(0);
         }
     ));
+    add_opt(common_arg(
+        {"--completion-bash"},
+        "print source-able bash completion script for llama.cpp",
+        [](common_params & params) {
+            params.completion = true;
+        }
+    ));
     add_opt(common_arg(
         {"--verbose-prompt"},
         string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
@@ -674,7 +791,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--no-context-shift"},
-        string_format("disables context shift on
+        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
         [](common_params & params) {
             params.ctx_shift = false;
         }
@@ -946,6 +1063,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sampling.min_p = std::stof(value);
         }
     ).set_sparam());
+    add_opt(common_arg(
+        {"--top-nsigma"}, "N",
+        string_format("top-n-sigma sampling (default: %.1f, -1.0 = disabled)", params.sampling.top_n_sigma),
+        [](common_params & params, const std::string & value) {
+            params.sampling.top_n_sigma = std::stof(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}).set_sparam());
     add_opt(common_arg(
         {"--xtc-probability"}, "N",
         string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
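For context, `--top-nsigma` only wires the flag into `common_params`; the sampler itself lives in the `src/llama-sampling.cpp` changes listed above. A minimal standalone sketch of the top-n-sigma idea (not the upstream implementation; a plain float vector stands in for llama.cpp's token-data array): keep tokens whose logit is within `n` standard deviations of the maximum logit and mask the rest.

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Hedged sketch of top-n-sigma filtering (illustrative, not llama.cpp's code):
// keep tokens whose logit >= max_logit - n_sigma * stddev(logits), mask the rest.
std::vector<float> top_n_sigma_filter(std::vector<float> logits, float n_sigma) {
    if (n_sigma < 0.0f || logits.empty()) {
        return logits; // negative value = disabled, matching the flag's help text
    }
    float max_logit = logits[0];
    float mean = 0.0f;
    for (float l : logits) {
        max_logit = std::max(max_logit, l);
        mean += l;
    }
    mean /= (float) logits.size();
    float var = 0.0f;
    for (float l : logits) {
        var += (l - mean) * (l - mean);
    }
    const float sigma = std::sqrt(var / (float) logits.size());
    const float threshold = max_logit - n_sigma * sigma;
    // Tokens below the threshold are effectively removed from sampling.
    for (float & l : logits) {
        if (l < threshold) {
            l = -INFINITY;
        }
    }
    return logits;
}
```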
@@ -1445,7 +1569,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "- isolate: only spawn threads on CPUs on the node that execution started on\n"
         "- numactl: use the CPU map provided by numactl\n"
         "if run without this previously, it is recommended to drop the system page cache before using this\n"
-        "see https://github.com/
+        "see https://github.com/ggml-org/llama.cpp/issues/1437",
         [](common_params & params, const std::string & value) {
             /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
             else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
@@ -1975,6 +2099,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_jinja = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
+    add_opt(common_arg(
+        {"--reasoning-format"}, "FORMAT",
+        "reasoning format (default: deepseek; allowed values: deepseek, none)\n"
+        "controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
+        "only supported for non-streamed responses",
+        [](common_params & params, const std::string & value) {
+            /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
+            else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
+            else { std::invalid_argument("invalid value"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         string_format(
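To make the help text above concrete: in 'deepseek' mode the thought span is moved into `message.reasoning_content`, while 'none' leaves it inline in `message.content`. A hedged, self-contained sketch of that split follows; the struct and function names are hypothetical, not llama.cpp API (the real parsing lives in the `common/chat.cpp` changes listed above).

```cpp
#include <string>

// Hypothetical illustration only: split "<think>...</think>" out of a model
// response, mirroring what --reasoning-format deepseek does conceptually.
struct parsed_msg {
    std::string reasoning_content; // filled in "deepseek" mode
    std::string content;           // the rest (or the full text in "none" mode)
};

parsed_msg split_reasoning(const std::string & response, bool deepseek_format) {
    parsed_msg out;
    const std::string open  = "<think>";
    const std::string close = "</think>";
    const auto b = response.find(open);
    const auto e = response.find(close);
    if (!deepseek_format || b == std::string::npos || e == std::string::npos || e < b) {
        out.content = response; // "none": leave thoughts unparsed in message.content
        return out;
    }
    out.reasoning_content = response.substr(b + open.size(), e - b - open.size());
    out.content = response.substr(0, b) + response.substr(e + close.size());
    return out;
}
```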
@@ -2112,7 +2247,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_LOG_VERBOSITY"));
     add_opt(common_arg(
         {"--log-prefix"},
-        "Enable
+        "Enable prefix in log messages",
         [](common_params &) {
             common_log_set_prefix(common_log_main(), true);
         }
@@ -2324,5 +2459,47 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_TTS}));

+    add_opt(common_arg(
+        {"--embd-bge-small-en-default"},
+        string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
+            params.hf_file = "bge-small-en-v1.5-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--embd-e5-small-en-default"},
+        string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
+            params.hf_file = "e5-small-v2-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--embd-gte-small-default"},
+        string_format("use default gte-small model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
+            params.hf_file = "gte-small-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
     return ctx_arg;
 }
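All three embedding presets set `params.embd_normalize = 2`, which in llama.cpp's common embedding helpers corresponds to Euclidean (L2) normalization of the output vector. A small standalone sketch of that normalization (illustrative only, not the library routine):

```cpp
#include <cmath>
#include <vector>

// Hedged sketch: L2-normalize an embedding in place, the behaviour selected by
// embd_normalize = 2 (other values select none / max-abs / taxicab / p-norm).
void l2_normalize(std::vector<float> & embd) {
    double sum_sq = 0.0;
    for (float v : embd) {
        sum_sq += (double) v * v;
    }
    const double norm = std::sqrt(sum_sq);
    if (norm == 0.0) {
        return; // leave an all-zero vector untouched
    }
    for (float & v : embd) {
        v = (float) (v / norm);
    }
}
```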
package/src/llama.cpp/common/chat-template.hpp

@@ -249,16 +249,30 @@ class chat_template {
                 inputs.add_generation_prompt = false;
                 full = apply(inputs);
             }
-
-            if (
-
-
+            auto eos_pos_last = full.rfind(eos_token_);
+            if (eos_pos_last == prefix.size() - eos_token_.size() ||
+                (full[full.size() - 1] == '\n' && (eos_pos_last == full.size() - eos_token_.size() - 1))) {
+                full = full.substr(0, eos_pos_last);
+            }
+            size_t common_prefix_length = 0;
+            for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) {
+                if (prefix[i] != full[i]) {
+                    break;
                 }
+                if (prefix[i] == '<') {
+                    // DeepSeek R1's template (as of 20250209) adds a trailing <think> if add_generation_prompt,
+                    // but it removes thinking tags for past messages.
+                    // The prefix and full strings diverge at <think> vs. <|tool▁calls▁begin|>, we avoid consuming the leading <.
+                    continue;
+                }
+                common_prefix_length = i + 1;
             }
-
+            auto example = full.substr(common_prefix_length);
+            if (example.find("tool_name") == std::string::npos && example.find("some_value") == std::string::npos) {
                 fprintf(stderr, "Failed to infer a tool call example (possible template bug)\n");
+            } else {
+                tool_call_example_ = example;
             }
-            tool_call_example_ = full.substr(prefix.size());
         }
     } catch (const std::exception & e) {
         fprintf(stderr, "Failed to generate tool call example: %s\n", e.what());
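The rewritten prefix logic above measures how much of the generation-prompt rendering is shared with the full tool-call rendering, and deliberately refuses to count a matching `<` so the inferred example keeps its opening tag when the two strings diverge inside a tag (e.g. `<think>` vs. `<|tool▁calls▁begin|>`). A hedged standalone sketch of that comparison with made-up strings (not real template output):

```cpp
#include <cstddef>
#include <cstdio>
#include <string>

// Hedged sketch of the common-prefix rule used above: advance while characters
// match, but never count a matching '<' so a diverging tag keeps its opening bracket.
static std::size_t common_prefix_skipping_lt(const std::string & prefix, const std::string & full) {
    std::size_t common_prefix_length = 0;
    for (std::size_t i = 0; i < prefix.size() && i < full.size(); ++i) {
        if (prefix[i] != full[i]) {
            break;
        }
        if (prefix[i] == '<') {
            continue; // both have '<' here, but the tags may diverge right after it
        }
        common_prefix_length = i + 1;
    }
    return common_prefix_length;
}

int main() {
    // Illustrative strings only (not actual DeepSeek R1 template output).
    const std::string prefix = "assistant: <think>";
    const std::string full   = "assistant: <|tool_call_begin|>{\"name\": \"tool_name\"}<|tool_call_end|>";
    const std::string example = full.substr(common_prefix_skipping_lt(prefix, full));
    // Prints the tool-call example including its leading '<'.
    printf("%s\n", example.c_str());
    return 0;
}
```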
@@ -363,7 +377,7 @@ class chat_template {
         if (polyfill_tools) {
             adjusted_messages = add_system(inputs.messages,
                 "You can call any of the following tools to satisfy the user's requests: " + minja::Value(inputs.tools).dump(2, /* to_json= */ true) +
-                (!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_));
+                (!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_ + "\n\n"));
         } else {
             adjusted_messages = inputs.messages;
         }