@fugood/llama.node 1.2.4 → 1.2.5
This diff shows the publicly released contents of the two package versions as they appear in their public registries. It is provided for informational purposes only.
- package/package.json +14 -14
- package/src/llama.cpp/common/arg.cpp +162 -134
- package/src/llama.cpp/common/chat-parser.cpp +2 -2
- package/src/llama.cpp/common/common.h +1 -1
- package/src/llama.cpp/common/json-partial.cpp +51 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +5 -4
- package/src/llama.cpp/src/llama-hparams.cpp +5 -1
- package/src/llama.cpp/src/llama-model.cpp +4 -4
package/package.json
CHANGED

@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.2.4",
+  "version": "1.2.5",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {

@@ -72,19 +72,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.2.4",
-    "@fugood/node-llama-linux-x64-vulkan": "1.2.4",
-    "@fugood/node-llama-linux-x64-cuda": "1.2.4",
-    "@fugood/node-llama-linux-arm64": "1.2.4",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.2.4",
-    "@fugood/node-llama-linux-arm64-cuda": "1.2.4",
-    "@fugood/node-llama-win32-x64": "1.2.4",
-    "@fugood/node-llama-win32-x64-vulkan": "1.2.4",
-    "@fugood/node-llama-win32-x64-cuda": "1.2.4",
-    "@fugood/node-llama-win32-arm64": "1.2.4",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.2.4",
-    "@fugood/node-llama-darwin-x64": "1.2.4",
-    "@fugood/node-llama-darwin-arm64": "1.2.4"
+    "@fugood/node-llama-linux-x64": "1.2.5",
+    "@fugood/node-llama-linux-x64-vulkan": "1.2.5",
+    "@fugood/node-llama-linux-x64-cuda": "1.2.5",
+    "@fugood/node-llama-linux-arm64": "1.2.5",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.2.5",
+    "@fugood/node-llama-linux-arm64-cuda": "1.2.5",
+    "@fugood/node-llama-win32-x64": "1.2.5",
+    "@fugood/node-llama-win32-x64-vulkan": "1.2.5",
+    "@fugood/node-llama-win32-x64-cuda": "1.2.5",
+    "@fugood/node-llama-win32-arm64": "1.2.5",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.2.5",
+    "@fugood/node-llama-darwin-x64": "1.2.5",
+    "@fugood/node-llama-darwin-arm64": "1.2.5"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/src/llama.cpp/common/arg.cpp
CHANGED

@@ -3358,7 +3358,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"--chat-template-kwargs"}, "STRING",
         string_format("sets additional params for the json template parser"),
-        [](common_params & params, const std::string & …
+        [](common_params & params, const std::string & value) {
             auto parsed = json::parse(value);
             for (const auto & item : parsed.items()) {
                 params.default_template_kwargs[item.key()] = item.value().dump();
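For context, the handler re-serializes each top-level value with dump(), so JSON string values keep their surrounding quotes when stored as template kwargs. A minimal standalone sketch of that behavior, using a plain std::map as a stand-in for the real default_template_kwargs field:

    // Sketch: how --chat-template-kwargs '{"reasoning_effort": "high"}' is digested.
    #include <nlohmann/json.hpp>
    #include <iostream>
    #include <map>
    #include <string>

    int main() {
        std::map<std::string, std::string> default_template_kwargs; // stand-in type
        const std::string value = R"({"reasoning_effort": "high", "enable_thinking": false})";
        auto parsed = nlohmann::json::parse(value);
        for (const auto & item : parsed.items()) {
            // dump() re-serializes each value, so strings keep their quotes: "high", false
            default_template_kwargs[item.key()] = item.value().dump();
        }
        for (const auto & [k, v] : default_template_kwargs) {
            std::cout << k << " = " << v << "\n";
        }
    }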
@@ -3570,21 +3570,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             common_log_set_file(common_log_main(), value.c_str());
         }
     ));
-    add_opt(common_arg(
-    …
+    add_opt(common_arg(
+        {"--log-colors"}, "[on|off|auto]",
+        "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
+        "'auto' enables colors when output is to a terminal",
+        [](common_params &, const std::string & value) {
+            if (is_truthy(value)) {
+                common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);
+            } else if (is_falsey(value)) {
+                common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);
+            } else if (is_autoy(value)) {
+                common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
+            } else {
+                throw std::invalid_argument(
+                    string_format("error: unkown value for --log-colors: '%s'\n", value.c_str()));
+            }
+        }
+    ).set_env("LLAMA_LOG_COLORS"));
     add_opt(common_arg(
         {"-v", "--verbose", "--log-verbose"},
         "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
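The new option defers token classification to is_truthy / is_falsey / is_autoy, whose definitions are not part of this diff; either the flag or the LLAMA_LOG_COLORS environment variable (wired up via set_env) reaches the same handler. A hypothetical sketch of such predicates (the exact spellings llama.cpp accepts may differ):

    #include <algorithm>
    #include <cctype>
    #include <string>

    // Hypothetical helpers; the real ones live elsewhere in llama.cpp's common library.
    static std::string to_lower(std::string s) {
        std::transform(s.begin(), s.end(), s.begin(),
                       [](unsigned char c) { return std::tolower(c); });
        return s;
    }
    static bool is_truthy(const std::string & v) {
        const auto s = to_lower(v);
        return s == "on" || s == "true" || s == "1" || s == "yes";
    }
    static bool is_falsey(const std::string & v) {
        const auto s = to_lower(v);
        return s == "off" || s == "false" || s == "0" || s == "no";
    }
    static bool is_autoy(const std::string & v) {
        return to_lower(v) == "auto";
    }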
@@ -3850,7 +3852,87 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_TTS}));

-    …
+    add_opt(common_arg(
+        {"--diffusion-steps"}, "N",
+        string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
+        [](common_params & params, int value) { params.diffusion.steps = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-visual"},
+        string_format("enable visual diffusion mode (show progressive generation) (default: %s)", params.diffusion.visual_mode ? "true" : "false"),
+        [](common_params & params) { params.diffusion.visual_mode = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-eps"}, "F",
+        string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
+        [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-algorithm"}, "N",
+        string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)", params.diffusion.algorithm),
+        [](common_params & params, int value) { params.diffusion.algorithm = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-alg-temp"}, "F",
+        string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+        [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-block-length"}, "N",
+        string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
+        [](common_params & params, int value) { params.diffusion.block_length = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-cfg-scale"}, "F",
+        string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
+        [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-add-gumbel-noise"}, "F",
+        string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
+        [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "-lr", "--learning-rate" }, "ALPHA",
+        string_format("adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)", (double) params.lr.lr0),
+        [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
+        string_format("(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
+                      (double) params.lr.lr_min),
+        [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        {"-decay-epochs", "--learning-rate-decay-epochs"}, "ALPHA",
+        string_format("(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)", (double) params.lr.decay_epochs),
+        [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        {"-wd", "--weight-decay"}, "WD",
+        string_format("adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).", (double) params.lr.wd),
+        [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        {"-val-split", "--val-split"}, "FRACTION",
+        string_format("fraction of data to use as validation set for training (default: %.2g).", (double) params.val_split),
+        [](common_params & params, const std::string & value) { params.val_split = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        {"-epochs", "--epochs"}, "N",
+        string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
+        [](common_params & params, int epochs) { params.lr.epochs = epochs; }
+    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        {"-opt", "--optimizer"}, "sgd|adamw", "adamw or sgd",
+        [](common_params & params, const std::string & name) {
+            params.optimizer = common_opt_get_optimizer(name.c_str());
+            if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
+                throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
+            }
+        }
+    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+
+    // presets
     add_opt(common_arg(
         {"--tts-oute-default"},
         string_format("use default OuteTTS models (note: can download weights from the internet)"),
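The -opt handler relies on common_opt_get_optimizer returning the GGML_OPT_OPTIMIZER_TYPE_COUNT sentinel for unrecognized names; the helper itself is defined elsewhere in common/. A guessed sketch of the lookup pattern (the real body may differ):

    #include <strings.h> // strcasecmp (POSIX)

    // Mirrors ggml's optimizer type enum; COUNT doubles as the "invalid" sentinel.
    enum ggml_opt_optimizer_type {
        GGML_OPT_OPTIMIZER_TYPE_ADAMW,
        GGML_OPT_OPTIMIZER_TYPE_SGD,
        GGML_OPT_OPTIMIZER_TYPE_COUNT,
    };

    static ggml_opt_optimizer_type common_opt_get_optimizer(const char * name) {
        if (strcasecmp(name, "adamw") == 0) { return GGML_OPT_OPTIMIZER_TYPE_ADAMW; }
        if (strcasecmp(name, "sgd")   == 0) { return GGML_OPT_OPTIMIZER_TYPE_SGD;   }
        return GGML_OPT_OPTIMIZER_TYPE_COUNT; // unknown name: caller throws
    }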
@@ -3863,39 +3945,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_TTS}));

     add_opt(common_arg(
-        {"--embd-bge-small-en-default"},
-        string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
-            params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf";
-            params.embd_normalize = 2;
-            params.n_ctx = 512;
-            params.verbose_prompt = true;
-            params.embedding = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
-
-    add_opt(common_arg(
-        {"--embd-e5-small-en-default"},
-        string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
-            params.model.hf_file = "e5-small-v2-q8_0.gguf";
-            params.embd_normalize = 2;
-            params.n_ctx = 512;
-            params.verbose_prompt = true;
-            params.embedding = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
-
-    add_opt(common_arg(
-        {"--embd-gte-small-default"},
-        string_format("use default gte-small model (note: can download weights from the internet)"),
+        {"--embd-gemma-default"},
+        string_format("use default EmbeddingGemma model (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.model.hf_repo = "ggml-org/…
-            params.model.hf_file = "…
-            params.…
-            params.…
+            params.model.hf_repo = "ggml-org/embeddinggemma-300M-qat-q4_0-GGUF";
+            params.model.hf_file = "embeddinggemma-300M-qat-Q4_0.gguf";
+            params.port = 8011;
+            params.n_ubatch = 2048;
+            params.n_batch = 2048;
+            params.n_parallel = 32;
+            params.n_ctx = 2048*params.n_parallel;
             params.verbose_prompt = true;
             params.embedding = true;
         }
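Note the context sizing in the new preset: with params.n_parallel = 32 and params.n_ctx = 2048*params.n_parallel, the server reserves 32 x 2048 = 65536 tokens of context in total, i.e. 2048 tokens per parallel slot, matching the 2048-token n_batch and n_ubatch.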
@@ -3990,96 +4049,65 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}));

     add_opt(common_arg(
-        {…
-        string_format("…
-        [](common_params & params…
-        …
+        {"--gpt-oss-20b-default"},
+        string_format("use gpt-oss-20b (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/gpt-oss-20b-GGUF";
+            params.model.hf_file = "gpt-oss-20b-mxfp4.gguf";
+            params.port = 8013;
+            params.n_ubatch = 2048;
+            params.n_batch = 32768;
+            params.n_parallel = 2;
+            params.n_ctx = 131072*params.n_parallel;
+            params.sampling.temp = 1.0f;
+            params.sampling.top_p = 1.0f;
+            params.sampling.top_k = 0;
+            params.sampling.min_p = 0.01f;
+            params.use_jinja = true;
+            //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));

     add_opt(common_arg(
-        {…
-        string_format("…
-        [](common_params & params…
-        …
+        {"--gpt-oss-120b-default"},
+        string_format("use gpt-oss-120b (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/gpt-oss-120b-GGUF";
+            params.port = 8013;
+            params.n_ubatch = 2048;
+            params.n_batch = 32768;
+            params.n_parallel = 2;
+            params.n_ctx = 131072*params.n_parallel;
+            params.sampling.temp = 1.0f;
+            params.sampling.top_p = 1.0f;
+            params.sampling.top_k = 0;
+            params.sampling.min_p = 0.01f;
+            params.use_jinja = true;
+            //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));

     add_opt(common_arg(
-        {…
-        string_format("…
-        [](common_params & params…
-        …
-    ).set_examples({…
-    add_opt(common_arg(
-        { "--diffusion-add-gumbel-noise" }, "F",
-        string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
-        [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
-    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
-    …
+        {"--vision-gemma-4b-default"},
+        string_format("use Gemma 3 4B QAT (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/gemma-3-4b-it-qat-GGUF";
+            params.port = 8014;
+            params.n_ctx = 0;
+            params.use_jinja = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));

-    add_opt(…
-    …
-        .…
-    …
-            "(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
-            (double) params.lr.lr_min),
-        [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); })
-        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
-    add_opt(
-        common_arg({ "-decay-epochs", "--learning-rate-decay-epochs" }, "ALPHA",
-            string_format(
-                "(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)",
-                (double) params.lr.decay_epochs),
-            [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); })
-            .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
-    add_opt(common_arg(
-        { "-wd", "--weight-decay" }, "WD",
-        string_format(
-            "adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).",
-            (double) params.lr.wd),
-        [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); })
-        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
-    add_opt(common_arg({ "-val-split", "--val-split" }, "FRACTION",
-        string_format("fraction of data to use as validation set for training (default: %.2g).",
-            (double) params.val_split),
-        [](common_params & params, const std::string & value) { params.val_split = std::stof(value); })
-        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
-    add_opt(common_arg({ "-epochs", "--epochs" }, "N",
-        string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
-        [](common_params & params, int epochs) { params.lr.epochs = epochs; })
-        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
-    add_opt(common_arg({ "-opt", "--optimizer" }, "sgd|adamw", "adamw or sgd",
-        [](common_params & params, const std::string & name) {
-            params.optimizer = common_opt_get_optimizer(name.c_str());
-            if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
-                throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
-            }
-        })
-        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        {"--vision-gemma-12b-default"},
+        string_format("use Gemma 3 12B QAT (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/gemma-3-12b-it-qat-GGUF";
+            params.port = 8014;
+            params.n_ctx = 0;
+            params.use_jinja = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));

     return ctx_arg;
 }
package/src/llama.cpp/common/chat-parser.cpp
CHANGED

@@ -432,7 +432,7 @@ std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parse
     if (is_arguments_path({})) {
        // Entire JSON is the arguments and was parsed fully.
        return consume_json_result {
-            partial->json.dump(),
+            partial->json.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true),
            /* .is_partial = */ false,
        };
    }

@@ -444,7 +444,7 @@ std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parse
    std::vector<std::string> path;
    std::function<json(const json &)> remove_unsupported_healings_and_dump_args = [&](const json & j) -> json {
        if (is_arguments_path(path)) {
-            auto arguments = j.dump();
+            auto arguments = j.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true);
            if (is_partial() && !partial->healing_marker.marker.empty()) {
                auto idx = arguments.find(partial->healing_marker.json_dump_marker);
                if (idx != std::string::npos) {
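Both call sites now spell out dump()'s arguments. In nlohmann::json, dump(indent, indent_char, ensure_ascii) with ensure_ascii = true emits every non-ASCII code point as a \uXXXX escape, so the serialized tool-call arguments stay plain ASCII. A standalone demonstration:

    #include <nlohmann/json.hpp>
    #include <iostream>

    int main() {
        nlohmann::ordered_json j = {{"city", "Zürich"}};
        std::cout << j.dump() << "\n";              // {"city":"Zürich"}
        std::cout << j.dump(-1, ' ', true) << "\n"; // {"city":"Z\u00fcrich"}
    }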
package/src/llama.cpp/common/common.h
CHANGED

@@ -427,7 +427,7 @@ struct common_params {
     int32_t n_threads_http  = -1; // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse   =  0; // min chunk size to reuse from the cache via KV shifting
     int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
-    int32_t cache_ram_mib = 8192; // …
+    int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.

     std::string hostname      = "127.0.0.1";
     std::string public_path   = ""; // NOLINT
package/src/llama.cpp/common/json-partial.cpp
CHANGED

@@ -5,6 +5,7 @@
 #include <nlohmann/json.hpp>

 #include <string>
+#include <regex>

 using json = nlohmann::ordered_json;

@@ -168,6 +169,47 @@ bool common_json_parse(
         }
     }

+    // Matches a potentially partial unicode escape sequence, e.g. \u, \uX, \uXX, \uXXX, \uXXXX
+    static const std::regex partial_unicode_regex(R"(\\u(?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F])?)?)?)?$)");
+
+    auto is_high_surrogate = [&](const std::string & s) {
+        // Check if a partial of a high surrogate (U+D800-U+DBFF)
+        return s.length() >= 4 &&
+               s[0] == '\\' && s[1] == 'u' &&
+               std::tolower(s[2]) == 'd' &&
+               (s[3] == '8' || s[3] == '9' || std::tolower(s[3]) == 'a' || std::tolower(s[3]) == 'b');
+    };
+
+    // Initialize the unicode marker to a low surrogate to handle the edge case
+    // where a high surrogate (U+D800-U+DBFF) is immediately followed by a
+    // backslash (\)
+    std::string unicode_marker_padding = "udc00";
+    std::smatch last_unicode_seq;
+
+    if (std::regex_search(str, last_unicode_seq, partial_unicode_regex)) {
+        std::smatch second_last_seq;
+        std::string prelude = str.substr(0, last_unicode_seq.position());
+
+        // Pad the escape sequence with 0s until it forms a complete sequence of 6 characters
+        unicode_marker_padding = std::string(6 - last_unicode_seq.length(), '0');
+
+        if (is_high_surrogate(last_unicode_seq.str())) {
+            // If the sequence is a partial match for a high surrogate, add a low surrogate (U+DC00-U+DFFF)
+            unicode_marker_padding += "\\udc00";
+        } else if (std::regex_search(prelude, second_last_seq, partial_unicode_regex)) {
+            if (is_high_surrogate(second_last_seq.str())) {
+                // If this follows a high surrogate, pad it to be a low surrogate
+                if (last_unicode_seq.length() == 2) {
+                    unicode_marker_padding = "dc00";
+                } else if (last_unicode_seq.length() == 3) {
+                    unicode_marker_padding = "c00";
+                } else {
+                    // The original unicode_marker_padding is already padded with 0s
+                }
+            }
+        }
+    }
+
     const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$";

     if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) {

@@ -186,6 +228,9 @@ bool common_json_parse(
         } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
             // Was inside an object value string after an escape
             str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
+        } else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
+            // Was inside an object value string after a partial unicode escape
+            str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
         } else {
             // find last :
             auto last_pos = str.find_last_of(':');

@@ -205,6 +250,9 @@ bool common_json_parse(
         } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
             // Was inside an array value string after an escape
             str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
+        } else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
+            // Was inside an array value string after a partial unicode escape
+            str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
         } else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) {
             // Had just finished a value
             str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing;

@@ -230,6 +278,9 @@ bool common_json_parse(
         } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) {
             // Was inside an object key string after an escape
             str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing;
+        } else if (can_parse(str + unicode_marker_padding + "\": 1" + closing)) {
+            // Was inside an object key string after a partial unicode escape
+            str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\": 1" + closing;
         } else {
             auto last_pos = str.find_last_of(':');
             if (last_pos == std::string::npos) {
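The net effect of these hunks: a generation that stops mid \uXXXX escape can still be healed into parseable JSON. A simplified, self-contained illustration of the padding rule (omitting the healing marker and the second-to-last-sequence handling of the real code):

    #include <cctype>
    #include <iostream>
    #include <regex>
    #include <string>

    int main() {
        // Same pattern as the diff: matches a trailing, possibly partial \uXXXX escape.
        static const std::regex partial_unicode_regex(
            R"(\\u(?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F])?)?)?)?$)");

        for (std::string s : {R"({"k": "\u00e)", R"({"k": "\uD8)"}) {
            std::smatch m;
            if (std::regex_search(s, m, partial_unicode_regex)) {
                std::string pad(6 - m.length(), '0'); // complete \uXXXX to 6 chars
                const std::string t = m.str();
                const bool high = t.length() >= 4 && std::tolower(t[2]) == 'd' &&
                                  std::string("89abAB").find(t[3]) != std::string::npos;
                if (high) {
                    pad += "\\udc00"; // pair a (partial) high surrogate with a low one
                }
                std::cout << s + pad + "\"}" << "\n"; // {"k": "\u00e0"} / {"k": "\uD800\udc00"}
            }
        }
    }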
package/src/llama.cpp/ggml/src/ggml-cpu/vec.h
CHANGED

@@ -144,14 +144,14 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
     for (int i = 0; i < np; i += ggml_f16_step) {
         ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements

-        ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 …
+        ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elements
         sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1
         ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements
         sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);

         ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements

-        ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 …
+        ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 elements
         sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
         ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
         sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);

@@ -160,7 +160,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG

         ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
         sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
-        …
+        ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
         sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);

         ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);

@@ -820,7 +820,8 @@ inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_f
 inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
 inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        …
+        const float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : expm1f(v));
     }
 }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
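The updated ggml_vec_elu_f16 computes in fp32: each half-precision input is widened, run through ELU (x for x > 0, expm1f(x) otherwise), and narrowed back. A standalone analogue, assuming a compiler with the _Float16 extension (GCC/Clang) in place of ggml_fp16_t, and plain casts in place of the GGML_CPU_FP16_TO_FP32 / GGML_CPU_FP32_TO_FP16 macros:

    #include <cmath>
    #include <cstdio>

    static void vec_elu_f16(const int n, _Float16 * y, const _Float16 * x) {
        for (int i = 0; i < n; ++i) {
            const float v = (float) x[i];                  // widen fp16 -> fp32
            y[i] = (_Float16) ((v > 0.f) ? v : expm1f(v)); // ELU, narrow back
        }
    }

    int main() {
        _Float16 x[3] = { (_Float16) -2.0f, (_Float16) 0.0f, (_Float16) 1.5f };
        _Float16 y[3];
        vec_elu_f16(3, y, x);
        for (int i = 0; i < 3; ++i) {
            printf("%f\n", (double) y[i]); // approx. -0.8647, 0.0, 1.5
        }
    }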
package/src/llama.cpp/src/llama-hparams.cpp
CHANGED

@@ -140,7 +140,11 @@ uint32_t llama_hparams::n_embd_s() const {
 }

 bool llama_hparams::is_recurrent(uint32_t il) const {
-    …
+    if (il < n_layer) {
+        return recurrent_layer_arr[il];
+    }
+
+    GGML_ABORT("%s: il (%u) out of bounds (n_layer: %u)\n", __func__, il, n_layer);
 }

 uint32_t llama_hparams::n_pos_per_embd() const {
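The rewritten accessor validates il before indexing recurrent_layer_arr: in-range layers return their recurrence flag, while an out-of-range il now aborts with a descriptive message via GGML_ABORT instead of reading past the array.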
package/src/llama.cpp/src/llama-model.cpp
CHANGED

@@ -16313,10 +16313,10 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
     }

     ggml_tensor * build_layer_ffn(
-        ggml_tensor …
-        ggml_tensor …
-        const llama_model …
-        const int …
+        ggml_tensor * cur,
+        ggml_tensor * inpSA,
+        const llama_model & model,
+        const int il) {

         // For Granite architectures - scale residual
         if (hparams.f_residual_scale) {