@fugood/llama.node 1.2.4 → 1.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/llama.cpp/common/arg.cpp +162 -134
- package/src/llama.cpp/common/chat-parser.cpp +2 -2
- package/src/llama.cpp/common/common.h +1 -1
- package/src/llama.cpp/common/json-partial.cpp +51 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +5 -4
- package/src/llama.cpp/src/llama-graph.cpp +74 -43
- package/src/llama.cpp/src/llama-graph.h +7 -3
- package/src/llama.cpp/src/llama-hparams.cpp +5 -1
- package/src/llama.cpp/src/llama-model.cpp +9 -10
- package/src/llama.cpp/src/llama.cpp +1 -0
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.2.4",
+  "version": "1.2.6",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,19 +72,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.2.4",
-    "@fugood/node-llama-linux-x64-vulkan": "1.2.4",
-    "@fugood/node-llama-linux-x64-cuda": "1.2.4",
-    "@fugood/node-llama-linux-arm64": "1.2.4",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.2.4",
-    "@fugood/node-llama-linux-arm64-cuda": "1.2.4",
-    "@fugood/node-llama-win32-x64": "1.2.4",
-    "@fugood/node-llama-win32-x64-vulkan": "1.2.4",
-    "@fugood/node-llama-win32-x64-cuda": "1.2.4",
-    "@fugood/node-llama-win32-arm64": "1.2.4",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.2.4",
-    "@fugood/node-llama-darwin-x64": "1.2.4",
-    "@fugood/node-llama-darwin-arm64": "1.2.4"
+    "@fugood/node-llama-linux-x64": "1.2.6",
+    "@fugood/node-llama-linux-x64-vulkan": "1.2.6",
+    "@fugood/node-llama-linux-x64-cuda": "1.2.6",
+    "@fugood/node-llama-linux-arm64": "1.2.6",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.2.6",
+    "@fugood/node-llama-linux-arm64-cuda": "1.2.6",
+    "@fugood/node-llama-win32-x64": "1.2.6",
+    "@fugood/node-llama-win32-x64-vulkan": "1.2.6",
+    "@fugood/node-llama-win32-x64-cuda": "1.2.6",
+    "@fugood/node-llama-win32-arm64": "1.2.6",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.2.6",
+    "@fugood/node-llama-darwin-x64": "1.2.6",
+    "@fugood/node-llama-darwin-arm64": "1.2.6"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",

package/src/llama.cpp/common/arg.cpp
CHANGED

@@ -3358,7 +3358,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"--chat-template-kwargs"}, "STRING",
         string_format("sets additional params for the json template parser"),
-        [](common_params & params, const std::string &
+        [](common_params & params, const std::string & value) {
             auto parsed = json::parse(value);
             for (const auto & item : parsed.items()) {
                 params.default_template_kwargs[item.key()] = item.value().dump();
@@ -3570,21 +3570,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             common_log_set_file(common_log_main(), value.c_str());
         }
     ));
-    add_opt(common_arg(
-    …
+    add_opt(common_arg(
+        {"--log-colors"}, "[on|off|auto]",
+        "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
+        "'auto' enables colors when output is to a terminal",
+        [](common_params &, const std::string & value) {
+            if (is_truthy(value)) {
+                common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);
+            } else if (is_falsey(value)) {
+                common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);
+            } else if (is_autoy(value)) {
+                common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
+            } else {
+                throw std::invalid_argument(
+                    string_format("error: unkown value for --log-colors: '%s'\n", value.c_str()));
+            }
+        }
+    ).set_env("LLAMA_LOG_COLORS"));
     add_opt(common_arg(
         {"-v", "--verbose", "--log-verbose"},
         "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
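
For context: `--log-colors` is now tri-state rather than boolean. A rough standalone sketch of the dispatch (the real `is_truthy`/`is_falsey`/`is_autoy` helpers live elsewhere in arg.cpp; the accepted spellings below are an assumption for illustration):

    #include <algorithm>
    #include <cctype>
    #include <initializer_list>
    #include <iostream>
    #include <stdexcept>
    #include <string>

    enum class log_colors_mode { enabled, disabled, autodetect };

    // Assumed spellings; the upstream helpers may accept a different set.
    static bool word_in(std::string v, std::initializer_list<const char *> ws) {
        std::transform(v.begin(), v.end(), v.begin(),
                       [](unsigned char c) { return (char) std::tolower(c); });
        for (const char * w : ws) { if (v == w) return true; }
        return false;
    }

    static log_colors_mode parse_log_colors(const std::string & value) {
        if (word_in(value, {"on",  "true",  "yes", "1"})) return log_colors_mode::enabled;
        if (word_in(value, {"off", "false", "no",  "0"})) return log_colors_mode::disabled;
        if (word_in(value, {"auto"}))                     return log_colors_mode::autodetect;
        throw std::invalid_argument("unknown value for --log-colors: '" + value + "'");
    }

    int main() {
        std::cout << (parse_log_colors("ON") == log_colors_mode::enabled) << '\n'; // 1
    }

Because the option also carries `.set_env("LLAMA_LOG_COLORS")`, the same values can be supplied through the environment instead of the flag.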
@@ -3850,7 +3852,87 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_TTS}));
 
-
+    add_opt(common_arg(
+        {"--diffusion-steps"}, "N",
+        string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
+        [](common_params & params, int value) { params.diffusion.steps = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-visual"},
+        string_format("enable visual diffusion mode (show progressive generation) (default: %s)", params.diffusion.visual_mode ? "true" : "false"),
+        [](common_params & params) { params.diffusion.visual_mode = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-eps"}, "F",
+        string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
+        [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-algorithm"}, "N",
+        string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)", params.diffusion.algorithm),
+        [](common_params & params, int value) { params.diffusion.algorithm = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-alg-temp"}, "F",
+        string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+        [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-block-length"}, "N",
+        string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
+        [](common_params & params, int value) { params.diffusion.block_length = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-cfg-scale"}, "F",
+        string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
+        [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-add-gumbel-noise"}, "F",
+        string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
+        [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "-lr", "--learning-rate" }, "ALPHA",
+        string_format("adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)", (double) params.lr.lr0),
+        [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
+        string_format("(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
+            (double) params.lr.lr_min),
+        [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        {"-decay-epochs", "--learning-rate-decay-epochs"}, "ALPHA",
+        string_format("(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)", (double) params.lr.decay_epochs),
+        [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        {"-wd", "--weight-decay"}, "WD",
+        string_format("adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).", (double) params.lr.wd),
+        [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        {"-val-split", "--val-split"}, "FRACTION",
+        string_format("fraction of data to use as validation set for training (default: %.2g).", (double) params.val_split),
+        [](common_params & params, const std::string & value) { params.val_split = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        {"-epochs", "--epochs"}, "N",
+        string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
+        [](common_params & params, int epochs) { params.lr.epochs = epochs; }
+    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        {"-opt", "--optimizer"}, "sgd|adamw", "adamw or sgd",
+        [](common_params & params, const std::string & name) {
+            params.optimizer = common_opt_get_optimizer(name.c_str());
+            if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
+                throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
+            }
+        }
+    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+
+    // presets
     add_opt(common_arg(
         {"--tts-oute-default"},
         string_format("use default OuteTTS models (note: can download weights from the internet)"),
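
For context: the `-lr-min`/`-decay-epochs` help text implies an exponential schedule decaying from `-lr` to `-lr-min` over `-decay-epochs` epochs. One schedule consistent with that description (the finetune example's actual formula is not part of this diff, so treat the shape as an assumption):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    // lr(0) = lr0 and lr(decay_epochs) = lr_min, clamped below at lr_min.
    static double lr_at_epoch(double lr0, double lr_min, double decay_epochs, int epoch) {
        if (lr_min <= 0.0 || decay_epochs <= 0.0) {
            return lr0; // decay disabled
        }
        const double k = std::log(lr0 / lr_min) / decay_epochs;
        return std::max(lr_min, lr0 * std::exp(-k * epoch));
    }

    int main() {
        for (int e = 0; e <= 10; e += 2) {
            std::printf("epoch %2d: lr = %.3g\n", e, lr_at_epoch(1e-4, 1e-5, 10.0, e));
        }
    }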
@@ -3863,39 +3945,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_TTS}));
 
     add_opt(common_arg(
-        {"--embd-bge-small-en-default"},
-        string_format("use default bge-small model (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
-            params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf";
-            params.embd_normalize = 2;
-            params.n_ctx = 512;
-            params.verbose_prompt = true;
-            params.embedding = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
-
-    add_opt(common_arg(
-        {"--embd-e5-small-en-default"},
-        string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
-            params.model.hf_file = "e5-small-v2-q8_0.gguf";
-            params.embd_normalize = 2;
-            params.n_ctx = 512;
-            params.verbose_prompt = true;
-            params.embedding = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
-
-    add_opt(common_arg(
-        {"--embd-gte-small-default"},
-        string_format("use default gte-small model (note: can download weights from the internet)"),
+        {"--embd-gemma-default"},
+        string_format("use default EmbeddingGemma model (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.model.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
-            params.model.hf_file = "gte-small-q8_0.gguf";
-            params.embd_normalize = 2;
-            params.n_ctx = 512;
+            params.model.hf_repo = "ggml-org/embeddinggemma-300M-qat-q4_0-GGUF";
+            params.model.hf_file = "embeddinggemma-300M-qat-Q4_0.gguf";
+            params.port = 8011;
+            params.n_ubatch = 2048;
+            params.n_batch = 2048;
+            params.n_parallel = 32;
+            params.n_ctx = 2048*params.n_parallel;
             params.verbose_prompt = true;
             params.embedding = true;
         }
@@ -3990,96 +4049,65 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
     add_opt(common_arg(
-        {…
-        string_format("…
-        [](common_params & params…
-        …
+        {"--gpt-oss-20b-default"},
+        string_format("use gpt-oss-20b (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/gpt-oss-20b-GGUF";
+            params.model.hf_file = "gpt-oss-20b-mxfp4.gguf";
+            params.port = 8013;
+            params.n_ubatch = 2048;
+            params.n_batch = 32768;
+            params.n_parallel = 2;
+            params.n_ctx = 131072*params.n_parallel;
+            params.sampling.temp = 1.0f;
+            params.sampling.top_p = 1.0f;
+            params.sampling.top_k = 0;
+            params.sampling.min_p = 0.01f;
+            params.use_jinja = true;
+            //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
     add_opt(common_arg(
-        {…
-        string_format("…
-        [](common_params & params…
-        …
+        {"--gpt-oss-120b-default"},
+        string_format("use gpt-oss-120b (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/gpt-oss-120b-GGUF";
+            params.port = 8013;
+            params.n_ubatch = 2048;
+            params.n_batch = 32768;
+            params.n_parallel = 2;
+            params.n_ctx = 131072*params.n_parallel;
+            params.sampling.temp = 1.0f;
+            params.sampling.top_p = 1.0f;
+            params.sampling.top_k = 0;
+            params.sampling.min_p = 0.01f;
+            params.use_jinja = true;
+            //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
     add_opt(common_arg(
-        {…
-        string_format("…
-        [](common_params & params…
-        …
-    ).set_examples({…
-    add_opt(common_arg(
-        { "--diffusion-add-gumbel-noise" }, "F",
-        string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
-        [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
-    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
-
+        {"--vision-gemma-4b-default"},
+        string_format("use Gemma 3 4B QAT (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/gemma-3-4b-it-qat-GGUF";
+            params.port = 8014;
+            params.n_ctx = 0;
+            params.use_jinja = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
-    add_opt(
-        …
-        "(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
-        (double) params.lr.lr_min),
-        [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); })
-        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
-    add_opt(
-        common_arg({ "-decay-epochs", "--learning-rate-decay-epochs" }, "ALPHA",
-            string_format(
-                "(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)",
-                (double) params.lr.decay_epochs),
-            [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); })
-            .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
-    add_opt(common_arg(
-        { "-wd", "--weight-decay" }, "WD",
-        string_format(
-            "adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).",
-            (double) params.lr.wd),
-        [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); })
-        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
-    add_opt(common_arg({ "-val-split", "--val-split" }, "FRACTION",
-        string_format("fraction of data to use as validation set for training (default: %.2g).",
-            (double) params.val_split),
-        [](common_params & params, const std::string & value) { params.val_split = std::stof(value); })
-        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
-    add_opt(common_arg({ "-epochs", "--epochs" }, "N",
-        string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
-        [](common_params & params, int epochs) { params.lr.epochs = epochs; })
-        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
-    add_opt(common_arg({ "-opt", "--optimizer" }, "sgd|adamw", "adamw or sgd",
-        [](common_params & params, const std::string & name) {
-            params.optimizer = common_opt_get_optimizer(name.c_str());
-            if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
-                throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
-            }
-        })
-        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        {"--vision-gemma-12b-default"},
+        string_format("use Gemma 3 12B QAT (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/gemma-3-12b-it-qat-GGUF";
+            params.port = 8014;
+            params.n_ctx = 0;
+            params.use_jinja = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
     return ctx_arg;
 }

package/src/llama.cpp/common/chat-parser.cpp
CHANGED

@@ -432,7 +432,7 @@ std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parse
     if (is_arguments_path({})) {
         // Entire JSON is the arguments and was parsed fully.
        return consume_json_result {
-            partial->json.dump(),
+            partial->json.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true),
            /* .is_partial = */ false,
        };
    }
@@ -444,7 +444,7 @@ std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parse
    std::vector<std::string> path;
    std::function<json(const json &)> remove_unsupported_healings_and_dump_args = [&](const json & j) -> json {
        if (is_arguments_path(path)) {
-            auto arguments = j.dump();
+            auto arguments = j.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true);
            if (is_partial() && !partial->healing_marker.marker.empty()) {
                auto idx = arguments.find(partial->healing_marker.json_dump_marker);
                if (idx != std::string::npos) {
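
For context: both call sites switch from the default `dump()` to the explicit three-argument overload of `nlohmann::json::dump`. `indent = -1` keeps the compact single-line form, and `ensure_ascii = true` escapes every non-ASCII character as `\uXXXX`, which plausibly keeps marker searches over the dumped tool-call arguments byte-stable regardless of string content. A quick illustration:

    #include <iostream>
    #include <nlohmann/json.hpp>

    int main() {
        nlohmann::ordered_json j = { {"city", "Zürich"} };
        // default: compact, but non-ASCII passes through verbatim
        std::cout << j.dump() << '\n';              // {"city":"Zürich"}
        // compact and ASCII-only, as used above
        std::cout << j.dump(-1, ' ', true) << '\n'; // {"city":"Z\u00fcrich"}
    }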

package/src/llama.cpp/common/common.h
CHANGED

@@ -427,7 +427,7 @@ struct common_params {
     int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
     int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
-    int32_t cache_ram_mib = 8192; //
+    int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
 
     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT

package/src/llama.cpp/common/json-partial.cpp
CHANGED

@@ -5,6 +5,7 @@
 #include <nlohmann/json.hpp>
 
 #include <string>
+#include <regex>
 
 using json = nlohmann::ordered_json;
 
@@ -168,6 +169,47 @@ bool common_json_parse(
         }
     }
 
+    // Matches a potentially partial unicode escape sequence, e.g. \u, \uX, \uXX, \uXXX, \uXXXX
+    static const std::regex partial_unicode_regex(R"(\\u(?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F])?)?)?)?$)");
+
+    auto is_high_surrogate = [&](const std::string & s) {
+        // Check if a partial of a high surrogate (U+D800-U+DBFF)
+        return s.length() >= 4 &&
+               s[0] == '\\' && s[1] == 'u' &&
+               std::tolower(s[2]) == 'd' &&
+               (s[3] == '8' || s[3] == '9' || std::tolower(s[3]) == 'a' || std::tolower(s[3]) == 'b');
+    };
+
+    // Initialize the unicode marker to a low surrogate to handle the edge case
+    // where a high surrogate (U+D800-U+DBFF) is immediately followed by a
+    // backslash (\)
+    std::string unicode_marker_padding = "udc00";
+    std::smatch last_unicode_seq;
+
+    if (std::regex_search(str, last_unicode_seq, partial_unicode_regex)) {
+        std::smatch second_last_seq;
+        std::string prelude = str.substr(0, last_unicode_seq.position());
+
+        // Pad the escape sequence with 0s until it forms a complete sequence of 6 characters
+        unicode_marker_padding = std::string(6 - last_unicode_seq.length(), '0');
+
+        if (is_high_surrogate(last_unicode_seq.str())) {
+            // If the sequence is a partial match for a high surrogate, add a low surrogate (U+DC00-U+UDFF)
+            unicode_marker_padding += "\\udc00";
+        } else if (std::regex_search(prelude, second_last_seq, partial_unicode_regex)) {
+            if (is_high_surrogate(second_last_seq.str())) {
+                // If this follows a high surrogate, pad it to be a low surrogate
+                if (last_unicode_seq.length() == 2) {
+                    unicode_marker_padding = "dc00";
+                } else if (last_unicode_seq.length() == 3) {
+                    unicode_marker_padding = "c00";
+                } else {
+                    // The original unicode_marker_padding is already padded with 0s
+                }
+            }
+        }
+    }
+
     const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$";
 
     if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) {
@@ -186,6 +228,9 @@ bool common_json_parse(
         } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
             // Was inside an object value string after an escape
             str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
+        } else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
+            // Was inside an object value string after a partial unicode escape
+            str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
         } else {
             // find last :
             auto last_pos = str.find_last_of(':');
@@ -205,6 +250,9 @@ bool common_json_parse(
         } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
             // Was inside an array value string after an escape
             str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
+        } else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
+            // Was inside an array value string after a partial unicode escape
+            str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
         } else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) {
             // Had just finished a value
             str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing;
@@ -230,6 +278,9 @@ bool common_json_parse(
         } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) {
             // Was inside an object key string after an escape
             str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing;
+        } else if (can_parse(str + unicode_marker_padding + "\": 1" + closing)) {
+            // Was inside an object key string after a partial unicode escape
+            str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\": 1" + closing;
         } else {
             auto last_pos = str.find_last_of(':');
             if (last_pos == std::string::npos) {
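
For context: the new code heals JSON that was cut off in the middle of a `\uXXXX` escape by padding the escape to a full six characters before appending the healing marker. A standalone sketch of just that padding computation (simplified: it covers the pad-with-zeros and trailing-high-surrogate cases, not the lookback at the second-to-last sequence):

    #include <cctype>
    #include <cstring>
    #include <iostream>
    #include <regex>
    #include <string>

    // Trailing, possibly incomplete \uXXXX escape (equivalent here to the diff's nested form).
    static const std::regex partial_unicode(R"(\\u[0-9a-fA-F]{0,4}$)");

    static bool is_high_surrogate(const std::string & s) {
        // \uD8xx .. \uDBxx starts a surrogate pair
        return s.size() >= 4 && s[0] == '\\' && s[1] == 'u' &&
               std::tolower((unsigned char) s[2]) == 'd' &&
               std::strchr("89abAB", s[3]) != nullptr;
    }

    static std::string padding_for(const std::string & str) {
        std::smatch m;
        if (!std::regex_search(str, m, partial_unicode)) {
            return "";
        }
        std::string pad(6 - m.length(), '0'); // complete the escape to \uXXXX
        if (is_high_surrogate(m.str())) {
            pad += "\\udc00";                 // pair it with a low surrogate
        }
        return pad;
    }

    int main() {
        std::cout << padding_for("{\"k\": \"\\u")   << '\n'; // 0000
        std::cout << padding_for("{\"k\": \"\\ud8") << '\n'; // 00\udc00
    }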

package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c
CHANGED

@@ -689,8 +689,13 @@ bool ggml_is_numa(void) {
 #endif
 
 static void ggml_init_arm_arch_features(void) {
-#if defined(
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
+#if defined(__linux__)
     ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+#else
+// TODO: add support of SVE for non-linux systems
+#error "TODO: SVE is not supported on this platform. To use SVE, sve_cnt needs to be initialized here."
+#endif
 #endif
 }
 
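
For context on the Linux branch that remains: `prctl(PR_SVE_GET_VL)` reports the current SVE vector length, and masking with `PR_SVE_VL_LEN_MASK` strips the control bits, leaving the length in bytes. A minimal Linux-only probe (illustrative; requires an SVE-capable aarch64 kernel):

    #include <stdio.h>
    #include <sys/prctl.h>

    int main(void) {
        long vl = prctl(PR_SVE_GET_VL); // fails (e.g. EINVAL) without SVE support
        if (vl < 0) {
            perror("prctl(PR_SVE_GET_VL)");
            return 1;
        }
        printf("SVE vector length: %ld bytes\n", vl & PR_SVE_VL_LEN_MASK);
        return 0;
    }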

package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp
CHANGED

@@ -463,9 +463,9 @@ ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const floa
 #endif
     for (; i < n; ++i) {
         float val = x[i] - mean;
+        y[i] = val;
         val *= val;
         sum += (ggml_float)val;
-        y[i] = val;
     }
     return sum/n;
 }
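
For context: this reorder is a correctness fix in the scalar tail loop. `y[i]` was written after `val *= val;`, so it stored the squared deviation instead of the centered value; moving the store before the squaring makes `y` hold `x[i] - mean` while `sum` still accumulates the variance numerator. A scalar reference for what the fixed loop computes:

    #include <cstdio>

    // y receives x - mean; the return value is the biased variance.
    static double cvar_ref(int n, float * y, const float * x, float mean) {
        double sum = 0.0;
        for (int i = 0; i < n; ++i) {
            const float val = x[i] - mean;
            y[i] = val;                 // the bug stored val * val here
            sum += (double) val * val;
        }
        return sum / n;
    }

    int main() {
        const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        float y[4];
        std::printf("var = %f\n", cvar_ref(4, y, x, 2.5f)); // var = 1.250000
    }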

package/src/llama.cpp/ggml/src/ggml-cpu/vec.h
CHANGED

@@ -144,14 +144,14 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
     for (int i = 0; i < np; i += ggml_f16_step) {
         ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements
 
-        ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8
+        ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elements
         sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1
         ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements
         sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
 
         ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements
 
-        ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8
+        ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 elements
         sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
         ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
         sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
@@ -160,7 +160,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
 
         ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
         sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
-        …
+        ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
         sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);
 
         ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
@@ -820,7 +820,8 @@ inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_f
 inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
 inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        …
+        const float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : expm1f(v));
     }
 }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }

package/src/llama.cpp/src/llama-graph.cpp
CHANGED

@@ -261,12 +261,17 @@ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
     }
 }
 
-static void print_mask(float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
+static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
     LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__);
-    const char * swa_type_str =
-    …
+    const char * swa_type_str = "unknown";
+
+    switch (swa_type) {
+        case LLAMA_SWA_TYPE_NONE:      swa_type_str = "LLAMA_SWA_TYPE_NONE";      break;
+        case LLAMA_SWA_TYPE_STANDARD:  swa_type_str = "LLAMA_SWA_TYPE_STANDARD";  break;
+        case LLAMA_SWA_TYPE_CHUNKED:   swa_type_str = "LLAMA_SWA_TYPE_CHUNKED";   break;
+        case LLAMA_SWA_TYPE_SYMMETRIC: swa_type_str = "LLAMA_SWA_TYPE_SYMMETRIC"; break;
+    };
+
     LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swq_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str);
     LLAMA_LOG_DEBUG("%s: '0' = can attend, '∞' = masked\n", __func__);
     LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__);
@@ -295,50 +300,67 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
     const int64_t n_kv     = ubatch->n_tokens;
     const int64_t n_tokens = ubatch->n_tokens;
 
-    …
-    // [TAG_NO_CACHE_ISWA]
-    GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "TODO: implement");
-    …
-    for (int i1 = 0; i1 < n_tokens; ++i1) {
-        const llama_seq_id s1 = ubatch->seq_id[i1][0];
-        …
-        float f = -INFINITY;
-        for (int s = 0; s < ubatch->n_seq_id[i0]; ++s) {
-            const llama_seq_id s0 = ubatch->seq_id[i0][0];
-            if (s0 != s1) {
-                continue;
-            }
-            …
-            // TODO: reimplement this like in llama_kv_cache_unified
-            if (hparams.use_alibi) {
-                f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
-            } else {
-                f = 0.0f;
-            }
-        }
-        data[h*(n_kv*n_tokens) + i1*n_kv + i0] = f;
-    }
+    const auto fill_mask = [&](float * data, int n_swa, llama_swa_type swa_type) {
+        for (int h = 0; h < 1; ++h) {
+            for (int i1 = 0; i1 < n_tokens; ++i1) {
+                const llama_seq_id s1 = ubatch->seq_id[i1][0];
+                const llama_pos    p1 = ubatch->pos[i1];
+
+                const uint64_t idst = h*(n_kv*n_tokens) + i1*n_kv;
+
+                for (int i0 = 0; i0 < n_tokens; ++i0) {
+                    const llama_seq_id s0 = ubatch->seq_id[i0][0];
+                    const llama_pos    p0 = ubatch->pos[i0];
+
+                    // mask different sequences
+                    if (s0 != s1) {
+                        continue;
+                    }
+
+                    // mask future tokens
+                    if (cparams.causal_attn && p0 > p1) {
+                        continue;
+                    }
+
+                    // apply SWA if any
+                    if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
+                        continue;
+                    }
+
+                    data[idst + i0] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
+                }
+            }
+        }
+    };
+
+    {
+        GGML_ASSERT(self_kq_mask);
+        GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));
+
+        float * data = (float *) self_kq_mask->data;
+
+        std::fill(data, data + ggml_nelements(self_kq_mask), -INFINITY);
+
+        fill_mask(data, 0, LLAMA_SWA_TYPE_NONE);
+
+        if (debug) {
+            print_mask(data, n_tokens, n_kv, 0, LLAMA_SWA_TYPE_NONE);
+        }
+    }
+
+    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+        GGML_ASSERT(self_kq_mask_swa);
+        GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer));
+
+        float * data = (float *) self_kq_mask_swa->data;
+
+        std::fill(data, data + ggml_nelements(self_kq_mask_swa), -INFINITY);
+
+        fill_mask(data, hparams.n_swa, hparams.swa_type);
+
+        if (debug) {
+            print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
+        }
+    }
 }
 
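
For context: the same `fill_mask` loop now builds either a plain causal mask or a sliding-window (SWA) mask, with `llama_hparams::is_masked_swa` deciding window membership. A toy illustration of the standard window rule (assumption: a key at `p0` stays visible to a query at `p1` only while `p1 - p0 < n_swa`; the chunked and symmetric variants differ):

    #include <cstdio>

    // Toy 6-token causal mask with a window of 3 ('0' = attend, '.' = masked).
    static bool masked(int n_swa, int p0, int p1) {
        if (p0 > p1) return true;                        // causal
        if (n_swa > 0 && p1 - p0 >= n_swa) return true;  // outside the window
        return false;
    }

    int main() {
        const int n = 6, n_swa = 3;
        for (int p1 = 0; p1 < n; ++p1) {           // rows: query tokens
            for (int p0 = 0; p0 < n; ++p0) {       // cols: key/value tokens
                std::putchar(masked(n_swa, p0, p1) ? '.' : '0');
            }
            std::putchar('\n');
        }
    }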
@@ -1299,12 +1321,9 @@ ggml_tensor * llm_graph_context::build_attn_mha(
     k = ggml_permute(ctx0, k, 0, 2, 1, 3);
     v = ggml_permute(ctx0, v, 0, 2, 1, 3);
 
-    const auto n_kv = k->ne[1];
-
     ggml_tensor * cur;
 
-
-    if (cparams.flash_attn && (n_kv % 256 == 0) && kq_b == nullptr) {
+    if (cparams.flash_attn && kq_b == nullptr) {
         GGML_ASSERT(kq_b == nullptr && "Flash attention does not support KQ bias yet");
 
         if (v_trans) {
@@ -1419,10 +1438,20 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
     auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);
 
     // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
-    inp->…
-    ggml_set_input(inp->…
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+    ggml_set_input(inp->self_kq_mask);
+
+    inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
 
-    …
+    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+        ggml_set_input(inp->self_kq_mask_swa);
+
+        inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
+    } else {
+        inp->self_kq_mask_swa     = nullptr;
+        inp->self_kq_mask_swa_cnv = nullptr;
+    }
 
     return (llm_graph_input_attn_no_cache *) res->add_input(std::move(inp));
 }

@@ -1447,7 +1476,9 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_build_forward_expand(gf, k_cur);
     ggml_build_forward_expand(gf, v_cur);
 
-    const …
+    const bool is_swa = hparams.is_swa(il);
+
+    const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();
 
     // [TAG_NO_CACHE_PAD]
     // TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
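
For context: the mask tensors above round their second dimension up with `GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)`; the padded rows stay at `-INFINITY` so they never receive attention. A quick check of the round-up arithmetic (the macro body mirrors ggml's round-up-to-multiple definition, and the pad value of 64 is assumed for illustration):

    #include <cstdio>

    // Round x up to the next multiple of n.
    #define GGML_PAD(x, n) (((x) + (n) - 1) / (n) * (n))

    int main() {
        // e.g. with a mask pad of 64:
        std::printf("%d %d %d\n", GGML_PAD(1, 64), GGML_PAD(64, 64), GGML_PAD(65, 64)); // 64 64 128
    }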

package/src/llama.cpp/src/llama-graph.h
CHANGED

@@ -257,10 +257,14 @@ public:
 
     void set_input(const llama_ubatch * ubatch) override;
 
-    ggml_tensor * get_kq_mask() …
+    ggml_tensor * get_kq_mask()     const { return self_kq_mask_cnv; }
+    ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }
 
-    …
-    ggml_tensor * …
+    // n_tokens == n_batch
+    ggml_tensor * self_kq_mask         = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_cnv     = nullptr; //     [n_tokens, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa     = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_tokens, n_batch/n_stream, 1, n_stream]
 
     const llama_hparams hparams;
     const llama_cparams cparams;

package/src/llama.cpp/src/llama-hparams.cpp
CHANGED

@@ -140,7 +140,11 @@ uint32_t llama_hparams::n_embd_s() const {
 }
 
 bool llama_hparams::is_recurrent(uint32_t il) const {
-    …
+    if (il < n_layer) {
+        return recurrent_layer_arr[il];
+    }
+
+    GGML_ABORT("%s: il (%u) out of bounds (n_layer: %u)\n", __func__, il, n_layer);
 }
 
 uint32_t llama_hparams::n_pos_per_embd() const {

package/src/llama.cpp/src/llama-model.cpp
CHANGED

@@ -11358,8 +11358,8 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
     }
 };
 
-struct llm_build_gemma_embedding_iswa : public llm_graph_context {
-    llm_build_gemma_embedding_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+struct llm_build_gemma_embedding : public llm_graph_context {
+    llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_k;
 
         ggml_tensor * cur;
@@ -11376,8 +11376,7 @@ struct llm_build_gemma_embedding_iswa : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        …
-        auto * inp_attn = build_attn_inp_kv_iswa();
+        auto * inp_attn = build_attn_inp_no_cache();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -16313,10 +16312,10 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
     }
 
     ggml_tensor * build_layer_ffn(
-        ggml_tensor …
-        ggml_tensor …
-        const llama_model …
-        const int …
+        ggml_tensor * cur,
+        ggml_tensor * inpSA,
+        const llama_model & model,
+        const int il) {
 
         // For Granite architectures - scale residual
         if (hparams.f_residual_scale) {
@@ -19378,7 +19377,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         case LLM_ARCH_NOMIC_BERT_MOE:
         case LLM_ARCH_NEO_BERT:
         case LLM_ARCH_WAVTOKENIZER_DEC:
-        …
+        case LLM_ARCH_GEMMA_EMBEDDING:
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
         case LLM_ARCH_LLADA_MOE:
@@ -19671,7 +19670,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             } break;
         case LLM_ARCH_GEMMA_EMBEDDING:
             {
-                llm = std::make_unique<llm_build_gemma_embedding_iswa>(*this, params);
+                llm = std::make_unique<llm_build_gemma_embedding>(*this, params);
            } break;
        case LLM_ARCH_STARCODER2:
            {