@fugood/llama.node 1.1.3 → 1.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -0
- package/lib/binding.ts +8 -0
- package/package.json +14 -14
- package/src/LlamaCompletionWorker.cpp +45 -5
- package/src/LlamaContext.cpp +3 -0
- package/src/llama.cpp/common/arg.cpp +60 -7
- package/src/llama.cpp/common/chat.cpp +6 -6
- package/src/llama.cpp/common/common.cpp +1 -0
- package/src/llama.cpp/common/common.h +14 -5
- package/src/llama.cpp/common/speculative.cpp +135 -54
- package/src/llama.cpp/common/speculative.h +8 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +3196 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +263 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
- package/src/llama.cpp/include/llama.h +8 -4
- package/src/llama.cpp/src/llama-arch.cpp +40 -0
- package/src/llama.cpp/src/llama-arch.h +2 -0
- package/src/llama.cpp/src/llama-batch.cpp +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +20 -1
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +11 -2
- package/src/llama.cpp/src/llama-context.h +4 -1
- package/src/llama.cpp/src/llama-graph.cpp +57 -139
- package/src/llama.cpp/src/llama-graph.h +31 -32
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +2 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.h +1 -1
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +2 -1
- package/src/llama.cpp/src/llama-memory-hybrid.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +400 -21
- package/src/llama.cpp/src/llama-quant.cpp +3 -3
- package/src/llama.cpp/src/llama-vocab.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.h +1 -0
package/CMakeLists.txt
CHANGED
@@ -114,6 +114,9 @@ set(LLAMA_BUILD_COMMON ON CACHE BOOL "Build common")
 set(LLAMA_CURL OFF CACHE BOOL "Build curl")
 
 set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries")
+
+add_definitions(-DGGML_MAX_NAME=80)
+
 add_subdirectory("src/llama.cpp")
 add_subdirectory("src/llama.cpp/tools/mtmd")
 
package/lib/binding.ts
CHANGED
@@ -65,6 +65,14 @@ export type LlamaModelOptions = {
   lora?: string
   lora_scaled?: number
   lora_list?: { path: string; scaled: number }[]
+  /**
+   * RoPE base frequency, use 0 to use model default (recommended)
+   */
+  rope_freq_base?: number
+  /**
+   * RoPE frequency scaling factor, use 0 to use model default (recommended)
+   */
+  rope_freq_scale?: number
 }
 
 export type CompletionResponseFormat = {
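These two options are forwarded to the native context (see the LlamaContext.cpp change below). A minimal usage sketch, assuming the package's loadModel entry point and a placeholder model path (both assumptions, not part of this diff):

    import { loadModel } from '@fugood/llama.node' // entry point assumed

    // Both options default to 0, which keeps the model's own RoPE configuration.
    const context = await loadModel({
      model: './model.gguf',  // placeholder model path
      rope_freq_base: 10000,  // override RoPE base frequency
      rope_freq_scale: 0.5,   // override RoPE frequency scaling factor
    })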
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.1.3",
+  "version": "1.1.5",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -71,19 +71,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.1.3",
-    "@fugood/node-llama-linux-x64-vulkan": "1.1.3",
-    "@fugood/node-llama-linux-x64-cuda": "1.1.3",
-    "@fugood/node-llama-linux-arm64": "1.1.3",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.1.3",
-    "@fugood/node-llama-linux-arm64-cuda": "1.1.3",
-    "@fugood/node-llama-win32-x64": "1.1.3",
-    "@fugood/node-llama-win32-x64-vulkan": "1.1.3",
-    "@fugood/node-llama-win32-x64-cuda": "1.1.3",
-    "@fugood/node-llama-win32-arm64": "1.1.3",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.1.3",
-    "@fugood/node-llama-darwin-x64": "1.1.3",
-    "@fugood/node-llama-darwin-arm64": "1.1.3"
+    "@fugood/node-llama-linux-x64": "1.1.5",
+    "@fugood/node-llama-linux-x64-vulkan": "1.1.5",
+    "@fugood/node-llama-linux-x64-cuda": "1.1.5",
+    "@fugood/node-llama-linux-arm64": "1.1.5",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.1.5",
+    "@fugood/node-llama-linux-arm64-cuda": "1.1.5",
+    "@fugood/node-llama-win32-x64": "1.1.5",
+    "@fugood/node-llama-win32-x64-vulkan": "1.1.5",
+    "@fugood/node-llama-win32-x64-cuda": "1.1.5",
+    "@fugood/node-llama-win32-arm64": "1.1.5",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.1.5",
+    "@fugood/node-llama-darwin-x64": "1.1.5",
+    "@fugood/node-llama-darwin-arm64": "1.1.5"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/src/LlamaCompletionWorker.cpp
CHANGED
@@ -64,6 +64,7 @@ void LlamaCompletionWorker::Execute() {
   size_t n_input = 0;
   const auto model = _sess->model();
   auto vocab = llama_model_get_vocab(model);
+  const bool is_enc_dec = llama_model_has_encoder(model);
 
   const bool add_bos = llama_vocab_get_add_bos(vocab);
   auto ctx = _sess->context();
@@ -110,7 +111,7 @@ void LlamaCompletionWorker::Execute() {
   } else {
     // Text-only path
     std::vector<llama_token> prompt_tokens =
-        ::common_tokenize(ctx, _params.prompt, add_bos, true);
+        ::common_tokenize(ctx, _params.prompt, add_bos || is_enc_dec, true);
     n_input = prompt_tokens.size();
 
     if (_sess->tokens_ptr()->size() > 0) {
@@ -126,9 +127,47 @@ void LlamaCompletionWorker::Execute() {
   }
 
   const int max_len = _params.n_predict < 0 ? std::numeric_limits<int>::max() : _params.n_predict;
-  _sess->tokens_ptr()->reserve(_sess->tokens_ptr()->size() + max_len);
-
   auto embd = _sess->tokens_ptr();
+  embd->reserve(embd->size() + max_len);
+
+  if (is_enc_dec) {
+    if (n_input > 0) {
+      // Decode tokens in batches using n_batch as chunk size
+      int n_past_batch = n_cur;
+      int n_remaining = n_input;
+
+      while (n_remaining > 0) {
+        int n_eval = n_remaining;
+        if (n_eval > _params.n_batch) {
+          n_eval = _params.n_batch;
+        }
+
+        int ret = llama_encode(ctx, llama_batch_get_one(embd->data() + n_past_batch, n_eval));
+        if (ret < 0) {
+          SetError("Failed to encode token batch, code: " + std::to_string(ret) +
+                   ", n_eval: " + std::to_string(n_eval) +
+                   ", n_past_batch: " + std::to_string(n_past_batch));
+          _sess->get_mutex().unlock();
+          return;
+        }
+
+        n_past_batch += n_eval;
+        n_remaining -= n_eval;
+        n_cur += n_eval;
+      }
+    }
+    _result.tokens_evaluated += n_input;
+
+    llama_token decode_bos = llama_model_decoder_start_token(model);
+    if (decode_bos == LLAMA_TOKEN_NULL) {
+      decode_bos = llama_vocab_bos(vocab);
+    }
+
+    embd->emplace_back(decode_bos);
+    common_sampler_accept(sampling.get(), decode_bos, false);
+    n_input = 1;
+  }
+
   for (int i = 0; (i < max_len || _interrupted) && !_params.vocab_only; i++) {
     // check if we need to remove some tokens
     if (embd->size() >= _params.n_ctx) {
@@ -166,13 +205,14 @@ void LlamaCompletionWorker::Execute() {
       if (n_eval > _params.n_batch) {
         n_eval = _params.n_batch;
       }
-
+
       int ret = llama_decode(ctx, llama_batch_get_one(embd->data() + n_past_batch, n_eval));
       if (ret < 0) {
         SetError("Failed to decode token batch, code: " + std::to_string(ret) +
                  ", n_eval: " + std::to_string(n_eval) +
                  ", n_past_batch: " + std::to_string(n_past_batch));
-
+        _sess->get_mutex().unlock();
+        return;
       }
 
       n_past_batch += n_eval;
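The change above adds encoder-decoder support to the completion worker: when llama_model_has_encoder() reports true, the prompt tokens are first passed through llama_encode in n_batch-sized chunks, and generation is then seeded with the model's decoder start token (falling back to BOS when none is defined). No new options are involved on the JavaScript side; a hedged sketch of what this enables, assuming the loadModel/completion API names, the result field name, and an encoder-decoder GGUF file (all assumptions, not part of this diff):

    import { loadModel } from '@fugood/llama.node' // entry point assumed

    // An encoder-decoder model (e.g. a T5-style GGUF) is used like any other model;
    // the worker now handles the encode pass and the decoder start token internally.
    const context = await loadModel({ model: './t5-model.gguf' }) // placeholder path
    const result = await context.completion({
      prompt: 'translate English to German: The house is wonderful.',
      n_predict: 64,
    })
    console.log(result.text) // result field name assumed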
package/src/LlamaContext.cpp
CHANGED
@@ -250,6 +250,9 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
   params.kv_unified = get_option<bool>(options, "kv_unified", false);
   params.swa_full = get_option<bool>(options, "swa_full", false);
 
+  params.rope_freq_base = get_option<float>(options, "rope_freq_base", 0.0f);
+  params.rope_freq_scale = get_option<float>(options, "rope_freq_scale", 0.0f);
+
   params.use_mlock = get_option<bool>(options, "use_mlock", false);
   params.use_mmap = get_option<bool>(options, "use_mmap", true);
   params.numa =
package/src/llama.cpp/common/arg.cpp
CHANGED
@@ -977,6 +977,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
             string_process_escapes(seq_breaker);
         }
+        for (auto & pair : params.speculative.replacements) {
+            string_process_escapes(pair.first);
+            string_process_escapes(pair.second);
+        }
     }
 
     if (!params.kv_overrides.empty()) {
@@ -2091,6 +2095,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.no_kv_offload = true;
         }
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+    add_opt(common_arg(
+        {"-nr", "--no-repack"},
+        "disable weight repacking",
+        [](common_params & params) {
+            params.no_extra_bufts = true;
+        }
+    ).set_env("LLAMA_ARG_NO_REPACK"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
         string_format(
@@ -2369,6 +2380,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ));
+    add_opt(common_arg(
+        {"--cpu-moe"},
+        "use CPU for Mixture of Experts (MoE) weights",
+        [](common_params & params) {
+            params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+        }
+    ).set_env("LLAMA_ARG_CPU_MOE"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
@@ -2627,6 +2647,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.n_out_freq = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(common_arg(
+        {"--output-format"}, "{gguf,dat}",
+        string_format("output format for imatrix file (default: %s)", params.imat_dat ? "dat" : "gguf"),
+        [](common_params & params, const std::string & value) {
+            /**/ if (value == "gguf") { params.imat_dat = false; }
+            else if (value == "dat") { params.imat_dat = true; }
+            else { throw std::invalid_argument("invalid output format"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
         {"--save-frequency"}, "N",
         string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
@@ -3249,6 +3278,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.speculative.model.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+    add_opt(common_arg(
+        {"--spec-replace"}, "TARGET", "DRAFT",
+        "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
+        [](common_params & params, const std::string & tgt, const std::string & dft) {
+            params.speculative.replacements.push_back({ tgt, dft });
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-ctkd", "--cache-type-k-draft"}, "TYPE",
         string_format(
@@ -3438,12 +3474,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
-    // diffusion parameters
     add_opt(common_arg(
         { "--diffusion-steps" }, "N",
         string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
         [](common_params & params, int value) { params.diffusion.steps = value; }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-visual" },
+        string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
+                      params.diffusion.visual_mode ? "true" : "false"),
+        [](common_params & params) { params.diffusion.visual_mode = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
     add_opt(common_arg(
         { "--diffusion-eps" }, "F",
         string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
@@ -3451,21 +3493,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
     add_opt(common_arg(
         { "--diffusion-algorithm" }, "N",
-        string_format("diffusion algorithm: 0=ORIGIN, 1=
+        string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
                       params.diffusion.algorithm),
         [](common_params & params, int value) { params.diffusion.algorithm = value; }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
     add_opt(common_arg(
         { "--diffusion-alg-temp" }, "F",
-        string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+        string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
         [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
     add_opt(common_arg(
-        { "--diffusion-
-        string_format("
-
-
+        { "--diffusion-block-length" }, "N",
+        string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
+        [](common_params & params, int value) { params.diffusion.block_length = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-cfg-scale" }, "F",
+        string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
+        [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-add-gumbel-noise" }, "F",
+        string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
+        [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
 
     return ctx_arg;
 }
package/src/llama.cpp/common/chat.cpp
CHANGED
@@ -1635,7 +1635,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
         "|<function name=\"([^\"]+)\">" // match 5 (function name again)
     );
 
-    if (auto res = builder.try_find_regex(open_regex)) {
+    while (auto res = builder.try_find_regex(open_regex)) {
         const auto & block_start = res->groups[1];
         std::string block_end = block_start.empty() ? "" : "```";
 
@@ -1657,7 +1657,6 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
                 builder.consume_literal(block_end);
                 builder.consume_spaces();
             }
-            builder.add_content(builder.consume_rest());
         } else {
             throw common_chat_msg_partial_exception("failed to parse tool call");
         }
@@ -1682,11 +1681,10 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
                 builder.consume_spaces();
             }
         }
-        builder.add_content(builder.consume_rest());
         }
-    } else {
-        builder.add_content(builder.consume_rest());
     }
+
+    builder.add_content(builder.consume_rest());
 }
 
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
@@ -1933,6 +1931,8 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
         }
     }
     auto msg = builder.result();
-
+    if (!is_partial) {
+        LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
+    }
     return msg;
 }
package/src/llama.cpp/common/common.cpp
CHANGED
@@ -1123,6 +1123,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.use_mmap = params.use_mmap;
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
+    mparams.use_extra_bufts = !params.no_extra_bufts;
 
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
package/src/llama.cpp/common/common.h
CHANGED
@@ -201,6 +201,7 @@ struct common_params_speculative {
     int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
     float p_split = 0.1f; // speculative decoding split probability
     float p_min = 0.75f; // minimum speculative decoding probability (greedy)
+    std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
 
     ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
     ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
@@ -220,11 +221,17 @@ struct common_params_vocoder {
 };
 
 struct common_params_diffusion {
-    int32_t steps
-
-
-    float
-
+    int32_t steps = 128;
+    bool visual_mode = false;
+
+    float eps = 0; // epsilon for timesteps
+    int32_t block_length = 0; // block length for generation
+
+    int32_t algorithm = 4; // default algorithm: low-confidence
+    float alg_temp = 0.0f; // algorithm temperature
+
+    float cfg_scale = 0; // classifier-free guidance scale
+    bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
 };
 
 enum common_reasoning_format {
@@ -353,6 +360,7 @@ struct common_params {
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data
     bool no_op_offload = false; // globally disable offload host tensor operations to device
+    bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
 
     bool single_turn = false; // single turn chat conversation
 
@@ -432,6 +440,7 @@ struct common_params {
     int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
     int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
     int32_t i_chunk = 0; // start processing from this chunk
+    bool imat_dat = false; // whether the legacy imatrix.dat format should be output
 
     bool process_output = false; // collect data for the output tensor
     bool compute_ppl = true; // whether to compute perplexity