@fugood/llama.node 1.4.6 → 1.4.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +25 -26
- package/src/LlamaContext.cpp +2 -2
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +364 -193
- package/src/llama.cpp/common/arg.h +43 -2
- package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +36 -18
- package/src/llama.cpp/common/chat-parser-xml-toolcall.h +1 -1
- package/src/llama.cpp/common/chat-parser.cpp +3 -2
- package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
- package/src/llama.cpp/common/chat.cpp +272 -0
- package/src/llama.cpp/common/common.cpp +130 -67
- package/src/llama.cpp/common/common.h +40 -16
- package/src/llama.cpp/common/console.cpp +680 -47
- package/src/llama.cpp/common/console.h +30 -8
- package/src/llama.cpp/common/download.cpp +69 -25
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
- package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
- package/src/llama.cpp/common/log.cpp +5 -0
- package/src/llama.cpp/common/log.h +1 -0
- package/src/llama.cpp/common/peg-parser.cpp +1 -1
- package/src/llama.cpp/common/preset.cpp +206 -0
- package/src/llama.cpp/common/preset.h +32 -0
- package/src/llama.cpp/common/sampling.cpp +91 -92
- package/src/llama.cpp/common/sampling.h +11 -6
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +7 -8
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +69 -39
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +2 -1
- package/src/llama.cpp/include/llama.h +18 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
- package/src/llama.cpp/src/llama-arch.h +9 -2
- package/src/llama.cpp/src/llama-batch.cpp +12 -2
- package/src/llama.cpp/src/llama-batch.h +4 -2
- package/src/llama.cpp/src/llama-context.cpp +99 -29
- package/src/llama.cpp/src/llama-context.h +9 -3
- package/src/llama.cpp/src/llama-grammar.cpp +233 -33
- package/src/llama.cpp/src/llama-grammar.h +20 -1
- package/src/llama.cpp/src/llama-graph.cpp +85 -17
- package/src/llama.cpp/src/llama-graph.h +17 -4
- package/src/llama.cpp/src/llama-hparams.cpp +6 -0
- package/src/llama.cpp/src/llama-hparams.h +5 -1
- package/src/llama.cpp/src/llama-impl.cpp +4 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
- package/src/llama.cpp/src/llama-kv-cache.h +19 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +123 -52
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +2 -1
- package/src/llama.cpp/src/llama.cpp +675 -1
- package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
- package/src/llama.cpp/src/models/{gemma3-iswa.cpp → gemma3.cpp} +30 -5
- package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
- package/src/llama.cpp/src/models/glm4.cpp +27 -4
- package/src/llama.cpp/src/models/models.h +8 -7
- package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
- package/src/llama.cpp/src/models/qwen2.cpp +12 -3
- package/src/llama.cpp/src/models/qwen3next.cpp +81 -266

package/src/llama.cpp/common/sampling.cpp:

@@ -104,9 +104,10 @@ struct ring_buffer {
 struct common_sampler {
     common_params_sampling params;
 
-    struct llama_sampler * grmr;
     struct llama_sampler * chain;
 
+    bool grammar;
+
     ring_buffer<llama_token> prev;
 
     std::vector<llama_token_data> cur;
@@ -116,7 +117,6 @@ struct common_sampler {
     void reset() {
         prev.clear();
 
-        llama_sampler_reset(grmr);
         llama_sampler_reset(chain);
     }
 
@@ -167,10 +167,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 
     lparams.no_perf = params.no_perf;
 
-
+    llama_sampler * chain = llama_sampler_chain_init(lparams);
+
+    bool grammar = false;
+    std::vector<llama_sampler *> samplers;
+
     if (params.grammar.compare(0, 11, "%llguidance") == 0) {
 #ifdef LLAMA_USE_LLGUIDANCE
-
+        samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()));
+        grammar = true;
 #else
         GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
@@ -217,30 +222,23 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
             trigger_patterns_c.push_back(regex.c_str());
         }
 
-
-
-
-
-
-
-
+        if (!params.grammar.empty()) {
+            if (params.grammar_lazy) {
+                samplers.push_back(
+                    llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+                        trigger_patterns_c.data(), trigger_patterns_c.size(),
+                        trigger_tokens.data(), trigger_tokens.size()));
+            } else {
+                samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"));
+            }
+
+            grammar = true;
         }
     }
 
-
-
-
-        /* .chain = */ llama_sampler_chain_init(lparams),
-        /* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
-        /* .cur = */ {},
-        /* .cur_p = */ {},
-    };
-
-    llama_sampler_chain_add(result->chain,
-            llama_sampler_init_logit_bias(
-                llama_vocab_n_tokens(vocab),
-                params.logit_bias.size(),
-                params.logit_bias.data()));
+    if (params.has_logit_bias()) {
+        samplers.push_back(llama_sampler_init_logit_bias(llama_vocab_n_tokens(vocab), params.logit_bias.size(), params.logit_bias.data()));
+    }
 
     if (params.mirostat == 0) {
         for (const auto & cnstr : params.samplers) {
@@ -253,58 +251,70 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                         c_breakers.push_back(str.c_str());
                     }
 
-
+                    samplers.push_back(llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                 }
                 break;
             case COMMON_SAMPLER_TYPE_TOP_K:
-
+                samplers.push_back(llama_sampler_init_top_k (params.top_k));
                 break;
             case COMMON_SAMPLER_TYPE_TOP_P:
-
+                samplers.push_back(llama_sampler_init_top_p (params.top_p, params.min_keep));
                 break;
             case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
-
+                samplers.push_back(llama_sampler_init_top_n_sigma(params.top_n_sigma));
                 break;
            case COMMON_SAMPLER_TYPE_MIN_P:
-
+                samplers.push_back(llama_sampler_init_min_p (params.min_p, params.min_keep));
                 break;
            case COMMON_SAMPLER_TYPE_XTC:
-
+                samplers.push_back(llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
                 break;
            case COMMON_SAMPLER_TYPE_TYPICAL_P:
-
+                samplers.push_back(llama_sampler_init_typical (params.typ_p, params.min_keep));
                 break;
            case COMMON_SAMPLER_TYPE_TEMPERATURE:
-
+                samplers.push_back(llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                 break;
            case COMMON_SAMPLER_TYPE_INFILL:
-
+                samplers.push_back(llama_sampler_init_infill (vocab));
                 break;
            case COMMON_SAMPLER_TYPE_PENALTIES:
-
+                samplers.push_back(llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
                 break;
            default:
                 GGML_ASSERT(false && "unknown sampler type");
            }
        }
-
+
+        samplers.push_back(llama_sampler_init_dist(params.seed));
    } else if (params.mirostat == 1) {
-
-
+        samplers.push_back(llama_sampler_init_temp(params.temp));
+        samplers.push_back(llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
    } else if (params.mirostat == 2) {
-
-
+        samplers.push_back(llama_sampler_init_temp(params.temp));
+        samplers.push_back(llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
    } else {
        GGML_ASSERT(false && "unknown mirostat version");
    }
 
+    for (auto * smpl : samplers) {
+        llama_sampler_chain_add(chain, smpl);
+    }
+
+    auto * result = new common_sampler {
+        /* .params = */ params,
+        /* .chain = */ chain,
+        /* .grammar = */ grammar,
+        /* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
+        /* .cur = */ {},
+        /* .cur_p = */ {},
+    };
+
     return result;
 }
 
 void common_sampler_free(struct common_sampler * gsmpl) {
     if (gsmpl) {
-        llama_sampler_free(gsmpl->grmr);
-
         llama_sampler_free(gsmpl->chain);
 
         delete gsmpl;
@@ -314,11 +324,24 @@ void common_sampler_free(struct common_sampler * gsmpl) {
 void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
     const auto tm = gsmpl->tm();
 
-    if (
-
-    }
+    if (gsmpl->grammar) {
+        const int n_smpl = llama_sampler_chain_n(gsmpl->chain);
 
-
+        for (int i = 0; i < n_smpl; i++) {
+            auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
+
+            // the grammar sampler is always the first one
+            if (i == 0) {
+                if (accept_grammar) {
+                    llama_sampler_accept(smpl, token);
+                }
+            } else {
+                llama_sampler_accept(smpl, token);
+            }
+        }
+    } else {
+        llama_sampler_accept(gsmpl->chain, token);
+    }
 
     gsmpl->prev.push_back(token);
 }
@@ -329,12 +352,12 @@ void common_sampler_reset(struct common_sampler * gsmpl) {
 
 struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
     return new common_sampler {
-        /* .params
-        /* .
-        /* .
-        /* .prev
-        /* .cur
-        /* .cur_p
+        /* .params = */ gsmpl->params,
+        /* .chain = */ llama_sampler_clone(gsmpl->chain),
+        /* .grammar = */ gsmpl->grammar,
+        /* .prev = */ gsmpl->prev,
+        /* .cur = */ gsmpl->cur,
+        /* .cur_p = */ gsmpl->cur_p,
     };
 }
 
@@ -383,58 +406,33 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
     }
 }
 
-
+struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
+    return gsmpl->chain;
+}
+
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) {
     llama_synchronize(ctx);
 
     // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
     const auto tm = gsmpl->tm();
 
-
+    llama_token id = LLAMA_TOKEN_NULL;
 
-    auto & grmr = gsmpl->grmr;
     auto & chain = gsmpl->chain;
     auto & cur_p = gsmpl->cur_p; // initialized by set_logits
 
-
-        llama_sampler_apply(grmr, &cur_p);
-    }
+    gsmpl->set_logits(ctx, idx);
 
     llama_sampler_apply(chain, &cur_p);
 
     GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
 
-
-
-    if (grammar_first) {
-        return id;
-    }
-
-    // check if it the sampled token fits the grammar
-    {
-        llama_token_data single_token_data = { id, 1.0f, 0.0f };
-        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
-
-        llama_sampler_apply(grmr, &single_token_data_array);
-
-        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
-        if (is_valid) {
-            return id;
-        }
-    }
-
-    // resampling:
-    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
-    gsmpl->set_logits(ctx, idx);
-
-    llama_sampler_apply(grmr, &cur_p);
-    llama_sampler_apply(chain, &cur_p);
-
-    GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
+    id = cur_p.data[cur_p.selected].id;
 
-    return
+    return id;
 }
 
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft) {
     GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
 
     std::vector<llama_token> result;
@@ -442,7 +440,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
 
     size_t i = 0;
     for (; i < draft.size(); i++) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
 
         common_sampler_accept(gsmpl, id, true);
 
@@ -454,7 +452,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
     }
 
     if (i == draft.size()) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
 
         common_sampler_accept(gsmpl, id, true);
 
@@ -464,13 +462,13 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
     return result;
 }
 
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) {
     std::vector<int> idxs(draft.size() + 1);
     for (size_t i = 0; i < idxs.size(); ++i) {
         idxs[i] = i;
     }
 
-    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft
+    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft);
 }
 
 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
@@ -515,7 +513,8 @@ std::string common_sampler_print(const struct common_sampler * gsmpl) {
 
     for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
         const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
-        result += std::string("-> ")
+        result += std::string("-> ");
+        result += std::string(llama_sampler_name(smpl)) + " ";
     }
 
     return result;

package/src/llama.cpp/common/sampling.h:

@@ -48,6 +48,8 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
 // arguments can be nullptr to skip printing
 void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
 
+struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
+
 // extended sampling implementation:
 //
 // - set logits
@@ -55,10 +57,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
 // - check if the token fits the grammar (if any)
 // - if not: resample by first applying the grammar constraints and then sampling again (slower path)
 //
-
-// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
-//
-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx);
 
 // generalized version of common_sampler_sample
 //
@@ -76,10 +75,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
 //
 // returns at least 1 token, up to idxs.size()
 //
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft);
 
 // assume idxs == [ 0, 1, 2, ..., draft.size() ]
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft);
 
 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
 
@@ -107,3 +106,9 @@ std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std:
 
 llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
     const char * grammar_kind, const char * grammar_data);
+
+struct common_sampler_deleter {
+    void operator()(common_sampler * s) { common_sampler_free(s); }
+};
+
+typedef std::unique_ptr<common_sampler, common_sampler_deleter> common_sampler_ptr;
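
The sampler rework above folds the grammar sampler into the single llama_sampler chain (tracked by the new grammar flag) and adds two conveniences to the API: common_sampler_get() to reach the underlying chain, and the common_sampler_ptr alias whose deleter calls common_sampler_free. A minimal usage sketch, assuming an already loaded model and context and a filled common_params_sampling; the names model, ctx, sparams and generate_one below are placeholders, not part of the package:

#include "llama.h"
#include "sampling.h"   // common/sampling.h from this package

void generate_one(llama_model * model, llama_context * ctx, const common_params_sampling & sparams) {
    // new typedef: unique_ptr that calls common_sampler_free automatically
    common_sampler_ptr smpl(common_sampler_init(model, sparams));

    // sample from the logits of the last evaluated token (idx = -1)
    const llama_token id = common_sampler_sample(smpl.get(), ctx, -1);
    common_sampler_accept(smpl.get(), id, /* accept_grammar = */ true);

    // new accessor: hand the raw chain to llama.cpp C APIs that expect a llama_sampler
    llama_sampler * chain = common_sampler_get(smpl.get());
    llama_perf_sampler_print(chain);
}   // smpl goes out of scope here and the deleter runs common_sampler_free

Sampling with idx = -1 reads the logits of the last evaluated token, which is the usual pattern in the llama.cpp examples.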

package/src/llama.cpp/common/speculative.cpp:

@@ -315,7 +315,7 @@ llama_tokens common_speculative_gen_draft(
     for (int i = 0; i < params.n_draft; ++i) {
         common_batch_clear(batch);
 
-        common_sampler_sample(smpl, ctx_dft, 0
+        common_sampler_sample(smpl, ctx_dft, 0);
 
         const auto * cur_p = common_sampler_get_candidates(smpl, true);
 

package/src/llama.cpp/ggml/CMakeLists.txt:

@@ -54,6 +54,10 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
     # TODO
 else()
     set(GGML_STANDALONE OFF)
+
+    if (NOT CMAKE_RUNTIME_OUTPUT_DIRECTORY)
+        set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+    endif()
 endif()
 
 if (EMSCRIPTEN)
@@ -168,6 +172,7 @@ option(GGML_RVV "ggml: enable rvv" ON)
 option(GGML_RV_ZFH "ggml: enable riscv zfh" ON)
 option(GGML_RV_ZVFH "ggml: enable riscv zvfh" ON)
 option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON)
+option(GGML_RV_ZIHINTPAUSE "ggml: enable riscv zihintpause" ON)
 option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
 option(GGML_VXE "ggml: enable vxe" ${GGML_NATIVE})
 

package/src/llama.cpp/ggml/include/ggml-alloc.h:

@@ -53,7 +53,14 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
 // call with a worst-case graph to avoid buffer reallocations
 // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
 // returns false if the buffer allocation failed
+// ggml_gallocr_resrve_n_size writes the buffer sizes per galloc buffer that would be allocated by ggml_gallocr_reserve_n to sizes
 GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API void ggml_gallocr_reserve_n_size(
+    ggml_gallocr_t galloc,
+    struct ggml_cgraph * graph,
+    const int * node_buffer_ids,
+    const int * leaf_buffer_ids,
+    size_t * sizes);
 GGML_API bool ggml_gallocr_reserve_n(
     ggml_gallocr_t galloc,
     struct ggml_cgraph * graph,
@@ -68,6 +75,8 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i
 
 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
+// ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft
+GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
 
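
Both new declarations are size queries: they report how much backend memory a reservation or allocation would need without performing it. A minimal sketch of the context-tensor variant against the CPU buffer type; the tensor shapes are arbitrary placeholders:

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <cstdio>

int main() {
    ggml_init_params ip = {
        /* .mem_size   = */ ggml_tensor_overhead() * 8,
        /* .mem_buffer = */ nullptr,
        /* .no_alloc   = */ true,   // metadata only; data will come from a backend buffer
    };
    ggml_context * ctx = ggml_init(ip);

    ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);

    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();

    // new in this version: query the size first ...
    const size_t needed = ggml_backend_alloc_ctx_tensors_from_buft_size(ctx, buft);
    std::printf("would allocate %zu bytes\n", needed);

    // ... then allocate for real, exactly as before
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
    return 0;
}

ggml_gallocr_reserve_n_size and ggml_backend_sched_reserve_size (next file) follow the same pattern for graph allocators and the scheduler, writing the per-buffer sizes into a caller-provided array instead of returning a single value.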

package/src/llama.cpp/ggml/include/ggml-backend.h:

@@ -307,6 +307,7 @@ extern "C" {
     GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
 
     // Initialize backend buffers from a measure graph
+    GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes);
     GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
 
     GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);

package/src/llama.cpp/ggml/include/ggml-cpu.h:

@@ -99,6 +99,7 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_sme (void);
     // other
     GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
+    GGML_BACKEND_API int ggml_cpu_get_rvv_vlen (void); // risc-v vector length in bytes
     GGML_BACKEND_API int ggml_cpu_has_vsx (void);
     GGML_BACKEND_API int ggml_cpu_has_vxe (void);
     GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
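
The new ggml_cpu_get_rvv_vlen complements ggml_cpu_has_riscv_v by exposing the RISC-V vector register length in bytes. A minimal sketch; the diff does not document the return value when RVV is absent, so the capability check is kept:

#include "ggml-cpu.h"
#include <cstdio>

int main() {
    if (ggml_cpu_has_riscv_v()) {
        // new in this version: vector register length in bytes
        std::printf("RVV vector length: %d bytes\n", ggml_cpu_get_rvv_vlen());
    } else {
        std::printf("no RISC-V vector extension detected\n");
    }
    return 0;
}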

package/src/llama.cpp/ggml/include/ggml.h:

@@ -2305,13 +2305,11 @@ extern "C" {
             float stop,
             float step);
 
-
-
-    //
-    //
-    //
-    // mask: [n_kv, n_batch_pad, ne32, ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
-    // res: [n_embd_v, n_head, n_batch, ne3 ] !! permuted !!
+    // q: [n_embd_k, n_batch, n_head, ne3 ]
+    // k: [n_embd_k, n_kv, n_head_kv, ne3 ]
+    // v: [n_embd_v, n_kv, n_head_kv, ne3 ] !! not transposed !!
+    // mask: [n_kv, n_batch, ne32, ne33]
+    // res: [n_embd_v, n_head, n_batch, ne3 ] !! permuted !!
     //
     // broadcast:
     // n_head % n_head_kv == 0
@@ -2617,7 +2615,8 @@ extern "C" {
 
     // Set callback for all future logging events.
     // If this is not called, or NULL is supplied, everything is output on stderr.
-    GGML_API void
+    GGML_API void ggml_log_get(ggml_log_callback * log_callback, void ** user_data);
+    GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
 
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
 
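
ggml_log_get makes the currently installed callback readable, so a caller can override logging temporarily and then restore whatever was active before. A minimal sketch, assuming only the ggml_log_callback typedef and log-level enum already present in ggml.h; quiet_logger and run_quietly are illustrative names, not part of the package:

#include "ggml.h"
#include <cstdio>

static void quiet_logger(enum ggml_log_level level, const char * text, void * /* user_data */) {
    if (level == GGML_LOG_LEVEL_ERROR) {
        std::fputs(text, stderr);   // keep errors, drop everything else
    }
}

template <typename F>
void run_quietly(F && work) {
    ggml_log_callback prev_cb = nullptr;
    void *            prev_ud = nullptr;
    ggml_log_get(&prev_cb, &prev_ud);   // new in this version: read the current callback

    ggml_log_set(quiet_logger, nullptr);
    work();
    ggml_log_set(prev_cb, prev_ud);     // restore the previous callback (or the stderr default)
}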

package/src/llama.cpp/ggml/src/CMakeLists.txt:

@@ -386,6 +386,9 @@ if (GGML_CPU_ALL_VARIANTS)
         ggml_add_cpu_backend_variant(android_armv8.2_1 DOTPROD)
         ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
         ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
+        ggml_add_cpu_backend_variant(android_armv9.0_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE2)
+        ggml_add_cpu_backend_variant(android_armv9.2_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SME)
+        ggml_add_cpu_backend_variant(android_armv9.2_2 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SME)
     elseif (APPLE)
         ggml_add_cpu_backend_variant(apple_m1 DOTPROD)
         ggml_add_cpu_backend_variant(apple_m2_m3 DOTPROD MATMUL_INT8)

package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt:

@@ -469,6 +469,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             if (GGML_RV_ZICBOP)
                 string(APPEND MARCH_STR "_zicbop")
             endif()
+            if (GGML_RV_ZIHINTPAUSE)
+                string(APPEND MARCH_STR "_zihintpause")
+            endif()
             list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
         else()
             # Begin with the lowest baseline

package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp:

@@ -24,6 +24,7 @@
 
 #define UNUSED GGML_UNUSED
 
+#if defined(__aarch64__) && defined(__ARM_NEON) && (defined(__ARM_FEATURE_MATMUL_INT8) || defined(__ARM_FEATURE_DOTPROD))
 static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
                                              int16x8_t * out_mins,
                                              int8_t * out_scales) {
@@ -46,6 +47,7 @@ static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
     scales_u32[1] = (sm[2] & kmask2) | (((sm[0] >> 6) & kmask3) << 4);
     memcpy(out_scales, scales_u32, 8);
 }
+#endif
 
 void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(QK8_0 == 32);