@fugood/llama.node 1.4.15 → 1.5.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +1 -5
- package/lib/index.js +2 -2
- package/lib/index.ts +2 -2
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +76 -61
- package/src/LlamaContext.cpp +20 -32
- package/src/llama.cpp/common/CMakeLists.txt +12 -0
- package/src/llama.cpp/common/arg.cpp +20 -0
- package/src/llama.cpp/common/chat.cpp +289 -34
- package/src/llama.cpp/common/chat.h +16 -13
- package/src/llama.cpp/common/common.cpp +0 -1
- package/src/llama.cpp/common/common.h +28 -25
- package/src/llama.cpp/common/jinja/caps.cpp +237 -0
- package/src/llama.cpp/common/jinja/caps.h +24 -0
- package/src/llama.cpp/common/jinja/lexer.cpp +341 -0
- package/src/llama.cpp/common/jinja/lexer.h +157 -0
- package/src/llama.cpp/common/jinja/parser.cpp +591 -0
- package/src/llama.cpp/common/jinja/parser.h +21 -0
- package/src/llama.cpp/common/jinja/runtime.cpp +865 -0
- package/src/llama.cpp/common/jinja/runtime.h +628 -0
- package/src/llama.cpp/common/jinja/string.cpp +207 -0
- package/src/llama.cpp/common/jinja/string.h +58 -0
- package/src/llama.cpp/common/jinja/utils.h +49 -0
- package/src/llama.cpp/common/jinja/value.cpp +1221 -0
- package/src/llama.cpp/common/jinja/value.h +464 -0
- package/src/llama.cpp/common/sampling.cpp +52 -19
- package/src/llama.cpp/ggml/include/ggml.h +39 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +31 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +18 -0
- package/src/llama.cpp/include/llama-cpp.h +3 -1
- package/src/llama.cpp/include/llama.h +29 -2
- package/src/llama.cpp/src/llama-adapter.cpp +7 -13
- package/src/llama.cpp/src/llama-adapter.h +1 -3
- package/src/llama.cpp/src/llama-context.cpp +232 -144
- package/src/llama.cpp/src/llama-context.h +10 -0
- package/src/llama.cpp/src/llama-cparams.h +2 -0
- package/src/llama.cpp/src/llama-hparams.cpp +0 -36
- package/src/llama.cpp/src/llama-hparams.h +38 -1
- package/src/llama.cpp/src/llama-kv-cache.cpp +201 -59
- package/src/llama.cpp/src/llama-kv-cache.h +0 -2
- package/src/llama.cpp/src/llama-mmap.cpp +5 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +21 -7
- package/src/llama.cpp/src/llama-model.cpp +5 -1
- package/src/llama.cpp/src/llama-model.h +3 -2
- package/src/llama.cpp/src/llama-sampling.cpp +170 -13
|
@@ -146,6 +146,7 @@ llama_context::llama_context(
|
|
|
146
146
|
}
|
|
147
147
|
|
|
148
148
|
cparams.flash_attn = params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED;
|
|
149
|
+
cparams.auto_fa = params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO;
|
|
149
150
|
|
|
150
151
|
// with causal attention, the batch size is limited by the context size
|
|
151
152
|
cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
|
|
@@ -155,6 +156,9 @@ llama_context::llama_context(
|
|
|
155
156
|
cparams.op_offload = params.op_offload;
|
|
156
157
|
cparams.kv_unified = params.kv_unified;
|
|
157
158
|
|
|
159
|
+
// initialized later
|
|
160
|
+
cparams.pipeline_parallel = false;
|
|
161
|
+
|
|
158
162
|
{
|
|
159
163
|
const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
|
|
160
164
|
graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;
|
|
@@ -302,16 +306,6 @@ llama_context::llama_context(
|
|
|
302
306
|
|
|
303
307
|
LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());
|
|
304
308
|
|
|
305
|
-
const uint32_t n_seqs = cparams.n_seq_max;
|
|
306
|
-
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
|
307
|
-
|
|
308
|
-
const size_t max_nodes = this->graph_max_nodes(n_tokens);
|
|
309
|
-
|
|
310
|
-
LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
|
|
311
|
-
|
|
312
|
-
gf_res_prev.reset(new llm_graph_result(max_nodes));
|
|
313
|
-
gf_res_reserve.reset(new llm_graph_result(max_nodes));
|
|
314
|
-
|
|
315
309
|
// TODO: move these checks to ggml_backend_sched
|
|
316
310
|
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
|
|
317
311
|
bool pipeline_parallel =
|
|
@@ -340,177 +334,218 @@ llama_context::llama_context(
|
|
|
340
334
|
}
|
|
341
335
|
}
|
|
342
336
|
|
|
343
|
-
|
|
337
|
+
cparams.pipeline_parallel = pipeline_parallel;
|
|
344
338
|
|
|
345
|
-
if (pipeline_parallel) {
|
|
346
|
-
LLAMA_LOG_INFO("%s: pipeline parallelism enabled
|
|
339
|
+
if (cparams.pipeline_parallel) {
|
|
340
|
+
LLAMA_LOG_INFO("%s: pipeline parallelism enabled\n", __func__);
|
|
347
341
|
}
|
|
348
342
|
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
throw std::runtime_error("failed to initialize memory module");
|
|
343
|
+
sched_reserve();
|
|
344
|
+
|
|
345
|
+
if (!cparams.flash_attn) {
|
|
346
|
+
if (ggml_is_quantized(params.type_v)) {
|
|
347
|
+
throw std::runtime_error("quantized V cache was requested, but this requires Flash Attention");
|
|
355
348
|
}
|
|
356
349
|
}
|
|
350
|
+
}
|
|
357
351
|
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
n_outputs = n_seqs;
|
|
362
|
-
|
|
363
|
-
LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
|
|
352
|
+
// Initialize the full vocabulary token ids for backend samplers.
|
|
353
|
+
{
|
|
354
|
+
const int n_vocab = model.vocab.n_tokens();
|
|
364
355
|
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
356
|
+
sampling.token_ids_full_vocab.resize(n_vocab);
|
|
357
|
+
for (int i = 0; i < n_vocab; ++i) {
|
|
358
|
+
sampling.token_ids_full_vocab[i] = i;
|
|
359
|
+
}
|
|
360
|
+
}
|
|
361
|
+
}
|
|
371
362
|
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
continue;
|
|
378
|
-
}
|
|
379
|
-
ggml_backend_dev_t device_fa = ggml_backend_get_device(
|
|
380
|
-
ggml_backend_sched_get_tensor_backend(sched.get(), n));
|
|
363
|
+
llama_context::~llama_context() {
|
|
364
|
+
if (!model.hparams.no_alloc) {
|
|
365
|
+
for (size_t i = 0; i < backend_ptrs.size(); ++i) {
|
|
366
|
+
ggml_backend_t backend = backend_ptrs[i];
|
|
367
|
+
ggml_backend_buffer_type_t buft = backend_buft[i];
|
|
381
368
|
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the Flash Attention tensor "
|
|
388
|
-
"is assigned to device %s (usually due to missing support)\n",
|
|
389
|
-
__func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_fa));
|
|
390
|
-
// FIXME: fa_device_mismatch logic is wrong for --no-kv-offload, but this is broken anyways
|
|
391
|
-
fa_device_mismatch = true;
|
|
392
|
-
break;
|
|
393
|
-
}
|
|
394
|
-
}
|
|
395
|
-
if (fa_device_mismatch) {
|
|
396
|
-
cparams.flash_attn = false;
|
|
397
|
-
LLAMA_LOG_WARN("%s: Flash Attention was auto, set to disabled\n", __func__);
|
|
398
|
-
if (ggml_is_quantized(params.type_v)) {
|
|
399
|
-
throw std::runtime_error("quantized V cache was requested, but this requires Flash Attention");
|
|
400
|
-
}
|
|
369
|
+
const size_t size_exp = backend_buf_exp_size[i];
|
|
370
|
+
const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
|
|
371
|
+
if (size_exp == size_act) {
|
|
372
|
+
LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
|
|
373
|
+
__func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
|
|
401
374
|
} else {
|
|
402
|
-
|
|
403
|
-
|
|
375
|
+
LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
|
|
376
|
+
__func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
|
|
404
377
|
}
|
|
405
378
|
}
|
|
379
|
+
}
|
|
380
|
+
ggml_opt_free(opt_ctx);
|
|
381
|
+
}
|
|
406
382
|
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
383
|
+
void llama_context::sched_reserve() {
|
|
384
|
+
if (!sched_need_reserve) {
|
|
385
|
+
return;
|
|
386
|
+
}
|
|
410
387
|
|
|
411
|
-
|
|
412
|
-
int n_nodes_tg = -1;
|
|
388
|
+
sched_need_reserve = false;
|
|
413
389
|
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
if (!gf) {
|
|
425
|
-
throw std::runtime_error("failed to allocate compute pp buffers");
|
|
426
|
-
}
|
|
427
|
-
}
|
|
390
|
+
LLAMA_LOG_INFO("%s: reserving ...\n", __func__);
|
|
391
|
+
|
|
392
|
+
synchronize();
|
|
393
|
+
|
|
394
|
+
const int64_t t_start_us = ggml_time_us();
|
|
395
|
+
|
|
396
|
+
const uint32_t n_seqs = cparams.n_seq_max;
|
|
397
|
+
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
|
398
|
+
|
|
399
|
+
const size_t max_nodes = this->graph_max_nodes(n_tokens);
|
|
428
400
|
|
|
429
|
-
|
|
430
|
-
|
|
401
|
+
LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
|
|
402
|
+
|
|
403
|
+
gf_res_prev.reset(new llm_graph_result(max_nodes));
|
|
404
|
+
gf_res_reserve.reset(new llm_graph_result(max_nodes));
|
|
405
|
+
|
|
406
|
+
sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, cparams.pipeline_parallel, cparams.op_offload));
|
|
407
|
+
|
|
408
|
+
llama_memory_context_ptr mctx;
|
|
409
|
+
if (memory) {
|
|
410
|
+
LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
|
|
411
|
+
mctx = memory->init_full();
|
|
412
|
+
if (!mctx) {
|
|
413
|
+
throw std::runtime_error("failed to initialize memory module");
|
|
431
414
|
}
|
|
415
|
+
}
|
|
432
416
|
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
throw std::runtime_error("failed to allocate compute tg buffers");
|
|
438
|
-
}
|
|
417
|
+
// avoid reserving graphs with zero outputs - assume one output per sequence
|
|
418
|
+
const int n_outputs = n_seqs;
|
|
419
|
+
|
|
420
|
+
LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
|
|
439
421
|
|
|
440
|
-
|
|
441
|
-
|
|
422
|
+
// resolve automatic Flash Attention use
|
|
423
|
+
if (cparams.auto_fa) {
|
|
424
|
+
auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
|
|
425
|
+
if (!gf) {
|
|
426
|
+
throw std::runtime_error("failed to split graph for Flash Attention check");
|
|
442
427
|
}
|
|
443
428
|
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc);
|
|
451
|
-
if (!gf) {
|
|
452
|
-
throw std::runtime_error("failed to allocate compute pp buffers");
|
|
429
|
+
const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FATTN) + 1;
|
|
430
|
+
bool fa_device_mismatch = false;
|
|
431
|
+
for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
|
|
432
|
+
ggml_tensor * n = ggml_graph_node(gf, i);
|
|
433
|
+
if (n->op != GGML_OP_FLASH_ATTN_EXT) {
|
|
434
|
+
continue;
|
|
453
435
|
}
|
|
436
|
+
ggml_backend_dev_t device_fa = ggml_backend_get_device(
|
|
437
|
+
ggml_backend_sched_get_tensor_backend(sched.get(), n));
|
|
438
|
+
|
|
439
|
+
// TODO: instead of the tensor names, use a map to keep track of which (FA) tensors belong to which layer
|
|
440
|
+
GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FATTN "-", prefix_len) == 0);
|
|
441
|
+
const int il = std::stoi(n->name + prefix_len);
|
|
442
|
+
ggml_backend_dev_t device_kv = model.dev_layer(il);
|
|
443
|
+
if (device_fa != device_kv) {
|
|
444
|
+
LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the Flash Attention tensor "
|
|
445
|
+
"is assigned to device %s (usually due to missing support)\n",
|
|
446
|
+
__func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_fa));
|
|
447
|
+
// FIXME: fa_device_mismatch logic is wrong for --no-kv-offload, but this is broken anyways
|
|
448
|
+
fa_device_mismatch = true;
|
|
449
|
+
break;
|
|
450
|
+
}
|
|
451
|
+
}
|
|
452
|
+
if (fa_device_mismatch) {
|
|
453
|
+
cparams.flash_attn = false;
|
|
454
|
+
LLAMA_LOG_WARN("%s: Flash Attention was auto, set to disabled\n", __func__);
|
|
455
|
+
} else {
|
|
456
|
+
cparams.flash_attn = true;
|
|
457
|
+
LLAMA_LOG_INFO("%s: Flash Attention was auto, set to enabled\n", __func__);
|
|
454
458
|
}
|
|
455
459
|
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
460
|
+
cparams.auto_fa = false;
|
|
461
|
+
}
|
|
462
|
+
|
|
463
|
+
// reserve worst-case graph
|
|
464
|
+
int n_splits_pp = -1;
|
|
465
|
+
int n_nodes_pp = -1;
|
|
466
|
+
|
|
467
|
+
int n_splits_tg = -1;
|
|
468
|
+
int n_nodes_tg = -1;
|
|
469
|
+
|
|
470
|
+
// reserve pp (prompt processing) graph first so that buffers are only allocated once
|
|
471
|
+
{
|
|
472
|
+
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
|
|
473
|
+
model.hparams.no_alloc, model.hparams.no_alloc ? backend_buf_exp_size.data() : nullptr);
|
|
474
|
+
if (!gf) {
|
|
475
|
+
if (cparams.pipeline_parallel) {
|
|
476
|
+
LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
|
|
477
|
+
cparams.pipeline_parallel = false;
|
|
478
|
+
sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, cparams.op_offload));
|
|
479
|
+
gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
|
|
461
480
|
}
|
|
462
|
-
if (
|
|
463
|
-
|
|
464
|
-
ggml_backend_buft_name(buft),
|
|
465
|
-
backend_buf_exp_size[i] / 1024.0 / 1024.0);
|
|
481
|
+
if (!gf) {
|
|
482
|
+
throw std::runtime_error("failed to allocate compute pp buffers");
|
|
466
483
|
}
|
|
467
484
|
}
|
|
468
485
|
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg);
|
|
473
|
-
}
|
|
486
|
+
n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
|
|
487
|
+
n_nodes_pp = ggml_graph_n_nodes(gf);
|
|
488
|
+
}
|
|
474
489
|
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
490
|
+
// reserve with tg (token generation) graph to get the number of splits and nodes
|
|
491
|
+
{
|
|
492
|
+
auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc);
|
|
493
|
+
if (!gf) {
|
|
494
|
+
throw std::runtime_error("failed to allocate compute tg buffers");
|
|
479
495
|
}
|
|
496
|
+
|
|
497
|
+
n_splits_tg = ggml_backend_sched_get_n_splits(sched.get());
|
|
498
|
+
n_nodes_tg = ggml_graph_n_nodes(gf);
|
|
480
499
|
}
|
|
481
500
|
|
|
482
|
-
//
|
|
501
|
+
// reserve again with pp graph to avoid ggml-alloc reallocations during inference
|
|
483
502
|
{
|
|
484
|
-
|
|
503
|
+
// TODO: not sure if the following graph would be a worse case for multi-stream KV caches:
|
|
504
|
+
//
|
|
505
|
+
// auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
|
|
506
|
+
//
|
|
507
|
+
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc);
|
|
508
|
+
if (!gf) {
|
|
509
|
+
throw std::runtime_error("failed to allocate compute pp buffers");
|
|
510
|
+
}
|
|
511
|
+
}
|
|
485
512
|
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
513
|
+
for (size_t i = 0; i < backend_ptrs.size(); ++i) {
|
|
514
|
+
ggml_backend_t backend = backend_ptrs[i];
|
|
515
|
+
ggml_backend_buffer_type_t buft = backend_buft[i];
|
|
516
|
+
if (!model.hparams.no_alloc) {
|
|
517
|
+
backend_buf_exp_size[i] = ggml_backend_sched_get_buffer_size(sched.get(), backend);
|
|
518
|
+
}
|
|
519
|
+
if (backend_buf_exp_size[i] > 1) {
|
|
520
|
+
LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
|
|
521
|
+
ggml_backend_buft_name(buft),
|
|
522
|
+
backend_buf_exp_size[i] / 1024.0 / 1024.0);
|
|
489
523
|
}
|
|
490
524
|
}
|
|
491
|
-
}
|
|
492
525
|
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
526
|
+
if (n_nodes_pp == n_nodes_tg) {
|
|
527
|
+
LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp);
|
|
528
|
+
} else {
|
|
529
|
+
LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg);
|
|
530
|
+
}
|
|
498
531
|
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
__func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
|
|
504
|
-
} else {
|
|
505
|
-
LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
|
|
506
|
-
__func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
|
|
507
|
-
}
|
|
508
|
-
}
|
|
532
|
+
if (n_splits_pp == n_splits_tg) {
|
|
533
|
+
LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp);
|
|
534
|
+
} else {
|
|
535
|
+
LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg);
|
|
509
536
|
}
|
|
510
|
-
|
|
537
|
+
|
|
538
|
+
const int64_t t_end_us = ggml_time_us();
|
|
539
|
+
|
|
540
|
+
LLAMA_LOG_INFO("%s: reserve took %.2f ms, sched copies = %d\n",
|
|
541
|
+
__func__, (t_end_us - t_start_us)/1000.0, ggml_backend_sched_get_n_copies(sched.get()));
|
|
511
542
|
}
|
|
512
543
|
|
|
513
544
|
void llama_context::synchronize() {
|
|
545
|
+
if (!sched) {
|
|
546
|
+
return;
|
|
547
|
+
}
|
|
548
|
+
|
|
514
549
|
ggml_backend_sched_synchronize(sched.get());
|
|
515
550
|
|
|
516
551
|
// FIXME: if multiple single tokens are evaluated without a synchronization,
|
|
@@ -951,21 +986,41 @@ void llama_context::set_embeddings(bool value) {
|
|
|
951
986
|
LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
|
|
952
987
|
|
|
953
988
|
cparams.embeddings = value;
|
|
989
|
+
|
|
990
|
+
// TODO: not sure yet if we want to reserve here
|
|
991
|
+
//sched_need_reserve = true;
|
|
954
992
|
}
|
|
955
993
|
|
|
956
994
|
void llama_context::set_causal_attn(bool value) {
|
|
957
995
|
LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
|
|
958
996
|
|
|
997
|
+
if (cparams.causal_attn == value) {
|
|
998
|
+
return;
|
|
999
|
+
}
|
|
1000
|
+
|
|
959
1001
|
cparams.causal_attn = value;
|
|
1002
|
+
|
|
1003
|
+
sched_need_reserve = true;
|
|
960
1004
|
}
|
|
961
1005
|
|
|
962
1006
|
void llama_context::set_warmup(bool value) {
|
|
963
1007
|
LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
|
|
964
1008
|
|
|
1009
|
+
if (cparams.warmup == value) {
|
|
1010
|
+
return;
|
|
1011
|
+
}
|
|
1012
|
+
|
|
965
1013
|
cparams.warmup = value;
|
|
1014
|
+
|
|
1015
|
+
// warmups are usually with small batches, so no need to reserve
|
|
1016
|
+
//sched_need_reserve = true;
|
|
966
1017
|
}
|
|
967
1018
|
|
|
968
1019
|
bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
|
|
1020
|
+
if (!sampler && sampling.samplers.count(seq_id) == 0) {
|
|
1021
|
+
return true;
|
|
1022
|
+
}
|
|
1023
|
+
|
|
969
1024
|
LLAMA_LOG_DEBUG("%s: seq_id = %d, sampler = %p\n", __func__, (int) seq_id, (void *) sampler);
|
|
970
1025
|
|
|
971
1026
|
const bool can_offload =
|
|
@@ -985,12 +1040,18 @@ bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
|
|
|
985
1040
|
|
|
986
1041
|
sampling.samplers[seq_id] = sampler;
|
|
987
1042
|
|
|
1043
|
+
sched_need_reserve = true;
|
|
1044
|
+
|
|
988
1045
|
return true;
|
|
989
1046
|
}
|
|
990
1047
|
|
|
991
1048
|
if (sampler && !can_offload) {
|
|
992
1049
|
LLAMA_LOG_WARN("%s: sampler '%s' for seq_id = %d, cannot be offloaded to the backend\n", __func__, llama_sampler_name(sampler), seq_id);
|
|
993
1050
|
|
|
1051
|
+
if (sampling.samplers.count(seq_id) > 0) {
|
|
1052
|
+
sched_need_reserve = true;
|
|
1053
|
+
}
|
|
1054
|
+
|
|
994
1055
|
sampling.samplers.erase(seq_id);
|
|
995
1056
|
|
|
996
1057
|
return false;
|
|
@@ -998,6 +1059,8 @@ bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
|
|
|
998
1059
|
|
|
999
1060
|
sampling.samplers.erase(seq_id);
|
|
1000
1061
|
|
|
1062
|
+
sched_need_reserve = true;
|
|
1063
|
+
|
|
1001
1064
|
return true;
|
|
1002
1065
|
}
|
|
1003
1066
|
|
|
@@ -1006,16 +1069,27 @@ void llama_context::set_adapter_lora(
|
|
|
1006
1069
|
float scale) {
|
|
1007
1070
|
LLAMA_LOG_DEBUG("%s: adapter = %p, scale = %f\n", __func__, (void *) adapter, scale);
|
|
1008
1071
|
|
|
1072
|
+
if (auto it = loras.find(adapter); it != loras.end()) {
|
|
1073
|
+
if (it->second == scale) {
|
|
1074
|
+
return;
|
|
1075
|
+
}
|
|
1076
|
+
}
|
|
1077
|
+
|
|
1009
1078
|
loras[adapter] = scale;
|
|
1079
|
+
|
|
1080
|
+
sched_need_reserve = true;
|
|
1010
1081
|
}
|
|
1011
1082
|
|
|
1012
1083
|
bool llama_context::rm_adapter_lora(
|
|
1013
1084
|
llama_adapter_lora * adapter) {
|
|
1014
1085
|
LLAMA_LOG_DEBUG("%s: adapter = %p\n", __func__, (void *) adapter);
|
|
1015
1086
|
|
|
1016
|
-
auto
|
|
1017
|
-
if (
|
|
1018
|
-
loras.erase(
|
|
1087
|
+
auto it = loras.find(adapter);
|
|
1088
|
+
if (it != loras.end()) {
|
|
1089
|
+
loras.erase(it);
|
|
1090
|
+
|
|
1091
|
+
sched_need_reserve = true;
|
|
1092
|
+
|
|
1019
1093
|
return true;
|
|
1020
1094
|
}
|
|
1021
1095
|
|
|
@@ -1025,7 +1099,13 @@ bool llama_context::rm_adapter_lora(
|
|
|
1025
1099
|
void llama_context::clear_adapter_lora() {
|
|
1026
1100
|
LLAMA_LOG_DEBUG("%s: call\n", __func__);
|
|
1027
1101
|
|
|
1102
|
+
if (loras.empty()) {
|
|
1103
|
+
return;
|
|
1104
|
+
}
|
|
1105
|
+
|
|
1028
1106
|
loras.clear();
|
|
1107
|
+
|
|
1108
|
+
sched_need_reserve = true;
|
|
1029
1109
|
}
|
|
1030
1110
|
|
|
1031
1111
|
bool llama_context::apply_adapter_cvec(
|
|
@@ -1036,6 +1116,8 @@ bool llama_context::apply_adapter_cvec(
|
|
|
1036
1116
|
int32_t il_end) {
|
|
1037
1117
|
LLAMA_LOG_DEBUG("%s: il_start = %d, il_end = %d\n", __func__, il_start, il_end);
|
|
1038
1118
|
|
|
1119
|
+
// TODO: should we reserve?
|
|
1120
|
+
|
|
1039
1121
|
return cvec.apply(model, data, len, n_embd, il_start, il_end);
|
|
1040
1122
|
}
|
|
1041
1123
|
|
|
@@ -1138,6 +1220,8 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
|
|
1138
1220
|
// TODO: this clear of the buffer can easily be forgotten - need something better
|
|
1139
1221
|
embd_seq.clear();
|
|
1140
1222
|
|
|
1223
|
+
sched_reserve();
|
|
1224
|
+
|
|
1141
1225
|
n_queued_tokens += n_tokens;
|
|
1142
1226
|
|
|
1143
1227
|
// reserve output buffer
|
|
@@ -1177,7 +1261,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
|
|
1177
1261
|
auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd();
|
|
1178
1262
|
|
|
1179
1263
|
// extract logits
|
|
1180
|
-
|
|
1264
|
+
if (logits && t_logits) {
|
|
1181
1265
|
ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
|
|
1182
1266
|
GGML_ASSERT(backend_res != nullptr);
|
|
1183
1267
|
GGML_ASSERT(logits != nullptr);
|
|
@@ -1451,6 +1535,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
|
|
1451
1535
|
embd_seq.clear();
|
|
1452
1536
|
output_swaps.clear();
|
|
1453
1537
|
|
|
1538
|
+
sched_reserve();
|
|
1539
|
+
|
|
1454
1540
|
bool did_optimize = false;
|
|
1455
1541
|
|
|
1456
1542
|
// handle any pending shifts/copies
|
|
@@ -1955,7 +2041,9 @@ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
|
|
|
1955
2041
|
return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
|
|
1956
2042
|
}
|
|
1957
2043
|
uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
|
|
1958
|
-
|
|
2044
|
+
for (const auto & lora : model.loras) {
|
|
2045
|
+
res += lora->get_n_nodes();
|
|
2046
|
+
}
|
|
1959
2047
|
return res;
|
|
1960
2048
|
}
|
|
1961
2049
|
|
|
@@ -40,6 +40,14 @@ struct llama_context {
|
|
|
40
40
|
|
|
41
41
|
~llama_context();
|
|
42
42
|
|
|
43
|
+
// reserve a new backend scheduler (if needed)
|
|
44
|
+
// for example, when:
|
|
45
|
+
// - changing loras
|
|
46
|
+
// - changing samplers
|
|
47
|
+
// - changing attention type
|
|
48
|
+
// - etc.
|
|
49
|
+
void sched_reserve();
|
|
50
|
+
|
|
43
51
|
void synchronize();
|
|
44
52
|
|
|
45
53
|
const llama_model & get_model() const;
|
|
@@ -314,6 +322,8 @@ private:
|
|
|
314
322
|
|
|
315
323
|
ggml_backend_sched_ptr sched;
|
|
316
324
|
|
|
325
|
+
bool sched_need_reserve = true;
|
|
326
|
+
|
|
317
327
|
ggml_backend_t backend_cpu = nullptr;
|
|
318
328
|
std::vector<ggml_backend_ptr> backends;
|
|
319
329
|
|
|
@@ -30,10 +30,12 @@ struct llama_cparams {
|
|
|
30
30
|
bool causal_attn;
|
|
31
31
|
bool offload_kqv;
|
|
32
32
|
bool flash_attn;
|
|
33
|
+
bool auto_fa;
|
|
33
34
|
bool no_perf;
|
|
34
35
|
bool warmup;
|
|
35
36
|
bool op_offload;
|
|
36
37
|
bool kv_unified;
|
|
38
|
+
bool pipeline_parallel;
|
|
37
39
|
|
|
38
40
|
enum llama_pooling_type pooling_type;
|
|
39
41
|
|
|
@@ -200,42 +200,6 @@ uint32_t llama_hparams::n_layer_kv() const {
|
|
|
200
200
|
return res;
|
|
201
201
|
}
|
|
202
202
|
|
|
203
|
-
bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
|
|
204
|
-
assert(p0 >= 0 && p1 >= 0);
|
|
205
|
-
|
|
206
|
-
switch (swa_type) {
|
|
207
|
-
case LLAMA_SWA_TYPE_NONE:
|
|
208
|
-
{
|
|
209
|
-
} break;
|
|
210
|
-
case LLAMA_SWA_TYPE_STANDARD:
|
|
211
|
-
{
|
|
212
|
-
if (p1 - p0 >= (int32_t) n_swa) {
|
|
213
|
-
return true;
|
|
214
|
-
}
|
|
215
|
-
} break;
|
|
216
|
-
case LLAMA_SWA_TYPE_CHUNKED:
|
|
217
|
-
{
|
|
218
|
-
const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
|
|
219
|
-
|
|
220
|
-
if (p0 < pos_chunk_start) {
|
|
221
|
-
return true;
|
|
222
|
-
}
|
|
223
|
-
} break;
|
|
224
|
-
case LLAMA_SWA_TYPE_SYMMETRIC:
|
|
225
|
-
{
|
|
226
|
-
const int32_t half_n_swa = (int32_t) n_swa / 2;
|
|
227
|
-
const int32_t pos_diff = p1 - p0;
|
|
228
|
-
|
|
229
|
-
// Mask if outside the symmetric window
|
|
230
|
-
if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
|
|
231
|
-
return true;
|
|
232
|
-
}
|
|
233
|
-
} break;
|
|
234
|
-
}
|
|
235
|
-
|
|
236
|
-
return false;
|
|
237
|
-
}
|
|
238
|
-
|
|
239
203
|
bool llama_hparams::use_mrope() const {
|
|
240
204
|
return rope_sections[0] > 0 && rope_sections[1] > 0;
|
|
241
205
|
}
|