@fugood/llama.node 1.4.15 → 1.6.0-rc.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/lib/binding.ts +1 -5
  2. package/lib/index.js +2 -2
  3. package/lib/index.ts +2 -2
  4. package/package.json +15 -15
  5. package/scripts/llama.cpp.patch +76 -61
  6. package/src/LlamaContext.cpp +20 -32
  7. package/src/llama.cpp/common/CMakeLists.txt +12 -0
  8. package/src/llama.cpp/common/arg.cpp +20 -0
  9. package/src/llama.cpp/common/chat-parser.cpp +3 -3
  10. package/src/llama.cpp/common/chat-parser.h +4 -4
  11. package/src/llama.cpp/common/chat.cpp +289 -34
  12. package/src/llama.cpp/common/chat.h +32 -20
  13. package/src/llama.cpp/common/common.cpp +0 -1
  14. package/src/llama.cpp/common/common.h +31 -25
  15. package/src/llama.cpp/common/download.cpp +19 -14
  16. package/src/llama.cpp/common/jinja/caps.cpp +237 -0
  17. package/src/llama.cpp/common/jinja/caps.h +24 -0
  18. package/src/llama.cpp/common/jinja/lexer.cpp +341 -0
  19. package/src/llama.cpp/common/jinja/lexer.h +157 -0
  20. package/src/llama.cpp/common/jinja/parser.cpp +591 -0
  21. package/src/llama.cpp/common/jinja/parser.h +21 -0
  22. package/src/llama.cpp/common/jinja/runtime.cpp +865 -0
  23. package/src/llama.cpp/common/jinja/runtime.h +628 -0
  24. package/src/llama.cpp/common/jinja/string.cpp +207 -0
  25. package/src/llama.cpp/common/jinja/string.h +58 -0
  26. package/src/llama.cpp/common/jinja/utils.h +49 -0
  27. package/src/llama.cpp/common/jinja/value.cpp +1221 -0
  28. package/src/llama.cpp/common/jinja/value.h +464 -0
  29. package/src/llama.cpp/common/json-partial.h +1 -0
  30. package/src/llama.cpp/common/sampling.cpp +52 -19
  31. package/src/llama.cpp/ggml/include/ggml.h +39 -7
  32. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -37
  34. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +31 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +18 -0
  36. package/src/llama.cpp/include/llama-cpp.h +3 -1
  37. package/src/llama.cpp/include/llama.h +29 -2
  38. package/src/llama.cpp/src/llama-adapter.cpp +7 -13
  39. package/src/llama.cpp/src/llama-adapter.h +1 -3
  40. package/src/llama.cpp/src/llama-context.cpp +232 -144
  41. package/src/llama.cpp/src/llama-context.h +10 -0
  42. package/src/llama.cpp/src/llama-cparams.h +2 -0
  43. package/src/llama.cpp/src/llama-hparams.cpp +0 -36
  44. package/src/llama.cpp/src/llama-hparams.h +38 -1
  45. package/src/llama.cpp/src/llama-kv-cache.cpp +201 -59
  46. package/src/llama.cpp/src/llama-kv-cache.h +0 -2
  47. package/src/llama.cpp/src/llama-mmap.cpp +5 -1
  48. package/src/llama.cpp/src/llama-model-loader.cpp +21 -7
  49. package/src/llama.cpp/src/llama-model.cpp +5 -1
  50. package/src/llama.cpp/src/llama-model.h +3 -2
  51. package/src/llama.cpp/src/llama-sampling.cpp +170 -13
@@ -146,6 +146,7 @@ llama_context::llama_context(
146
146
  }
147
147
 
148
148
  cparams.flash_attn = params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED;
149
+ cparams.auto_fa = params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO;
149
150
 
150
151
  // with causal attention, the batch size is limited by the context size
151
152
  cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
@@ -155,6 +156,9 @@ llama_context::llama_context(
155
156
  cparams.op_offload = params.op_offload;
156
157
  cparams.kv_unified = params.kv_unified;
157
158
 
159
+ // initialized later
160
+ cparams.pipeline_parallel = false;
161
+
158
162
  {
159
163
  const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
160
164
  graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;
@@ -302,16 +306,6 @@ llama_context::llama_context(
302
306
 
303
307
  LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());
304
308
 
305
- const uint32_t n_seqs = cparams.n_seq_max;
306
- const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
307
-
308
- const size_t max_nodes = this->graph_max_nodes(n_tokens);
309
-
310
- LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
311
-
312
- gf_res_prev.reset(new llm_graph_result(max_nodes));
313
- gf_res_reserve.reset(new llm_graph_result(max_nodes));
314
-
315
309
  // TODO: move these checks to ggml_backend_sched
316
310
  // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
317
311
  bool pipeline_parallel =
@@ -340,177 +334,218 @@ llama_context::llama_context(
340
334
  }
341
335
  }
342
336
 
343
- sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel, cparams.op_offload));
337
+ cparams.pipeline_parallel = pipeline_parallel;
344
338
 
345
- if (pipeline_parallel) {
346
- LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get()));
339
+ if (cparams.pipeline_parallel) {
340
+ LLAMA_LOG_INFO("%s: pipeline parallelism enabled\n", __func__);
347
341
  }
348
342
 
349
- llama_memory_context_ptr mctx;
350
- if (memory) {
351
- LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
352
- mctx = memory->init_full();
353
- if (!mctx) {
354
- throw std::runtime_error("failed to initialize memory module");
343
+ sched_reserve();
344
+
345
+ if (!cparams.flash_attn) {
346
+ if (ggml_is_quantized(params.type_v)) {
347
+ throw std::runtime_error("quantized V cache was requested, but this requires Flash Attention");
355
348
  }
356
349
  }
350
+ }
357
351
 
358
- cross.v_embd.clear();
359
-
360
- // avoid reserving graphs with zero outputs - assume one output per sequence
361
- n_outputs = n_seqs;
362
-
363
- LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
352
+ // Initialize the full vocabulary token ids for backend samplers.
353
+ {
354
+ const int n_vocab = model.vocab.n_tokens();
364
355
 
365
- // resolve automatic Flash Attention use
366
- if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO) {
367
- auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
368
- if (!gf) {
369
- throw std::runtime_error("failed to split graph for Flash Attention check");
370
- }
356
+ sampling.token_ids_full_vocab.resize(n_vocab);
357
+ for (int i = 0; i < n_vocab; ++i) {
358
+ sampling.token_ids_full_vocab[i] = i;
359
+ }
360
+ }
361
+ }
371
362
 
372
- const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FATTN) + 1;
373
- bool fa_device_mismatch = false;
374
- for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
375
- ggml_tensor * n = ggml_graph_node(gf, i);
376
- if (n->op != GGML_OP_FLASH_ATTN_EXT) {
377
- continue;
378
- }
379
- ggml_backend_dev_t device_fa = ggml_backend_get_device(
380
- ggml_backend_sched_get_tensor_backend(sched.get(), n));
363
+ llama_context::~llama_context() {
364
+ if (!model.hparams.no_alloc) {
365
+ for (size_t i = 0; i < backend_ptrs.size(); ++i) {
366
+ ggml_backend_t backend = backend_ptrs[i];
367
+ ggml_backend_buffer_type_t buft = backend_buft[i];
381
368
 
382
- // TODO: instead of the tensor names, use a map to keep track of which (FA) tensors belong to which layer
383
- GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FATTN "-", prefix_len) == 0);
384
- const int il = std::stoi(n->name + prefix_len);
385
- ggml_backend_dev_t device_kv = model.dev_layer(il);
386
- if (device_fa != device_kv) {
387
- LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the Flash Attention tensor "
388
- "is assigned to device %s (usually due to missing support)\n",
389
- __func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_fa));
390
- // FIXME: fa_device_mismatch logic is wrong for --no-kv-offload, but this is broken anyways
391
- fa_device_mismatch = true;
392
- break;
393
- }
394
- }
395
- if (fa_device_mismatch) {
396
- cparams.flash_attn = false;
397
- LLAMA_LOG_WARN("%s: Flash Attention was auto, set to disabled\n", __func__);
398
- if (ggml_is_quantized(params.type_v)) {
399
- throw std::runtime_error("quantized V cache was requested, but this requires Flash Attention");
400
- }
369
+ const size_t size_exp = backend_buf_exp_size[i];
370
+ const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
371
+ if (size_exp == size_act) {
372
+ LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
373
+ __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
401
374
  } else {
402
- cparams.flash_attn = true;
403
- LLAMA_LOG_INFO("%s: Flash Attention was auto, set to enabled\n", __func__);
375
+ LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
376
+ __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
404
377
  }
405
378
  }
379
+ }
380
+ ggml_opt_free(opt_ctx);
381
+ }
406
382
 
407
- // reserve worst-case graph
408
- int n_splits_pp = -1;
409
- int n_nodes_pp = -1;
383
+ void llama_context::sched_reserve() {
384
+ if (!sched_need_reserve) {
385
+ return;
386
+ }
410
387
 
411
- int n_splits_tg = -1;
412
- int n_nodes_tg = -1;
388
+ sched_need_reserve = false;
413
389
 
414
- // reserve pp (prompt processing) graph first so that buffers are only allocated once
415
- {
416
- auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
417
- model.hparams.no_alloc, model.hparams.no_alloc ? backend_buf_exp_size.data() : nullptr);
418
- if (!gf) {
419
- if (pipeline_parallel) {
420
- LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
421
- sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, cparams.op_offload));
422
- gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
423
- }
424
- if (!gf) {
425
- throw std::runtime_error("failed to allocate compute pp buffers");
426
- }
427
- }
390
+ LLAMA_LOG_INFO("%s: reserving ...\n", __func__);
391
+
392
+ synchronize();
393
+
394
+ const int64_t t_start_us = ggml_time_us();
395
+
396
+ const uint32_t n_seqs = cparams.n_seq_max;
397
+ const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
398
+
399
+ const size_t max_nodes = this->graph_max_nodes(n_tokens);
428
400
 
429
- n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
430
- n_nodes_pp = ggml_graph_n_nodes(gf);
401
+ LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
402
+
403
+ gf_res_prev.reset(new llm_graph_result(max_nodes));
404
+ gf_res_reserve.reset(new llm_graph_result(max_nodes));
405
+
406
+ sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, cparams.pipeline_parallel, cparams.op_offload));
407
+
408
+ llama_memory_context_ptr mctx;
409
+ if (memory) {
410
+ LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
411
+ mctx = memory->init_full();
412
+ if (!mctx) {
413
+ throw std::runtime_error("failed to initialize memory module");
431
414
  }
415
+ }
432
416
 
433
- // reserve with tg (token generation) graph to get the number of splits and nodes
434
- {
435
- auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc);
436
- if (!gf) {
437
- throw std::runtime_error("failed to allocate compute tg buffers");
438
- }
417
+ // avoid reserving graphs with zero outputs - assume one output per sequence
418
+ const int n_outputs = n_seqs;
419
+
420
+ LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
439
421
 
440
- n_splits_tg = ggml_backend_sched_get_n_splits(sched.get());
441
- n_nodes_tg = ggml_graph_n_nodes(gf);
422
+ // resolve automatic Flash Attention use
423
+ if (cparams.auto_fa) {
424
+ auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
425
+ if (!gf) {
426
+ throw std::runtime_error("failed to split graph for Flash Attention check");
442
427
  }
443
428
 
444
- // reserve again with pp graph to avoid ggml-alloc reallocations during inference
445
- {
446
- // TODO: not sure if the following graph would be a worse case for multi-stream KV caches:
447
- //
448
- // auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
449
- //
450
- auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc);
451
- if (!gf) {
452
- throw std::runtime_error("failed to allocate compute pp buffers");
429
+ const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FATTN) + 1;
430
+ bool fa_device_mismatch = false;
431
+ for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
432
+ ggml_tensor * n = ggml_graph_node(gf, i);
433
+ if (n->op != GGML_OP_FLASH_ATTN_EXT) {
434
+ continue;
453
435
  }
436
+ ggml_backend_dev_t device_fa = ggml_backend_get_device(
437
+ ggml_backend_sched_get_tensor_backend(sched.get(), n));
438
+
439
+ // TODO: instead of the tensor names, use a map to keep track of which (FA) tensors belong to which layer
440
+ GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FATTN "-", prefix_len) == 0);
441
+ const int il = std::stoi(n->name + prefix_len);
442
+ ggml_backend_dev_t device_kv = model.dev_layer(il);
443
+ if (device_fa != device_kv) {
444
+ LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the Flash Attention tensor "
445
+ "is assigned to device %s (usually due to missing support)\n",
446
+ __func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_fa));
447
+ // FIXME: fa_device_mismatch logic is wrong for --no-kv-offload, but this is broken anyways
448
+ fa_device_mismatch = true;
449
+ break;
450
+ }
451
+ }
452
+ if (fa_device_mismatch) {
453
+ cparams.flash_attn = false;
454
+ LLAMA_LOG_WARN("%s: Flash Attention was auto, set to disabled\n", __func__);
455
+ } else {
456
+ cparams.flash_attn = true;
457
+ LLAMA_LOG_INFO("%s: Flash Attention was auto, set to enabled\n", __func__);
454
458
  }
455
459
 
456
- for (size_t i = 0; i < backend_ptrs.size(); ++i) {
457
- ggml_backend_t backend = backend_ptrs[i];
458
- ggml_backend_buffer_type_t buft = backend_buft[i];
459
- if (!model.hparams.no_alloc) {
460
- backend_buf_exp_size[i] = ggml_backend_sched_get_buffer_size(sched.get(), backend);
460
+ cparams.auto_fa = false;
461
+ }
462
+
463
+ // reserve worst-case graph
464
+ int n_splits_pp = -1;
465
+ int n_nodes_pp = -1;
466
+
467
+ int n_splits_tg = -1;
468
+ int n_nodes_tg = -1;
469
+
470
+ // reserve pp (prompt processing) graph first so that buffers are only allocated once
471
+ {
472
+ auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
473
+ model.hparams.no_alloc, model.hparams.no_alloc ? backend_buf_exp_size.data() : nullptr);
474
+ if (!gf) {
475
+ if (cparams.pipeline_parallel) {
476
+ LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
477
+ cparams.pipeline_parallel = false;
478
+ sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, cparams.op_offload));
479
+ gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
461
480
  }
462
- if (backend_buf_exp_size[i] > 1) {
463
- LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
464
- ggml_backend_buft_name(buft),
465
- backend_buf_exp_size[i] / 1024.0 / 1024.0);
481
+ if (!gf) {
482
+ throw std::runtime_error("failed to allocate compute pp buffers");
466
483
  }
467
484
  }
468
485
 
469
- if (n_nodes_pp == n_nodes_tg) {
470
- LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp);
471
- } else {
472
- LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg);
473
- }
486
+ n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
487
+ n_nodes_pp = ggml_graph_n_nodes(gf);
488
+ }
474
489
 
475
- if (n_splits_pp == n_splits_tg) {
476
- LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp);
477
- } else {
478
- LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg);
490
+ // reserve with tg (token generation) graph to get the number of splits and nodes
491
+ {
492
+ auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc);
493
+ if (!gf) {
494
+ throw std::runtime_error("failed to allocate compute tg buffers");
479
495
  }
496
+
497
+ n_splits_tg = ggml_backend_sched_get_n_splits(sched.get());
498
+ n_nodes_tg = ggml_graph_n_nodes(gf);
480
499
  }
481
500
 
482
- // Initialize the full vocabulary token ids for backend samplers.
501
+ // reserve again with pp graph to avoid ggml-alloc reallocations during inference
483
502
  {
484
- const int n_vocab = model.vocab.n_tokens();
503
+ // TODO: not sure if the following graph would be a worse case for multi-stream KV caches:
504
+ //
505
+ // auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
506
+ //
507
+ auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc);
508
+ if (!gf) {
509
+ throw std::runtime_error("failed to allocate compute pp buffers");
510
+ }
511
+ }
485
512
 
486
- sampling.token_ids_full_vocab.resize(n_vocab);
487
- for (int i = 0; i < n_vocab; ++i) {
488
- sampling.token_ids_full_vocab[i] = i;
513
+ for (size_t i = 0; i < backend_ptrs.size(); ++i) {
514
+ ggml_backend_t backend = backend_ptrs[i];
515
+ ggml_backend_buffer_type_t buft = backend_buft[i];
516
+ if (!model.hparams.no_alloc) {
517
+ backend_buf_exp_size[i] = ggml_backend_sched_get_buffer_size(sched.get(), backend);
518
+ }
519
+ if (backend_buf_exp_size[i] > 1) {
520
+ LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
521
+ ggml_backend_buft_name(buft),
522
+ backend_buf_exp_size[i] / 1024.0 / 1024.0);
489
523
  }
490
524
  }
491
- }
492
525
 
493
- llama_context::~llama_context() {
494
- if (!model.hparams.no_alloc) {
495
- for (size_t i = 0; i < backend_ptrs.size(); ++i) {
496
- ggml_backend_t backend = backend_ptrs[i];
497
- ggml_backend_buffer_type_t buft = backend_buft[i];
526
+ if (n_nodes_pp == n_nodes_tg) {
527
+ LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp);
528
+ } else {
529
+ LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg);
530
+ }
498
531
 
499
- const size_t size_exp = backend_buf_exp_size[i];
500
- const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
501
- if (size_exp == size_act) {
502
- LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
503
- __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
504
- } else {
505
- LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
506
- __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
507
- }
508
- }
532
+ if (n_splits_pp == n_splits_tg) {
533
+ LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp);
534
+ } else {
535
+ LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg);
509
536
  }
510
- ggml_opt_free(opt_ctx);
537
+
538
+ const int64_t t_end_us = ggml_time_us();
539
+
540
+ LLAMA_LOG_INFO("%s: reserve took %.2f ms, sched copies = %d\n",
541
+ __func__, (t_end_us - t_start_us)/1000.0, ggml_backend_sched_get_n_copies(sched.get()));
511
542
  }
512
543
 
513
544
  void llama_context::synchronize() {
545
+ if (!sched) {
546
+ return;
547
+ }
548
+
514
549
  ggml_backend_sched_synchronize(sched.get());
515
550
 
516
551
  // FIXME: if multiple single tokens are evaluated without a synchronization,
@@ -951,21 +986,41 @@ void llama_context::set_embeddings(bool value) {
951
986
  LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
952
987
 
953
988
  cparams.embeddings = value;
989
+
990
+ // TODO: not sure yet if we want to reserve here
991
+ //sched_need_reserve = true;
954
992
  }
955
993
 
956
994
  void llama_context::set_causal_attn(bool value) {
957
995
  LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
958
996
 
997
+ if (cparams.causal_attn == value) {
998
+ return;
999
+ }
1000
+
959
1001
  cparams.causal_attn = value;
1002
+
1003
+ sched_need_reserve = true;
960
1004
  }
961
1005
 
962
1006
  void llama_context::set_warmup(bool value) {
963
1007
  LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
964
1008
 
1009
+ if (cparams.warmup == value) {
1010
+ return;
1011
+ }
1012
+
965
1013
  cparams.warmup = value;
1014
+
1015
+ // warmups are usually with small batches, so no need to reserve
1016
+ //sched_need_reserve = true;
966
1017
  }
967
1018
 
968
1019
  bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
1020
+ if (!sampler && sampling.samplers.count(seq_id) == 0) {
1021
+ return true;
1022
+ }
1023
+
969
1024
  LLAMA_LOG_DEBUG("%s: seq_id = %d, sampler = %p\n", __func__, (int) seq_id, (void *) sampler);
970
1025
 
971
1026
  const bool can_offload =
@@ -985,12 +1040,18 @@ bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
985
1040
 
986
1041
  sampling.samplers[seq_id] = sampler;
987
1042
 
1043
+ sched_need_reserve = true;
1044
+
988
1045
  return true;
989
1046
  }
990
1047
 
991
1048
  if (sampler && !can_offload) {
992
1049
  LLAMA_LOG_WARN("%s: sampler '%s' for seq_id = %d, cannot be offloaded to the backend\n", __func__, llama_sampler_name(sampler), seq_id);
993
1050
 
1051
+ if (sampling.samplers.count(seq_id) > 0) {
1052
+ sched_need_reserve = true;
1053
+ }
1054
+
994
1055
  sampling.samplers.erase(seq_id);
995
1056
 
996
1057
  return false;
@@ -998,6 +1059,8 @@ bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
998
1059
 
999
1060
  sampling.samplers.erase(seq_id);
1000
1061
 
1062
+ sched_need_reserve = true;
1063
+
1001
1064
  return true;
1002
1065
  }
1003
1066
 
@@ -1006,16 +1069,27 @@ void llama_context::set_adapter_lora(
1006
1069
  float scale) {
1007
1070
  LLAMA_LOG_DEBUG("%s: adapter = %p, scale = %f\n", __func__, (void *) adapter, scale);
1008
1071
 
1072
+ if (auto it = loras.find(adapter); it != loras.end()) {
1073
+ if (it->second == scale) {
1074
+ return;
1075
+ }
1076
+ }
1077
+
1009
1078
  loras[adapter] = scale;
1079
+
1080
+ sched_need_reserve = true;
1010
1081
  }
1011
1082
 
1012
1083
  bool llama_context::rm_adapter_lora(
1013
1084
  llama_adapter_lora * adapter) {
1014
1085
  LLAMA_LOG_DEBUG("%s: adapter = %p\n", __func__, (void *) adapter);
1015
1086
 
1016
- auto pos = loras.find(adapter);
1017
- if (pos != loras.end()) {
1018
- loras.erase(pos);
1087
+ auto it = loras.find(adapter);
1088
+ if (it != loras.end()) {
1089
+ loras.erase(it);
1090
+
1091
+ sched_need_reserve = true;
1092
+
1019
1093
  return true;
1020
1094
  }
1021
1095
 
@@ -1025,7 +1099,13 @@ bool llama_context::rm_adapter_lora(
1025
1099
  void llama_context::clear_adapter_lora() {
1026
1100
  LLAMA_LOG_DEBUG("%s: call\n", __func__);
1027
1101
 
1102
+ if (loras.empty()) {
1103
+ return;
1104
+ }
1105
+
1028
1106
  loras.clear();
1107
+
1108
+ sched_need_reserve = true;
1029
1109
  }
1030
1110
 
1031
1111
  bool llama_context::apply_adapter_cvec(
@@ -1036,6 +1116,8 @@ bool llama_context::apply_adapter_cvec(
1036
1116
  int32_t il_end) {
1037
1117
  LLAMA_LOG_DEBUG("%s: il_start = %d, il_end = %d\n", __func__, il_start, il_end);
1038
1118
 
1119
+ // TODO: should we reserve?
1120
+
1039
1121
  return cvec.apply(model, data, len, n_embd, il_start, il_end);
1040
1122
  }
1041
1123
 
@@ -1138,6 +1220,8 @@ int llama_context::encode(const llama_batch & batch_inp) {
1138
1220
  // TODO: this clear of the buffer can easily be forgotten - need something better
1139
1221
  embd_seq.clear();
1140
1222
 
1223
+ sched_reserve();
1224
+
1141
1225
  n_queued_tokens += n_tokens;
1142
1226
 
1143
1227
  // reserve output buffer
@@ -1177,7 +1261,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
1177
1261
  auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd();
1178
1262
 
1179
1263
  // extract logits
1180
- if (logits && t_logits) {
1264
+ if (logits && t_logits) {
1181
1265
  ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
1182
1266
  GGML_ASSERT(backend_res != nullptr);
1183
1267
  GGML_ASSERT(logits != nullptr);
@@ -1451,6 +1535,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
1451
1535
  embd_seq.clear();
1452
1536
  output_swaps.clear();
1453
1537
 
1538
+ sched_reserve();
1539
+
1454
1540
  bool did_optimize = false;
1455
1541
 
1456
1542
  // handle any pending shifts/copies
@@ -1955,7 +2041,9 @@ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
1955
2041
  return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
1956
2042
  }
1957
2043
  uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
1958
- res += model.n_lora_nodes;
2044
+ for (const auto & lora : model.loras) {
2045
+ res += lora->get_n_nodes();
2046
+ }
1959
2047
  return res;
1960
2048
  }
1961
2049
 
@@ -40,6 +40,14 @@ struct llama_context {
40
40
 
41
41
  ~llama_context();
42
42
 
43
+ // reserve a new backend scheduler (if needed)
44
+ // for example, when:
45
+ // - changing loras
46
+ // - changing samplers
47
+ // - changing attention type
48
+ // - etc.
49
+ void sched_reserve();
50
+
43
51
  void synchronize();
44
52
 
45
53
  const llama_model & get_model() const;
@@ -314,6 +322,8 @@ private:
314
322
 
315
323
  ggml_backend_sched_ptr sched;
316
324
 
325
+ bool sched_need_reserve = true;
326
+
317
327
  ggml_backend_t backend_cpu = nullptr;
318
328
  std::vector<ggml_backend_ptr> backends;
319
329
 
@@ -30,10 +30,12 @@ struct llama_cparams {
30
30
  bool causal_attn;
31
31
  bool offload_kqv;
32
32
  bool flash_attn;
33
+ bool auto_fa;
33
34
  bool no_perf;
34
35
  bool warmup;
35
36
  bool op_offload;
36
37
  bool kv_unified;
38
+ bool pipeline_parallel;
37
39
 
38
40
  enum llama_pooling_type pooling_type;
39
41
 
@@ -200,42 +200,6 @@ uint32_t llama_hparams::n_layer_kv() const {
200
200
  return res;
201
201
  }
202
202
 
203
- bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
204
- assert(p0 >= 0 && p1 >= 0);
205
-
206
- switch (swa_type) {
207
- case LLAMA_SWA_TYPE_NONE:
208
- {
209
- } break;
210
- case LLAMA_SWA_TYPE_STANDARD:
211
- {
212
- if (p1 - p0 >= (int32_t) n_swa) {
213
- return true;
214
- }
215
- } break;
216
- case LLAMA_SWA_TYPE_CHUNKED:
217
- {
218
- const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
219
-
220
- if (p0 < pos_chunk_start) {
221
- return true;
222
- }
223
- } break;
224
- case LLAMA_SWA_TYPE_SYMMETRIC:
225
- {
226
- const int32_t half_n_swa = (int32_t) n_swa / 2;
227
- const int32_t pos_diff = p1 - p0;
228
-
229
- // Mask if outside the symmetric window
230
- if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
231
- return true;
232
- }
233
- } break;
234
- }
235
-
236
- return false;
237
- }
238
-
239
203
  bool llama_hparams::use_mrope() const {
240
204
  return rope_sections[0] > 0 && rope_sections[1] > 0;
241
205
  }