@fugood/llama.node 0.3.8 → 0.3.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.js +2 -2
  18. package/lib/binding.ts +52 -8
  19. package/lib/index.ts +3 -1
  20. package/package.json +8 -1
  21. package/src/LlamaCompletionWorker.cpp +33 -6
  22. package/src/LlamaCompletionWorker.h +3 -1
  23. package/src/LlamaContext.cpp +387 -28
  24. package/src/LlamaContext.h +5 -0
  25. package/src/common.hpp +19 -2
  26. package/src/llama.cpp/.github/workflows/build.yml +289 -107
  27. package/src/llama.cpp/.github/workflows/close-issue.yml +1 -1
  28. package/src/llama.cpp/.github/workflows/docker.yml +2 -1
  29. package/src/llama.cpp/.github/workflows/server.yml +25 -2
  30. package/src/llama.cpp/CMakeLists.txt +10 -19
  31. package/src/llama.cpp/cmake/build-info.cmake +1 -1
  32. package/src/llama.cpp/common/CMakeLists.txt +32 -0
  33. package/src/llama.cpp/common/arg.cpp +66 -16
  34. package/src/llama.cpp/common/chat-template.hpp +515 -0
  35. package/src/llama.cpp/common/chat.cpp +966 -0
  36. package/src/llama.cpp/common/chat.hpp +52 -0
  37. package/src/llama.cpp/common/common.cpp +159 -36
  38. package/src/llama.cpp/common/common.h +56 -14
  39. package/src/llama.cpp/common/json-schema-to-grammar.cpp +46 -66
  40. package/src/llama.cpp/common/json-schema-to-grammar.h +15 -1
  41. package/src/llama.cpp/common/llguidance.cpp +270 -0
  42. package/src/llama.cpp/common/log.cpp +1 -10
  43. package/src/llama.cpp/common/log.h +10 -0
  44. package/src/llama.cpp/common/minja.hpp +2868 -0
  45. package/src/llama.cpp/common/sampling.cpp +22 -1
  46. package/src/llama.cpp/common/sampling.h +3 -0
  47. package/src/llama.cpp/docs/build.md +54 -9
  48. package/src/llama.cpp/examples/export-lora/export-lora.cpp +12 -2
  49. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +1 -1
  50. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  51. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +59 -0
  52. package/src/llama.cpp/examples/llava/clip.cpp +133 -14
  53. package/src/llama.cpp/examples/llava/clip.h +2 -0
  54. package/src/llama.cpp/examples/llava/llava.cpp +22 -8
  55. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +9 -1
  56. package/src/llama.cpp/examples/main/main.cpp +26 -25
  57. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +136 -137
  58. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +18 -4
  59. package/src/llama.cpp/examples/run/run.cpp +224 -69
  60. package/src/llama.cpp/examples/server/server.cpp +252 -81
  61. package/src/llama.cpp/examples/server/utils.hpp +73 -21
  62. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +6 -4
  63. package/src/llama.cpp/examples/simple-cmake-pkg/CMakeLists.txt +11 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +78 -1
  65. package/src/llama.cpp/ggml/include/ggml.h +1 -1
  66. package/src/llama.cpp/ggml/src/CMakeLists.txt +21 -4
  67. package/src/llama.cpp/ggml/src/ggml-alloc.c +1 -13
  68. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +91 -78
  69. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +7 -7
  70. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -1
  71. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
  72. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +46 -0
  73. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +16 -1
  74. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +28 -8
  76. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +5 -7
  77. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +33 -23
  78. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +1 -5
  79. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +323 -121
  80. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
  81. package/src/llama.cpp/ggml/src/ggml.c +23 -13
  82. package/src/llama.cpp/include/llama.h +14 -1
  83. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +112 -0
  84. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +46 -0
  85. package/src/llama.cpp/src/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/src/llama-arch.cpp +7 -2
  87. package/src/llama.cpp/src/llama-arch.h +3 -1
  88. package/src/llama.cpp/src/llama-chat.cpp +11 -2
  89. package/src/llama.cpp/src/llama-chat.h +1 -0
  90. package/src/llama.cpp/src/llama-grammar.cpp +86 -6
  91. package/src/llama.cpp/src/llama-grammar.h +22 -1
  92. package/src/llama.cpp/src/llama-mmap.cpp +1 -0
  93. package/src/llama.cpp/src/llama-model-loader.cpp +1 -1
  94. package/src/llama.cpp/src/llama-model.cpp +76 -6
  95. package/src/llama.cpp/src/llama-sampling.cpp +47 -4
  96. package/src/llama.cpp/src/llama-vocab.cpp +10 -4
  97. package/src/llama.cpp/src/llama.cpp +181 -123
  98. package/src/llama.cpp/tests/CMakeLists.txt +4 -0
  99. package/src/llama.cpp/tests/test-backend-ops.cpp +158 -57
  100. package/src/llama.cpp/tests/test-chat-template.cpp +154 -31
  101. package/src/llama.cpp/tests/test-chat.cpp +607 -0
  102. package/src/llama.cpp/tests/test-grammar-integration.cpp +2 -2
  103. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +1140 -0
  104. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -1
  105. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +0 -32
@@ -4610,7 +4610,8 @@ struct llm_build_context {
  ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
  cb(k_pe, "k_pe", il);

- kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
+ // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
+ kv_compressed = ggml_cont(ctx0, kv_compressed);
  kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
  model.layers[il].attn_kv_a_norm, NULL,
  LLM_NORM_RMS, cb, il);
@@ -6464,7 +6465,8 @@ struct llm_build_context {
  ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
  cb(k_pe, "k_pe", il);

- kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
+ // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
+ kv_compressed = ggml_cont(ctx0, kv_compressed);
  kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
  model.layers[il].attn_kv_a_norm, NULL,
  LLM_NORM_RMS, cb, il);
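Both hunks above keep the same pattern: the compressed KV tensor can be a non-contiguous view, so it is made contiguous before the RMS norm runs; only the wording of the TODO changes. Below is a minimal sketch of that contiguity-before-norm pattern against the public ggml API; the helper name rms_norm_contiguous and the epsilon value are illustrative, not part of the diff.

#include "ggml.h"

// Copy a (possibly strided) view into contiguous memory, then apply RMS norm.
// Mirrors the ggml_cont(...) + llm_build_norm(..., LLM_NORM_RMS, ...) sequence above.
static struct ggml_tensor * rms_norm_contiguous(struct ggml_context * ctx,
                                                struct ggml_tensor  * view,
                                                float                 eps) {
    struct ggml_tensor * cont = ggml_cont(ctx, view); // materialize the view
    return ggml_rms_norm(ctx, cont, eps);             // e.g. eps = 1e-5f
}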
@@ -7215,17 +7217,30 @@ struct llm_build_context {
  struct ggml_tensor * Qcur = nullptr;
  struct ggml_tensor * Kcur = nullptr;
  struct ggml_tensor * Vcur = nullptr;
-
- cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
-
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
-
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
-
+ if (model.layers[il].wqkv == nullptr) {
+ Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ }
+ Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ }
+ Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ }
+ } else {
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+ if (model.layers[il].bqkv) {
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+ }
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ }
  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);
@@ -7700,17 +7715,13 @@ struct llm_build_context {
  1
  );

+ struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att));
  ggml_build_forward_expand(
  gf,
  ggml_cpy(
  ctx0,
- wkv_states,
- ggml_view_1d(
- ctx0,
- kv_self.v_l[il],
- hparams.n_embd_v_s() * n_seqs,
- hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il])
- )
+ ggml_view_1d(ctx0, last_norm_att, n_embd * n_seqs, 0),
+ ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il]))
  )
  );

@@ -8432,74 +8443,33 @@ static enum ggml_status llama_graph_compute(
  return status;
  }

- // decode a batch of tokens by evaluating the transformer
- // in case of unsuccessful decoding (error or warning),
- // the kv_cache state will be returned to its original state
- // (for non-recurrent models) or cleaned (for recurrent models)
- //
- // - lctx: llama context
- // - batch: batch to evaluate
- //
- // return 0 on success
- // return positive int on warning
- // return negative int on error
- //
- static int llama_decode_impl(
- llama_context & lctx,
- llama_batch inp_batch) {
-
- lctx.is_encoding = false;
-
- if (inp_batch.n_tokens == 0) {
- LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
- return -1;
- }
-
- // temporary allocate memory for the input batch if needed
- llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.max_pos() + 1);
-
- const llama_batch & batch = batch_allocr.batch;
- const uint32_t n_tokens_all = batch.n_tokens;
-
+ static int llama_prepare_sbatch(
+ llama_context & lctx,
+ const llama_batch & batch,
+ uint32_t & n_outputs) {
  const auto & model = lctx.model;
- const auto & vocab = model.vocab;
  const auto & hparams = model.hparams;
  const auto & cparams = lctx.cparams;

- GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
+ const uint32_t n_tokens_all = batch.n_tokens;
+ const int64_t n_embd = hparams.n_embd;
+
+ // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
+ const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;

+ GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
  if (batch.token) {
  for (uint32_t i = 0; i < n_tokens_all; ++i) {
- if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
+ if (batch.token[i] < 0 || uint32_t(batch.token[i]) >= model.vocab.n_tokens()) {
  LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
  return -1;
  }
  }
  }
-
  GGML_ASSERT(n_tokens_all <= cparams.n_batch);
-
  GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");

- if (lctx.t_compute_start_us == 0) {
- lctx.t_compute_start_us = ggml_time_us();
- }
  lctx.n_queued_tokens += n_tokens_all;
-
- auto & kv_self = lctx.kv_self;
- llama_kv_slot_restorer kv_slot_restorer(kv_self);
-
- const int64_t n_embd = hparams.n_embd;
- const int64_t n_vocab = vocab.n_tokens();
-
- uint32_t n_outputs = 0;
- uint32_t n_outputs_prev = 0;
-
- const auto n_ubatch = cparams.n_ubatch;
-
- // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
- const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
-
  lctx.embd_seq.clear();

  // count outputs
@@ -8515,7 +8485,7 @@ static int llama_decode_impl(
  }

  lctx.sbatch.from_batch(batch, n_embd,
- /* simple_split */ !kv_self.recurrent,
+ /* simple_split */ !lctx.kv_self.recurrent,
  /* logits_all */ n_outputs == n_tokens_all);

  // reserve output buffer
@@ -8524,70 +8494,148 @@ static int llama_decode_impl(
  return -2;
  };

- while (lctx.sbatch.n_tokens > 0) {
- llama_ubatch ubatch;
- if (kv_self.recurrent) {
- if (embd_pooled) {
- // Pooled embeddings cannot be split across ubatches (yet)
- ubatch = lctx.sbatch.split_seq(n_ubatch);
- } else {
- // recurrent model architectures are easier to implement
- // with equal-length sequences
- ubatch = lctx.sbatch.split_equal(n_ubatch);
- }
+ return 0;
+ }
+
+ static int llama_prepare_ubatch(
+ llama_context & lctx,
+ llama_kv_slot_restorer & kv_slot_restorer,
+ llama_ubatch & ubatch,
+ const uint32_t n_outputs,
+ const uint32_t n_tokens_all) {
+ GGML_ASSERT(lctx.sbatch.n_tokens > 0);
+
+ auto & kv_self = lctx.kv_self;
+ const auto & cparams = lctx.cparams;
+ const auto & hparams = lctx.model.hparams;
+
+ // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
+ const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
+
+ if (lctx.kv_self.recurrent) {
+ if (embd_pooled) {
+ // Pooled embeddings cannot be split across ubatches (yet)
+ ubatch = lctx.sbatch.split_seq(cparams.n_ubatch);
  } else {
- ubatch = lctx.sbatch.split_simple(n_ubatch);
+ // recurrent model architectures are easier to implement
+ // with equal-length sequences
+ ubatch = lctx.sbatch.split_equal(cparams.n_ubatch);
  }
- const uint32_t n_tokens = ubatch.n_tokens;
+ } else {
+ ubatch = lctx.sbatch.split_simple(cparams.n_ubatch);
+ }

- // count the outputs in this u_batch
- {
- int32_t n_outputs_new = 0;
+ // count the outputs in this u_batch
+ {
+ int32_t n_outputs_new = 0;

- if (n_outputs == n_tokens_all) {
- n_outputs_new = n_tokens;
- } else {
- GGML_ASSERT(ubatch.output);
- for (uint32_t i = 0; i < n_tokens; i++) {
- n_outputs_new += (int32_t) (ubatch.output[i] != 0);
- }
+ if (n_outputs == n_tokens_all) {
+ n_outputs_new = ubatch.n_tokens;
+ } else {
+ GGML_ASSERT(ubatch.output);
+ for (uint32_t i = 0; i < ubatch.n_tokens; i++) {
+ n_outputs_new += int32_t(ubatch.output[i] != 0);
  }
+ }

- // needs to happen before the graph is built
- lctx.n_outputs = n_outputs_new;
+ // needs to happen before the graph is built
+ lctx.n_outputs = n_outputs_new;
+ }
+
+ // non-causal masks do not use the KV cache
+ if (hparams.causal_attn) {
+ llama_kv_cache_update(&lctx);
+
+ // if we have enough unused cells before the current head ->
+ // better to start searching from the beginning of the cache, hoping to fill it
+ if (kv_self.head > kv_self.used + 2*ubatch.n_tokens) {
+ kv_self.head = 0;
  }

- int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
- ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
+ const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ if (!slot) {
+ return 1;
+ }
+ kv_slot_restorer.save(slot);
+
+ if (!kv_self.recurrent) {
+ // a heuristic, to avoid attending the full cache if it is not yet utilized
+ // after enough generations, the benefit from this heuristic disappears
+ // if we start defragmenting the cache, the benefit from this will be more important
+ const uint32_t pad = llama_kv_cache_get_padding(cparams);
+ kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
+ //kv_self.n = llama_kv_cache_cell_max(kv_self);
+ }
+ }

- GGML_ASSERT(n_threads > 0);
+ return 0;
+ }

- // non-causal masks do not use the KV cache
- if (hparams.causal_attn) {
- llama_kv_cache_update(&lctx);
+ // decode a batch of tokens by evaluating the transformer
+ // in case of unsuccessful decoding (error or warning),
+ // the kv_cache state will be returned to its original state
+ // (for non-recurrent models) or cleaned (for recurrent models)
+ //
+ // - lctx: llama context
+ // - inp_batch: batch to evaluate
+ //
+ // return 0 on success
+ // return positive int on warning
+ // return negative int on error
+ //
+ static int llama_decode_impl(
+ llama_context & lctx,
+ llama_batch inp_batch) {

- // if we have enough unused cells before the current head ->
- // better to start searching from the beginning of the cache, hoping to fill it
- if (kv_self.head > kv_self.used + 2*n_tokens) {
- kv_self.head = 0;
- }
+ lctx.is_encoding = false;

- const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
- if (!slot) {
- return 1;
- }
- kv_slot_restorer.save(slot);
+ if (inp_batch.n_tokens == 0) {
+ LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
+ return -1;
+ }
+
+ // temporarily allocate memory for the input batch if needed
+ llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.max_pos() + 1);
+ const llama_batch & batch = batch_allocr.batch;
+
+ const auto & model = lctx.model;
+ const auto & vocab = model.vocab;
+ const auto & hparams = model.hparams;
+ const auto & cparams = lctx.cparams;
+
+ if (lctx.t_compute_start_us == 0) {
+ lctx.t_compute_start_us = ggml_time_us();
+ }
+ auto & kv_self = lctx.kv_self;
+ llama_kv_slot_restorer kv_slot_restorer(kv_self);
+
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_vocab = vocab.n_tokens();

- if (!kv_self.recurrent) {
- // a heuristic, to avoid attending the full cache if it is not yet utilized
- // after enough generations, the benefit from this heuristic disappears
- // if we start defragmenting the cache, the benefit from this will be more important
- const uint32_t pad = llama_kv_cache_get_padding(cparams);
- kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
- //kv_self.n = llama_kv_cache_cell_max(kv_self);
+ uint32_t n_outputs = 0;
+ uint32_t n_outputs_prev = 0;
+
+ {
+ const int ret = llama_prepare_sbatch(lctx, batch, n_outputs);
+ if (ret != 0) {
+ return ret;
+ }
+ }
+
+ while (lctx.sbatch.n_tokens > 0) {
+ llama_ubatch ubatch;
+ {
+ const int ret = llama_prepare_ubatch(lctx, kv_slot_restorer, ubatch, n_outputs, batch.n_tokens);
+ if (ret != 0) {
+ return ret;
  }
  }

+ const int n_threads = ubatch.n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+ ggml_threadpool_t threadpool = ubatch.n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
+
+ GGML_ASSERT(n_threads > 0);
+
  //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);

  ggml_backend_sched_reset(lctx.sched.get());
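The refactor above splits llama_decode_impl into llama_prepare_sbatch and llama_prepare_ubatch while keeping the documented contract: 0 on success, a positive value on warning (e.g. no free KV cache slot), a negative value on error, with the KV cache state restored or cleared on failure. A minimal caller-side sketch of that contract through the public llama_decode API follows; the helper name decode_checked is illustrative, not part of the diff.

#include "llama.h"

// Returns true only when the batch was decoded; distinguishes the documented
// warning (ret > 0, e.g. no KV cache slot for the batch) from a hard error (ret < 0).
static bool decode_checked(llama_context * ctx, llama_batch batch) {
    const int32_t ret = llama_decode(ctx, batch);
    if (ret > 0) {
        // warning: the caller might shrink the batch or free KV cache space and retry
        return false;
    }
    if (ret < 0) {
        // error: abort this generation; the KV cache state was restored/cleared by the library
        return false;
    }
    return true; // success: logits/embeddings for the requested tokens are available
}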
@@ -8640,7 +8688,7 @@ static int llama_decode_impl(

  // update the kv ring buffer
  {
- kv_self.head += n_tokens;
+ kv_self.head += ubatch.n_tokens;

  // Ensure kv cache head points to a valid index.
  if (kv_self.head >= kv_self.size) {
@@ -9405,6 +9453,7 @@ static struct llama_model * llama_model_load_from_file_impl(
  model->devices.push_back(*dev);
  }
  } else {
+ std::vector<ggml_backend_dev_t> rpc_servers;
  // use all available devices
  for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
  ggml_backend_dev_t dev = ggml_backend_dev_get(i);
@@ -9415,10 +9464,19 @@ static struct llama_model * llama_model_load_from_file_impl(
  break;

  case GGML_BACKEND_DEVICE_TYPE_GPU:
- model->devices.push_back(dev);
+ ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+ if (ggml_backend_reg_name(reg) == std::string("RPC")) {
+ rpc_servers.push_back(dev);
+ } else {
+ model->devices.push_back(dev);
+ }
  break;
  }
  }
+ // add RPC servers at the front of the list
+ if (!rpc_servers.empty()) {
+ model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+ }
  }

  // if using single GPU mode, remove all except the main GPU
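The change above routes GPU-type devices whose backend registry is named "RPC" into a separate list and inserts them at the front of model->devices. A standalone sketch of the same enumeration with the public ggml-backend API is shown here; the helper name ordered_gpu_devices is illustrative.

#include <string>
#include <vector>
#include "ggml-backend.h"

// Enumerate GPU devices and place RPC-backed ones before local GPUs,
// matching the ordering introduced in the hunk above.
static std::vector<ggml_backend_dev_t> ordered_gpu_devices() {
    std::vector<ggml_backend_dev_t> rpc, local;
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
            continue; // CPU and accelerator devices are handled elsewhere
        }
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
        if (ggml_backend_reg_name(reg) == std::string("RPC")) {
            rpc.push_back(dev);   // device exposed by a remote RPC server
        } else {
            local.push_back(dev); // local GPU backend (CUDA, Vulkan, ...)
        }
    }
    rpc.insert(rpc.end(), local.begin(), local.end());
    return rpc;
}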
@@ -86,6 +86,9 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE
  llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
  llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)

+ if (LLAMA_LLGUIDANCE)
+ llama_target_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+ endif ()

  if (NOT WIN32)
  # these tests are disabled on Windows because they use internal functions not exported with LLAMA_API
@@ -93,6 +96,7 @@ if (NOT WIN32)
  llama_target_and_test(test-grammar-parser.cpp)
  llama_target_and_test(test-grammar-integration.cpp)
  llama_target_and_test(test-llama-grammar.cpp)
+ llama_target_and_test(test-chat.cpp)
  # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
  if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
  llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)