@fugood/llama.node 0.3.9 → 0.3.11
This diff shows the changes between publicly released versions of this package as they appear in their public registry, and is provided for informational purposes only.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.js +2 -2
- package/lib/binding.ts +47 -8
- package/lib/index.js +21 -1
- package/lib/index.ts +31 -1
- package/package.json +12 -3
- package/src/LlamaCompletionWorker.cpp +33 -6
- package/src/LlamaCompletionWorker.h +3 -1
- package/src/LlamaContext.cpp +336 -28
- package/src/LlamaContext.h +2 -0
- package/src/common.hpp +19 -2
- package/src/llama.cpp/.github/workflows/build.yml +289 -107
- package/src/llama.cpp/.github/workflows/close-issue.yml +1 -1
- package/src/llama.cpp/.github/workflows/docker.yml +2 -1
- package/src/llama.cpp/.github/workflows/server.yml +25 -2
- package/src/llama.cpp/CMakeLists.txt +10 -19
- package/src/llama.cpp/cmake/build-info.cmake +1 -1
- package/src/llama.cpp/common/CMakeLists.txt +32 -0
- package/src/llama.cpp/common/arg.cpp +66 -16
- package/src/llama.cpp/common/chat-template.hpp +515 -0
- package/src/llama.cpp/common/chat.cpp +966 -0
- package/src/llama.cpp/common/chat.hpp +52 -0
- package/src/llama.cpp/common/common.cpp +159 -36
- package/src/llama.cpp/common/common.h +56 -14
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +46 -66
- package/src/llama.cpp/common/json-schema-to-grammar.h +15 -1
- package/src/llama.cpp/common/llguidance.cpp +270 -0
- package/src/llama.cpp/common/log.cpp +1 -10
- package/src/llama.cpp/common/log.h +10 -0
- package/src/llama.cpp/common/minja.hpp +2868 -0
- package/src/llama.cpp/common/sampling.cpp +22 -1
- package/src/llama.cpp/common/sampling.h +3 -0
- package/src/llama.cpp/docs/build.md +54 -9
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +12 -2
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +1 -1
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +59 -0
- package/src/llama.cpp/examples/llava/clip.cpp +133 -14
- package/src/llama.cpp/examples/llava/clip.h +2 -0
- package/src/llama.cpp/examples/llava/llava.cpp +22 -8
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +9 -1
- package/src/llama.cpp/examples/main/main.cpp +26 -25
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +136 -137
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +18 -4
- package/src/llama.cpp/examples/run/run.cpp +224 -69
- package/src/llama.cpp/examples/server/server.cpp +252 -81
- package/src/llama.cpp/examples/server/utils.hpp +73 -21
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +6 -4
- package/src/llama.cpp/examples/simple-cmake-pkg/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +78 -1
- package/src/llama.cpp/ggml/include/ggml.h +1 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +21 -4
- package/src/llama.cpp/ggml/src/ggml-alloc.c +1 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +91 -78
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +7 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +46 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +16 -1
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +28 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +5 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +33 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +1 -5
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +323 -121
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
- package/src/llama.cpp/ggml/src/ggml.c +23 -13
- package/src/llama.cpp/include/llama.h +14 -1
- package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +46 -0
- package/src/llama.cpp/src/CMakeLists.txt +1 -1
- package/src/llama.cpp/src/llama-arch.cpp +7 -2
- package/src/llama.cpp/src/llama-arch.h +3 -1
- package/src/llama.cpp/src/llama-chat.cpp +11 -2
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +86 -6
- package/src/llama.cpp/src/llama-grammar.h +22 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -1
- package/src/llama.cpp/src/llama-model.cpp +76 -6
- package/src/llama.cpp/src/llama-sampling.cpp +47 -4
- package/src/llama.cpp/src/llama-vocab.cpp +10 -4
- package/src/llama.cpp/src/llama.cpp +181 -123
- package/src/llama.cpp/tests/CMakeLists.txt +4 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +158 -57
- package/src/llama.cpp/tests/test-chat-template.cpp +154 -31
- package/src/llama.cpp/tests/test-chat.cpp +607 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +2 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +1140 -0
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +0 -32
package/src/llama.cpp/src/llama.cpp

@@ -4610,7 +4610,8 @@ struct llm_build_context {
                         ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
                 cb(k_pe, "k_pe", il);

-
+                // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
+                kv_compressed = ggml_cont(ctx0, kv_compressed);
                 kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
                         model.layers[il].attn_kv_a_norm, NULL,
                         LLM_NORM_RMS, cb, il);
@@ -6464,7 +6465,8 @@ struct llm_build_context {
                         ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
                 cb(k_pe, "k_pe", il);

-
+                // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
+                kv_compressed = ggml_cont(ctx0, kv_compressed);
                 kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
                         model.layers[il].attn_kv_a_norm, NULL,
                         LLM_NORM_RMS, cb, il);
@@ -7215,17 +7217,30 @@ struct llm_build_context {
                 struct ggml_tensor * Qcur = nullptr;
                 struct ggml_tensor * Kcur = nullptr;
                 struct ggml_tensor * Vcur = nullptr;
-
-
-
-
-
-
-
-
-
-
-
+                if (model.layers[il].wqkv == nullptr) {
+                    Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                    if (model.layers[il].bq) {
+                        Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    }
+                    Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                    if (model.layers[il].bk) {
+                        Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    }
+                    Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                    if (model.layers[il].bv) {
+                        Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    }
+                } else {
+                    cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
+                    cb(cur, "wqkv", il);
+                    if (model.layers[il].bqkv) {
+                        cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                        cb(cur, "bqkv", il);
+                    }
+                    Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                    Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                    Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+                }
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);
@@ -7700,17 +7715,13 @@ struct llm_build_context {
                     1
             );

+            struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att));
             ggml_build_forward_expand(
                 gf,
                 ggml_cpy(
                     ctx0,
-
-                    ggml_view_1d(
-                        ctx0,
-                        kv_self.v_l[il],
-                        hparams.n_embd_v_s() * n_seqs,
-                        hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il])
-                    )
+                    ggml_view_1d(ctx0, last_norm_att, n_embd * n_seqs, 0),
+                    ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il]))
                 )
             );

@@ -8432,74 +8443,33 @@ static enum ggml_status llama_graph_compute(
     return status;
 }

-
-
-
-
-//
-// - lctx: llama context
-// - batch: batch to evaluate
-//
-// return 0 on success
-// return positive int on warning
-// return negative int on error
-//
-static int llama_decode_impl(
-        llama_context & lctx,
-        llama_batch inp_batch) {
-
-    lctx.is_encoding = false;
-
-    if (inp_batch.n_tokens == 0) {
-        LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
-        return -1;
-    }
-
-    // temporary allocate memory for the input batch if needed
-    llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.max_pos() + 1);
-
-    const llama_batch & batch = batch_allocr.batch;
-    const uint32_t n_tokens_all = batch.n_tokens;
-
+static int llama_prepare_sbatch(
+        llama_context & lctx,
+        const llama_batch & batch,
+        uint32_t & n_outputs) {
     const auto & model = lctx.model;
-    const auto & vocab = model.vocab;
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;

-
+    const uint32_t n_tokens_all = batch.n_tokens;
+    const int64_t n_embd = hparams.n_embd;
+
+    // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
+    const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;

+    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
     if (batch.token) {
         for (uint32_t i = 0; i < n_tokens_all; ++i) {
-            if (batch.token[i] < 0 || (
+            if (batch.token[i] < 0 || uint32_t(batch.token[i]) >= model.vocab.n_tokens()) {
                 LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
                 return -1;
             }
         }
     }
-
     GGML_ASSERT(n_tokens_all <= cparams.n_batch);
-
     GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");

-    if (lctx.t_compute_start_us == 0) {
-        lctx.t_compute_start_us = ggml_time_us();
-    }
     lctx.n_queued_tokens += n_tokens_all;
-
-    auto & kv_self = lctx.kv_self;
-    llama_kv_slot_restorer kv_slot_restorer(kv_self);
-
-    const int64_t n_embd = hparams.n_embd;
-    const int64_t n_vocab = vocab.n_tokens();
-
-    uint32_t n_outputs = 0;
-    uint32_t n_outputs_prev = 0;
-
-    const auto n_ubatch = cparams.n_ubatch;
-
-    // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
-    const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
-
     lctx.embd_seq.clear();

     // count outputs
@@ -8515,7 +8485,7 @@ static int llama_decode_impl(
     }

     lctx.sbatch.from_batch(batch, n_embd,
-        /* simple_split */ !kv_self.recurrent,
+        /* simple_split */ !lctx.kv_self.recurrent,
        /* logits_all */ n_outputs == n_tokens_all);

     // reserve output buffer
@@ -8524,70 +8494,148 @@ static int llama_decode_impl(
         return -2;
     };

-
-
-
-
-
-
-
-
-
-
-
+    return 0;
+}
+
+static int llama_prepare_ubatch(
+        llama_context & lctx,
+        llama_kv_slot_restorer & kv_slot_restorer,
+        llama_ubatch & ubatch,
+        const uint32_t n_outputs,
+        const uint32_t n_tokens_all) {
+    GGML_ASSERT(lctx.sbatch.n_tokens > 0);
+
+    auto & kv_self = lctx.kv_self;
+    const auto & cparams = lctx.cparams;
+    const auto & hparams = lctx.model.hparams;
+
+    // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
+    const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
+
+    if (lctx.kv_self.recurrent) {
+        if (embd_pooled) {
+            // Pooled embeddings cannot be split across ubatches (yet)
+            ubatch = lctx.sbatch.split_seq(cparams.n_ubatch);
         } else {
-
+            // recurrent model architectures are easier to implement
+            // with equal-length sequences
+            ubatch = lctx.sbatch.split_equal(cparams.n_ubatch);
         }
-
+    } else {
+        ubatch = lctx.sbatch.split_simple(cparams.n_ubatch);
+    }

-
-
-
+    // count the outputs in this u_batch
+    {
+        int32_t n_outputs_new = 0;

-
-
-
-
-
-
-        }
+        if (n_outputs == n_tokens_all) {
+            n_outputs_new = ubatch.n_tokens;
+        } else {
+            GGML_ASSERT(ubatch.output);
+            for (uint32_t i = 0; i < ubatch.n_tokens; i++) {
+                n_outputs_new += int32_t(ubatch.output[i] != 0);
             }
+        }

-
-
+        // needs to happen before the graph is built
+        lctx.n_outputs = n_outputs_new;
+    }
+
+    // non-causal masks do not use the KV cache
+    if (hparams.causal_attn) {
+        llama_kv_cache_update(&lctx);
+
+        // if we have enough unused cells before the current head ->
+        // better to start searching from the beginning of the cache, hoping to fill it
+        if (kv_self.head > kv_self.used + 2*ubatch.n_tokens) {
+            kv_self.head = 0;
         }

-
-
+        const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+        if (!slot) {
+            return 1;
+        }
+        kv_slot_restorer.save(slot);
+
+        if (!kv_self.recurrent) {
+            // a heuristic, to avoid attending the full cache if it is not yet utilized
+            // after enough generations, the benefit from this heuristic disappears
+            // if we start defragmenting the cache, the benefit from this will be more important
+            const uint32_t pad = llama_kv_cache_get_padding(cparams);
+            kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
+            //kv_self.n = llama_kv_cache_cell_max(kv_self);
+        }
+    }

-
+    return 0;
+}

-
-
-
+// decode a batch of tokens by evaluating the transformer
+// in case of unsuccessful decoding (error or warning),
+// the kv_cache state will be returned to its original state
+// (for non-recurrent models) or cleaned (for recurrent models)
+//
+// - lctx: llama context
+// - inp_batch: batch to evaluate
+//
+// return 0 on success
+// return positive int on warning
+// return negative int on error
+//
+static int llama_decode_impl(
+        llama_context & lctx,
+        llama_batch inp_batch) {

-
-            // better to start searching from the beginning of the cache, hoping to fill it
-            if (kv_self.head > kv_self.used + 2*n_tokens) {
-                kv_self.head = 0;
-            }
+    lctx.is_encoding = false;

-
-
-
-
-
+    if (inp_batch.n_tokens == 0) {
+        LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
+        return -1;
+    }
+
+    // temporarily allocate memory for the input batch if needed
+    llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.max_pos() + 1);
+    const llama_batch & batch = batch_allocr.batch;
+
+    const auto & model = lctx.model;
+    const auto & vocab = model.vocab;
+    const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
+
+    if (lctx.t_compute_start_us == 0) {
+        lctx.t_compute_start_us = ggml_time_us();
+    }
+    auto & kv_self = lctx.kv_self;
+    llama_kv_slot_restorer kv_slot_restorer(kv_self);
+
+    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_vocab = vocab.n_tokens();

-
-
-
-
-
-
-
+    uint32_t n_outputs = 0;
+    uint32_t n_outputs_prev = 0;
+
+    {
+        const int ret = llama_prepare_sbatch(lctx, batch, n_outputs);
+        if (ret != 0) {
+            return ret;
+        }
+    }
+
+    while (lctx.sbatch.n_tokens > 0) {
+        llama_ubatch ubatch;
+        {
+            const int ret = llama_prepare_ubatch(lctx, kv_slot_restorer, ubatch, n_outputs, batch.n_tokens);
+            if (ret != 0) {
+                return ret;
             }
         }

+        const int n_threads = ubatch.n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+        ggml_threadpool_t threadpool = ubatch.n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
+
+        GGML_ASSERT(n_threads > 0);
+
         //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);

         ggml_backend_sched_reset(lctx.sched.get());
@@ -8640,7 +8688,7 @@ static int llama_decode_impl(

         // update the kv ring buffer
         {
-            kv_self.head += n_tokens;
+            kv_self.head += ubatch.n_tokens;

             // Ensure kv cache head points to a valid index.
             if (kv_self.head >= kv_self.size) {
@@ -9405,6 +9453,7 @@ static struct llama_model * llama_model_load_from_file_impl(
             model->devices.push_back(*dev);
         }
     } else {
+        std::vector<ggml_backend_dev_t> rpc_servers;
         // use all available devices
         for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
             ggml_backend_dev_t dev = ggml_backend_dev_get(i);
@@ -9415,10 +9464,19 @@ static struct llama_model * llama_model_load_from_file_impl(
                     break;

                 case GGML_BACKEND_DEVICE_TYPE_GPU:
-
+                    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+                    if (ggml_backend_reg_name(reg) == std::string("RPC")) {
+                        rpc_servers.push_back(dev);
+                    } else {
+                        model->devices.push_back(dev);
+                    }
                     break;
             }
         }
+        // add RPC servers at the front of the list
+        if (!rpc_servers.empty()) {
+            model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+        }
     }

     // if using single GPU mode, remove all except the main GPU
package/src/llama.cpp/tests/CMakeLists.txt

@@ -86,6 +86,9 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)

+if (LLAMA_LLGUIDANCE)
+    llama_target_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+endif ()

 if (NOT WIN32)
     # these tests are disabled on Windows because they use internal functions not exported with LLAMA_API
@@ -93,6 +96,7 @@ if (NOT WIN32)
     llama_target_and_test(test-grammar-parser.cpp)
     llama_target_and_test(test-grammar-integration.cpp)
     llama_target_and_test(test-llama-grammar.cpp)
+    llama_target_and_test(test-chat.cpp)
     # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
     if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
         llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)