@fugood/llama.node 0.3.15 → 0.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203)
  1. package/CMakeLists.txt +3 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +5 -0
  19. package/package.json +1 -1
  20. package/src/LlamaCompletionWorker.cpp +8 -0
  21. package/src/LlamaCompletionWorker.h +1 -0
  22. package/src/LlamaContext.cpp +3 -2
  23. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +70 -27
  25. package/src/llama.cpp/.github/workflows/docker.yml +6 -6
  26. package/src/llama.cpp/.github/workflows/server.yml +7 -11
  27. package/src/llama.cpp/CMakeLists.txt +23 -1
  28. package/src/llama.cpp/common/CMakeLists.txt +6 -3
  29. package/src/llama.cpp/common/arg.cpp +809 -105
  30. package/src/llama.cpp/common/arg.h +9 -0
  31. package/src/llama.cpp/common/chat.cpp +1 -1
  32. package/src/llama.cpp/common/common.cpp +31 -521
  33. package/src/llama.cpp/common/common.h +17 -36
  34. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  35. package/src/llama.cpp/common/llguidance.cpp +30 -47
  36. package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
  37. package/src/llama.cpp/common/minja/minja.hpp +119 -93
  38. package/src/llama.cpp/common/sampling.cpp +3 -0
  39. package/src/llama.cpp/docs/build.md +122 -7
  40. package/src/llama.cpp/examples/CMakeLists.txt +0 -9
  41. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
  43. package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
  44. package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
  45. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
  46. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
  48. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
  50. package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
  51. package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
  52. package/src/llama.cpp/examples/llava/clip.h +39 -22
  53. package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
  54. package/src/llama.cpp/examples/llava/llava.cpp +64 -52
  55. package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
  56. package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
  57. package/src/llama.cpp/examples/llava/mtmd.h +168 -0
  58. package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
  59. package/src/llama.cpp/examples/main/main.cpp +16 -5
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
  64. package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
  65. package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
  66. package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
  67. package/src/llama.cpp/examples/run/run.cpp +14 -28
  68. package/src/llama.cpp/examples/server/httplib.h +313 -247
  69. package/src/llama.cpp/examples/server/server.cpp +243 -139
  70. package/src/llama.cpp/examples/server/utils.hpp +51 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  74. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  75. package/src/llama.cpp/examples/tts/tts.cpp +14 -9
  76. package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
  77. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  78. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  79. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  80. package/src/llama.cpp/ggml/include/ggml.h +66 -99
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -8
  82. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  83. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  84. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  85. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  87. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  88. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  89. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  90. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
  91. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  93. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2413 -228
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1004 -13516
  99. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
  101. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
  102. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
  103. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
  105. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  106. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  107. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  108. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  109. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
  110. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  111. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  112. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  114. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  115. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
  116. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  117. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
  118. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
  119. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +127 -33
  120. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  124. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +29 -293
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +12 -43
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +210 -286
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  136. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  137. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  138. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  139. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
  140. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +692 -126
  141. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +21 -10
  143. package/src/llama.cpp/ggml/src/ggml.c +141 -245
  144. package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
  145. package/src/llama.cpp/include/llama.h +30 -11
  146. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  147. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  148. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  149. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  150. package/src/llama.cpp/requirements/requirements-all.txt +2 -0
  151. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  152. package/src/llama.cpp/src/CMakeLists.txt +3 -2
  153. package/src/llama.cpp/src/llama-adapter.cpp +37 -1
  154. package/src/llama.cpp/src/llama-arch.cpp +161 -17
  155. package/src/llama.cpp/src/llama-arch.h +16 -0
  156. package/src/llama.cpp/src/llama-chat.cpp +82 -17
  157. package/src/llama.cpp/src/llama-chat.h +6 -2
  158. package/src/llama.cpp/src/llama-context.cpp +108 -92
  159. package/src/llama.cpp/src/llama-context.h +1 -2
  160. package/src/llama.cpp/src/llama-graph.cpp +189 -119
  161. package/src/llama.cpp/src/llama-graph.h +26 -6
  162. package/src/llama.cpp/src/llama-hparams.h +13 -0
  163. package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
  164. package/src/llama.cpp/src/llama-kv-cache.h +41 -115
  165. package/src/llama.cpp/src/llama-memory.h +1 -1
  166. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  167. package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
  168. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  169. package/src/llama.cpp/src/llama-model.cpp +1544 -291
  170. package/src/llama.cpp/src/llama-model.h +13 -1
  171. package/src/llama.cpp/src/llama-quant.cpp +29 -8
  172. package/src/llama.cpp/src/llama-sampling.cpp +7 -1
  173. package/src/llama.cpp/src/llama-vocab.cpp +44 -6
  174. package/src/llama.cpp/src/llama.cpp +1 -1
  175. package/src/llama.cpp/tests/CMakeLists.txt +43 -30
  176. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  177. package/src/llama.cpp/tests/test-backend-ops.cpp +139 -57
  178. package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
  179. package/src/llama.cpp/tests/test-chat.cpp +12 -2
  180. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  181. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  182. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  183. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  184. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  185. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  186. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  187. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  188. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  189. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  190. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  191. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  192. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  193. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  194. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  195. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  196. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  197. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  198. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  199. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  200. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  201. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  202. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  203. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
package/src/llama.cpp/src/llama-context.cpp

@@ -10,6 +10,7 @@
 #include <cstring>
 #include <stdexcept>
 #include <cinttypes>
+#include <cmath>
 
 //
 // llama_context
@@ -113,7 +114,7 @@ llama_context::llama_context(
     }
 
     if (n_ctx_per_seq > hparams.n_ctx_train) {
-        LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
+        LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
                 __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }
 
@@ -255,7 +256,8 @@ llama_context::llama_context(
             model.n_devices() > 1 &&
             model.params.n_gpu_layers > (int) model.hparams.n_layer &&
             model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
-            cparams.offload_kqv;
+            cparams.offload_kqv &&
+            !model.has_tensor_overrides();
 
         // pipeline parallelism requires support for async compute and events in all devices
         if (pipeline_parallel) {
@@ -294,10 +296,7 @@ llama_context::llama_context(
         // TODO: something cleaner
         const auto n_outputs_save = n_outputs;
 
-        // max number of outputs
-        n_outputs = n_tokens;
-
-        LLAMA_LOG_DEBUG("%s: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
+        LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
 
         int n_splits_pp = -1;
         int n_nodes_pp  = -1;
@@ -313,8 +312,15 @@ llama_context::llama_context(
         // reserve pp graph first so that buffers are only allocated once
         {
             llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
+
+            // max number of outputs
+            n_outputs = ubatch_pp.n_tokens;
+
+            LLAMA_LOG_DEBUG("%s: reserving graph for n_tokens = %d, n_seqs = %d\n", __func__, ubatch_pp.n_tokens, ubatch_pp.n_seqs);
+
             auto * gf = graph_init();
             graph_build(ctx_compute.get(), gf, ubatch_pp, LLM_GRAPH_TYPE_DEFAULT);
+
             if (!ggml_backend_sched_reserve(sched.get(), gf)) {
                 throw std::runtime_error("failed to allocate compute pp buffers");
             }
@@ -326,11 +332,18 @@ llama_context::llama_context(
         // reserve with tg graph to get the number of splits and nodes
        {
             llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
+
+            n_outputs = ubatch_tg.n_tokens;
+
+            LLAMA_LOG_DEBUG("%s: reserving graph for n_tokens = %d, n_seqs = %d\n", __func__, ubatch_tg.n_tokens, ubatch_tg.n_seqs);
+
             auto * gf = graph_init();
             graph_build(ctx_compute.get(), gf, ubatch_tg, LLM_GRAPH_TYPE_DEFAULT);
+
             if (!ggml_backend_sched_reserve(sched.get(), gf)) {
                 throw std::runtime_error("failed to allocate compute tg buffers");
             }
+
             n_splits_tg = ggml_backend_sched_get_n_splits(sched.get());
             n_nodes_tg  = ggml_graph_n_nodes(gf);
         }
@@ -338,8 +351,14 @@ llama_context::llama_context(
         // reserve again with pp graph to avoid ggml-alloc reallocations during inference
         {
             llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
+
+            n_outputs = ubatch_pp.n_tokens;
+
+            LLAMA_LOG_DEBUG("%s: reserving graph for n_tokens = %d, n_seqs = %d\n", __func__, ubatch_pp.n_tokens, ubatch_pp.n_seqs);
+
             auto * gf = graph_init();
             graph_build(ctx_compute.get(), gf, ubatch_pp, LLM_GRAPH_TYPE_DEFAULT);
+
             if (!ggml_backend_sched_reserve(sched.get(), gf)) {
                 throw std::runtime_error("failed to allocate compute pp buffers");
             }
@@ -450,12 +469,10 @@ ggml_tensor * llama_context::build_rope_shift(
         ggml_tensor * shift,
         ggml_tensor * factors,
               float   freq_base,
-              float   freq_scale,
-        ggml_backend_buffer * bbuf) const {
+              float   freq_scale) const {
     const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
 
     const auto & yarn_ext_factor  = cparams.yarn_ext_factor;
-    const auto & yarn_attn_factor = cparams.yarn_attn_factor;
     const auto & yarn_beta_fast   = cparams.yarn_beta_fast;
     const auto & yarn_beta_slow   = cparams.yarn_beta_slow;
 
@@ -464,23 +481,17 @@ ggml_tensor * llama_context::build_rope_shift(
     const auto & n_rot     = hparams.n_rot;
     const auto & rope_type = hparams.rope_type;
 
+    // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
+    // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
+    const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor;
+
     ggml_tensor * tmp;
 
     if (ggml_is_quantized(cur->type)) {
         // dequantize to f32 -> RoPE -> quantize back
         tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32);
 
-        if (bbuf) {
-            for (const auto & backend : backends) {
-                // Figure out which backend KV cache belongs to
-                if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) {
-                    ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get());
-                    break;
-                }
-            }
-        }
-
-        tmp = ggml_rope_ext_inplace(ctx0, tmp,
+        tmp = ggml_rope_ext(ctx0, tmp,
                 shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                 yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
 
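For context, the DEEPSEEK2 branch added above folds the YaRN attention-scale correction directly into the shift graph. A minimal standalone sketch evaluating the same expression for a hypothetical freq_scale of 0.25 (a 4x context extension; the value and the program around it are illustrative only, not part of this package):

    // Standalone sketch, not part of the diff: evaluates the DEEPSEEK2 YaRN
    // attention factor used above for an assumed freq_scale of 0.25.
    #include <cmath>
    #include <cstdio>

    int main() {
        const float freq_scale = 0.25f; // hypothetical value for illustration
        const float yarn_attn_factor = 1.0f / (1.0f + 0.1f * std::log(1.0f / freq_scale));
        std::printf("yarn_attn_factor = %.3f\n", yarn_attn_factor); // prints ~0.878
        return 0;
    }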
@@ -560,7 +571,7 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
                 ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
                 0);
 
-        ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, kv_self->k_l[il]->buffer);
+        ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -1184,33 +1195,7 @@ int llama_context::decode(llama_batch & inp_batch) {
     const int64_t n_tokens_all = batch.n_tokens;
     const int64_t n_embd       = hparams.n_embd;
 
-    // TODO: remove this stuff
-    class batch_guard {
-    public:
-        batch_guard(llama_kv_cache_unified & kv_self) : kv_slot_restorer(kv_self) {
-        }
-
-        ~batch_guard() {
-            if (!is_done) {
-                kv_slot_restorer.restore();
-            }
-        }
-
-        void done() {
-            is_done = true;
-        }
-
-        void save(const llama_kv_cache_slot_info & slot_info) {
-            kv_slot_restorer.save(slot_info);
-        }
-
-    private:
-        bool is_done = false;
-
-        llama_kv_slot_restorer kv_slot_restorer;
-    };
-
-    batch_guard bg(*kv_self);
+    llama_kv_cache_guard kv_guard(kv_self.get());
 
     GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
 
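The hand-rolled batch_guard removed above is replaced by llama_kv_cache_guard, which follows the usual commit-or-rollback RAII shape: the destructor undoes the pending KV-cache changes unless the batch completed and commit() was called. A minimal generic sketch of that shape (illustrative only; the names below are not the llama.cpp API):

    // Generic commit-or-rollback guard, for illustration; not the actual llama_kv_cache_guard.
    #include <functional>
    #include <utility>

    class scoped_rollback {
    public:
        explicit scoped_rollback(std::function<void()> undo) : undo_(std::move(undo)) {}
        ~scoped_rollback() { if (!committed_) undo_(); } // roll back on early return or exception
        void commit() { committed_ = true; }             // call once the work has succeeded
    private:
        std::function<void()> undo_;
        bool committed_ = false;
    };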
@@ -1263,6 +1248,9 @@ int llama_context::decode(llama_batch & inp_batch) {
         return -2;
     };
 
+    // handle any pending defrags/shifts
+    kv_self_update();
+
     int64_t n_outputs_prev = 0;
 
     while (sbatch.n_tokens > 0) {
@@ -1300,24 +1288,14 @@ int llama_context::decode(llama_batch & inp_batch) {
             n_outputs = n_outputs_new;
         }
 
-        // non-causal masks do not use the KV cache
-        if (hparams.causal_attn) {
-            kv_self_update();
-
-            // if we have enough unused cells before the current head ->
-            //   better to start searching from the beginning of the cache, hoping to fill it
-            if (kv_self->head > kv_self->used + 2*ubatch.n_tokens) {
-                kv_self->head = 0;
-            }
+        // find KV slot
+        {
+            if (!kv_self->find_slot(ubatch)) {
+                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
 
-            const auto slot_info = kv_self->find_slot(ubatch);
-            if (!slot_info) {
-                LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__);
-                return -3;
+                return 1;
             }
 
-            bg.save(slot_info);
-
             if (!kv_self->recurrent) {
                 // a heuristic, to avoid attending the full cache if it is not yet utilized
                 // after enough generations, the benefit from this heuristic disappears
@@ -1354,16 +1332,6 @@ int llama_context::decode(llama_batch & inp_batch) {
             }
         }
 
-        // update the kv ring buffer
-        {
-            kv_self->head += ubatch.n_tokens;
-
-            // Ensure kv cache head points to a valid index.
-            if (kv_self->head >= kv_self->size) {
-                kv_self->head = 0;
-            }
-        }
-
         // plot the computation graph in dot format (for debugging purposes)
         //if (n_past%100 == 0) {
         //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
@@ -1450,7 +1418,7 @@ int llama_context::decode(llama_batch & inp_batch) {
     }
 
     // finalize the batch processing
-    bg.done();
+    kv_guard.commit();
 
     // set output mappings
     {
@@ -1568,8 +1536,6 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
     // set all ids as invalid (negative)
     std::fill(output_ids.begin(), output_ids.end(), -1);
 
-    ggml_backend_buffer_clear(buf_output.get(), 0);
-
     this->n_outputs     = 0;
     this->n_outputs_max = n_outputs_max;
 
@@ -2299,11 +2265,6 @@ llama_context * llama_init_from_model(
         params.flash_attn = false;
     }
 
-    if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) {
-        LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__);
-        params.flash_attn = false;
-    }
-
     if (ggml_is_quantized(params.type_v) && !params.flash_attn) {
         LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
         return nullptr;
@@ -2504,7 +2465,12 @@ int32_t llama_get_kv_cache_token_count(const llama_context * ctx) {
 }
 
 int32_t llama_kv_self_n_tokens(const llama_context * ctx) {
-    return llama_kv_cache_n_tokens(ctx->get_kv_self());
+    const auto * kv = ctx->get_kv_self();
+    if (!kv) {
+        return 0;
+    }
+
+    return kv->get_n_tokens();
 }
 
 // deprecated
@@ -2513,7 +2479,12 @@ int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) {
 }
 
 int32_t llama_kv_self_used_cells(const llama_context * ctx) {
-    return llama_kv_cache_used_cells(ctx->get_kv_self());
+    const auto * kv = ctx->get_kv_self();
+    if (!kv) {
+        return 0;
+    }
+
+    return kv->get_used_cells();
 }
 
 // deprecated
@@ -2522,7 +2493,12 @@ void llama_kv_cache_clear(llama_context * ctx) {
 }
 
 void llama_kv_self_clear(llama_context * ctx) {
-    llama_kv_cache_clear(ctx->get_kv_self());
+    auto * kv = ctx->get_kv_self();
+    if (!kv) {
+        return;
+    }
+
+    kv->clear();
 }
 
 // deprecated
@@ -2539,7 +2515,12 @@ bool llama_kv_self_seq_rm(
         llama_seq_id   seq_id,
            llama_pos   p0,
            llama_pos   p1) {
-    return llama_kv_cache_seq_rm(ctx->get_kv_self(), seq_id, p0, p1);
+    auto * kv = ctx->get_kv_self();
+    if (!kv) {
+        return true;
+    }
+
+    return kv->seq_rm(seq_id, p0, p1);
 }
 
 // deprecated
@@ -2558,7 +2539,12 @@ void llama_kv_self_seq_cp(
         llama_seq_id   seq_id_dst,
            llama_pos   p0,
            llama_pos   p1) {
-    return llama_kv_cache_seq_cp(ctx->get_kv_self(), seq_id_src, seq_id_dst, p0, p1);
+    auto * kv = ctx->get_kv_self();
+    if (!kv) {
+        return;
+    }
+
+    return kv->seq_cp(seq_id_src, seq_id_dst, p0, p1);
 }
 
 // deprecated
@@ -2569,7 +2555,12 @@ void llama_kv_cache_seq_keep(
 }
 
 void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
-    return llama_kv_cache_seq_keep(ctx->get_kv_self(), seq_id);
+    auto * kv = ctx->get_kv_self();
+    if (!kv) {
+        return;
+    }
+
+    return kv->seq_keep(seq_id);
 }
 
 // deprecated
@@ -2588,7 +2579,12 @@ void llama_kv_self_seq_add(
            llama_pos   p0,
            llama_pos   p1,
            llama_pos   delta) {
-    return llama_kv_cache_seq_add(ctx->get_kv_self(), seq_id, p0, p1, delta);
+    auto * kv = ctx->get_kv_self();
+    if (!kv) {
+        return;
+    }
+
+    return kv->seq_add(seq_id, p0, p1, delta);
 }
 
 // deprecated
@@ -2607,7 +2603,12 @@ void llama_kv_self_seq_div(
            llama_pos   p0,
            llama_pos   p1,
                  int   d) {
-    return llama_kv_cache_seq_div(ctx->get_kv_self(), seq_id, p0, p1, d);
+    auto * kv = ctx->get_kv_self();
+    if (!kv) {
+        return;
+    }
+
+    return kv->seq_div(seq_id, p0, p1, d);
 }
 
 // deprecated
@@ -2616,7 +2617,12 @@ llama_pos llama_kv_cache_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
 }
 
 llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
-    return llama_kv_cache_seq_pos_max(ctx->get_kv_self(), seq_id);
+    const auto * kv = ctx->get_kv_self();
+    if (!kv) {
+        return 0;
+    }
+
+    return kv->seq_pos_max(seq_id);
 }
 
 // deprecated
@@ -2625,7 +2631,12 @@ void llama_kv_cache_defrag(llama_context * ctx) {
 }
 
 void llama_kv_self_defrag(llama_context * ctx) {
-    llama_kv_cache_defrag(ctx->get_kv_self());
+    auto * kv = ctx->get_kv_self();
+    if (!kv) {
+        return;
+    }
+
+    return kv->defrag();
 }
 
 // deprecated
@@ -2634,7 +2645,12 @@ bool llama_kv_cache_can_shift(const llama_context * ctx) {
 }
 
 bool llama_kv_self_can_shift(const llama_context * ctx) {
-    return llama_kv_cache_can_shift(ctx->get_kv_self());
+    const auto * kv = ctx->get_kv_self();
+    if (!kv) {
+        return false;
+    }
+
+    return kv->get_can_shift();
 }
 
 // deprecated
package/src/llama.cpp/src/llama-context.h

@@ -170,8 +170,7 @@ private:
            ggml_tensor * shift,
            ggml_tensor * factors,
                  float   freq_base,
-                 float   freq_scale,
-            ggml_backend_buffer * bbuf) const;
+                 float   freq_scale) const;
 
     llm_graph_result_ptr build_kv_self_shift(
             ggml_context * ctx0,