@fugood/llama.node 0.4.7 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +20 -6
  18. package/lib/index.js +41 -17
  19. package/lib/index.ts +50 -23
  20. package/package.json +1 -1
  21. package/src/LlamaCompletionWorker.cpp +9 -9
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +37 -18
  24. package/src/LlamaContext.h +1 -0
  25. package/src/TokenizeWorker.cpp +16 -12
  26. package/src/TokenizeWorker.h +2 -2
  27. package/src/common.hpp +54 -50
  28. package/src/llama.cpp/.github/workflows/build.yml +2 -2
  29. package/src/llama.cpp/.github/workflows/release.yml +152 -129
  30. package/src/llama.cpp/.github/workflows/winget.yml +42 -0
  31. package/src/llama.cpp/common/arg.cpp +14 -13
  32. package/src/llama.cpp/common/common.cpp +4 -75
  33. package/src/llama.cpp/common/common.h +7 -12
  34. package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
  35. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
  36. package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
  37. package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
  38. package/src/llama.cpp/examples/simple/simple.cpp +1 -1
  39. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  40. package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
  41. package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
  42. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  43. package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
  44. package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
  45. package/src/llama.cpp/ggml/include/ggml.h +11 -0
  46. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
  47. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
  48. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
  50. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
  51. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
  52. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  53. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
  54. package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
  55. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
  56. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
  57. package/src/llama.cpp/ggml/src/ggml.c +64 -18
  58. package/src/llama.cpp/include/llama.h +24 -124
  59. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  60. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  61. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  62. package/src/llama.cpp/src/llama-batch.cpp +3 -1
  63. package/src/llama.cpp/src/llama-context.cpp +60 -110
  64. package/src/llama.cpp/src/llama-graph.cpp +137 -233
  65. package/src/llama.cpp/src/llama-graph.h +49 -7
  66. package/src/llama.cpp/src/llama-hparams.cpp +17 -1
  67. package/src/llama.cpp/src/llama-hparams.h +34 -5
  68. package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
  69. package/src/llama.cpp/src/llama-kv-cache.h +201 -85
  70. package/src/llama.cpp/src/llama-memory.h +3 -2
  71. package/src/llama.cpp/src/llama-model.cpp +273 -94
  72. package/src/llama.cpp/src/llama-model.h +4 -1
  73. package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
  74. package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
  75. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
  76. package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
  77. package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
  78. package/src/llama.cpp/tools/mtmd/clip.h +6 -4
  79. package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
  80. package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  81. package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
  82. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
  83. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
  84. package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
  85. package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
  86. package/src/llama.cpp/tools/run/run.cpp +2 -2
  87. package/src/llama.cpp/tools/server/server.cpp +158 -47
  88. package/src/llama.cpp/tools/server/utils.hpp +71 -43
  89. package/src/llama.cpp/tools/tts/tts.cpp +4 -2
@@ -64,12 +64,17 @@
  // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
  float ggml_table_f32_f16[1 << 16];

- #if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) && \
- (!defined(TARGET_OS_TV) && !defined(TARGET_OS_WATCH))
+ #if defined(__linux__) || \
+ defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
+ (defined(__APPLE__) && !TARGET_OS_TV && !TARGET_OS_WATCH)
+
  #include <unistd.h>
  #include <sys/types.h>
  #include <sys/stat.h>
  #include <sys/wait.h>
+ #if defined(__linux__)
+ #include <sys/prctl.h>
+ #endif

  #if defined(__ANDROID__)
  #include <unwind.h>
@@ -133,10 +138,36 @@ static void ggml_print_backtrace(void) {
  if (GGML_NO_BACKTRACE) {
  return;
  }
- char attach[32];
- snprintf(attach, sizeof(attach), "attach %d", getpid());
- int pid = fork();
- if (pid == 0) {
+ #if defined(__linux__)
+ FILE * f = fopen("/proc/self/status", "r");
+ size_t size = 0;
+ char * line = NULL;
+ ssize_t length = 0;
+ while ((length = getline(&line, &size, f)) > 0) {
+ if (!strncmp(line, "TracerPid:", sizeof("TracerPid:") - 1) &&
+ (length != sizeof("TracerPid:\t0\n") - 1 || line[length - 2] != '0')) {
+ // Already being debugged, and the breakpoint is the later abort()
+ free(line);
+ fclose(f);
+ return;
+ }
+ }
+ free(line);
+ fclose(f);
+ int lock[2] = { -1, -1 };
+ (void) !pipe(lock); // Don't start gdb until after PR_SET_PTRACER
+ #endif
+ const int parent_pid = getpid();
+ const int child_pid = fork();
+ if (child_pid < 0) { // error
+ return;
+ } else if (child_pid == 0) { // child
+ char attach[32];
+ snprintf(attach, sizeof(attach), "attach %d", parent_pid);
+ #if defined(__linux__)
+ close(lock[1]);
+ (void) !read(lock[0], lock, 1);
+ #endif
  // try gdb
  execlp("gdb", "gdb", "--batch",
  "-ex", "set style enabled on",
@@ -149,18 +180,18 @@ static void ggml_print_backtrace(void) {
  execlp("lldb", "lldb", "--batch",
  "-o", "bt",
  "-o", "quit",
- "-p", attach,
+ "-p", &attach[sizeof("attach ") - 1],
  (char *) NULL);
- exit(EXIT_FAILURE);
- } else {
- int wstatus;
- waitpid(pid, &wstatus, 0);
- if (WIFEXITED(wstatus)) {
- if (WEXITSTATUS(wstatus) == EXIT_FAILURE) {
- // gdb failed, fallback to backtrace_symbols
- ggml_print_backtrace_symbols();
- }
- }
+ // gdb failed, fallback to backtrace_symbols
+ ggml_print_backtrace_symbols();
+ _Exit(0);
+ } else { // parent
+ #if defined(__linux__)
+ prctl(PR_SET_PTRACER, child_pid);
+ close(lock[1]);
+ close(lock[0]);
+ #endif
+ waitpid(child_pid, NULL, 0);
  }
  }
  #else
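
Note: the rewritten ggml_print_backtrace above now skips spawning a debugger when one is already attached (it reads the TracerPid field from /proc/self/status), and it uses prctl(PR_SET_PTRACER, child_pid) plus a pipe so the gdb/lldb child only attaches after the parent has granted ptrace permission. A minimal standalone sketch of the TracerPid check, for illustration only (this is not the patch code, and debugger_attached is a hypothetical helper name):

    // Linux-only sketch: returns true if another process is ptrace-attached to us.
    #include <cstdio>
    #include <cstdlib>
    #include <cstring>

    static bool debugger_attached(void) {
        FILE * f = fopen("/proc/self/status", "r");
        if (!f) {
            return false;
        }
        char * line = NULL;
        size_t size = 0;
        bool traced = false;
        while (getline(&line, &size, f) > 0) {
            if (strncmp(line, "TracerPid:", 10) == 0) {
                // a non-zero pid after "TracerPid:" means a tracer is attached
                traced = strtol(line + 10, NULL, 10) != 0;
                break;
            }
        }
        free(line);
        fclose(f);
        return traced;
    }
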
@@ -1068,9 +1099,10 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
  "HARDSWISH",
  "HARDSIGMOID",
  "EXP",
+ "GELU_ERF",
  };

- static_assert(GGML_UNARY_OP_COUNT == 14, "GGML_UNARY_OP_COUNT != 14");
+ static_assert(GGML_UNARY_OP_COUNT == 15, "GGML_UNARY_OP_COUNT != 15");


  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
@@ -2470,6 +2502,20 @@ struct ggml_tensor * ggml_gelu_inplace(
  return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU);
  }

+ // ggml_gelu_erf
+
+ struct ggml_tensor * ggml_gelu_erf(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a) {
+ return ggml_unary(ctx, a, GGML_UNARY_OP_GELU_ERF);
+ }
+
+ struct ggml_tensor * ggml_gelu_erf_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a) {
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU_ERF);
+ }
+
  // ggml_gelu_quick

  struct ggml_tensor * ggml_gelu_quick(
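
Note: the new GGML_UNARY_OP_GELU_ERF operator added above is the exact, erf-based form of GELU rather than the tanh approximation used by the existing GELU op. A scalar reference form, shown here only for orientation (gelu_erf_ref is a hypothetical helper, not part of the diff):

    #include <cmath>

    // exact GELU: 0.5 * x * (1 + erf(x / sqrt(2)))
    static float gelu_erf_ref(float x) {
        return 0.5f * x * (1.0f + erff(x * 0.70710678118654752440f)); // x * 1/sqrt(2)
    }
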
@@ -361,10 +361,11 @@ extern "C" {
361
361
 
362
362
  // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
363
363
  bool embeddings; // if true, extract embeddings (together with logits)
364
- bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
365
- bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
366
- bool no_perf; // whether to measure performance timings
367
- bool op_offload; // whether to offload host tensor operations to device
364
+ bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
365
+ bool flash_attn; // use flash attention [EXPERIMENTAL]
366
+ bool no_perf; // measure performance timings
367
+ bool op_offload; // offload host tensor operations to device
368
+ bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
368
369
  };
369
370
 
370
371
  // model quantization parameters
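
Note: the new swa_full field defaults to true (see the llama_context_default_params change further down), so existing callers keep the previous behavior. A minimal sketch of opting into the smaller sliding-window-attention KV cache; make_context is a hypothetical helper and assumes an already loaded model:

    #include "llama.h"

    llama_context * make_context(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx    = 8192;
        cparams.swa_full = false; // allow an SWA-sized KV cache instead of a full-size one
        return llama_init_from_model(model, cparams);
    }
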
@@ -607,71 +608,14 @@ extern "C" {
  // KV cache
  //

- // TODO: start using struct llama_kv_cache
-
- // Information associated with an individual cell in the KV cache view.
- struct llama_kv_cache_view_cell {
- // The position for this cell. Takes KV cache shifts into account.
- // May be negative if the cell is not populated.
- llama_pos pos;
- };
-
- // An updateable view of the KV cache.
- struct llama_kv_cache_view {
- // Number of KV cache cells. This will be the same as the context size.
- int32_t n_cells;
-
- // Maximum number of sequences that can exist in a cell. It's not an error
- // if there are more sequences in a cell than this value, however they will
- // not be visible in the view cells_sequences.
- int32_t n_seq_max;
-
- // Number of tokens in the cache. For example, if there are two populated
- // cells, the first with 1 sequence id in it and the second with 2 sequence
- // ids then you'll have 3 tokens.
- int32_t token_count;
-
- // Number of populated cache cells.
- int32_t used_cells;
-
- // Maximum contiguous empty slots in the cache.
- int32_t max_contiguous;
-
- // Index to the start of the max_contiguous slot range. Can be negative
- // when cache is full.
- int32_t max_contiguous_idx;
-
- // Information for an individual cell.
- struct llama_kv_cache_view_cell * cells;
-
- // The sequences for each cell. There will be n_seq_max items per cell.
- llama_seq_id * cells_sequences;
- };
-
- // Create an empty KV cache view. (use only for debugging purposes)
- LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
-
- // Free a KV cache view. (use only for debugging purposes)
- LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
-
- // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
- // TODO: change signature to llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_context * ctx)
- LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
-
- ///
-
  // Returns the number of tokens in the KV cache (slow, use only for debug)
  // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
- LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
-
- DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
- "use llama_kv_self_n_tokens instead");
+ DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
+ "Use llama_kv_self_seq_pos_max() instead");

  // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
- LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
-
- DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
- "use llama_kv_self_used_cells instead");
+ DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
+ "Use llama_kv_self_seq_pos_max() instead");

  // Clear the KV cache - both cell info is erased and KV data is zeroed
  LLAMA_API void llama_kv_self_clear(
@@ -730,10 +674,18 @@ extern "C" {
730
674
  llama_pos p1,
731
675
  int d);
732
676
 
677
+ // Returns the smallest position present in the KV cache for the specified sequence
678
+ // This is typically non-zero only for SWA caches
679
+ // Return -1 if the sequence is empty
680
+ LLAMA_API llama_pos llama_kv_self_seq_pos_min(
681
+ struct llama_context * ctx,
682
+ llama_seq_id seq_id);
683
+
733
684
  // Returns the largest position present in the KV cache for the specified sequence
685
+ // Return -1 if the sequence is empty
734
686
  LLAMA_API llama_pos llama_kv_self_seq_pos_max(
735
687
  struct llama_context * ctx,
736
- llama_seq_id seq_id);
688
+ llama_seq_id seq_id);
737
689
 
738
690
  // Defragment the KV cache
739
691
  // This will be applied:
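
Note: with llama_kv_self_n_tokens and llama_kv_self_used_cells deprecated above, cache occupancy is now queried per sequence through llama_kv_self_seq_pos_min and llama_kv_self_seq_pos_max, both of which return -1 for an empty sequence. A caller-side sketch (count_cached_positions is a hypothetical helper; it mirrors what the deprecated functions now compute internally, see the llama-context.cpp change below):

    #include "llama.h"

    // sums the per-sequence position ranges currently held in the KV cache
    int32_t count_cached_positions(llama_context * ctx, int32_t n_seq_max) {
        int32_t total = 0;
        for (llama_seq_id s = 0; s < n_seq_max; ++s) {
            const llama_pos p0 = llama_kv_self_seq_pos_min(ctx, s);
            const llama_pos p1 = llama_kv_self_seq_pos_max(ctx, s);
            if (p0 >= 0) {
                total += (p1 - p0) + 1;
            }
        }
        return total;
    }
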
@@ -747,61 +699,6 @@ extern "C" {
747
699
  // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
748
700
  LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
749
701
 
750
- DEPRECATED(LLAMA_API void llama_kv_cache_clear(
751
- struct llama_context * ctx),
752
- "use llama_kv_self_clear instead");
753
-
754
- DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm(
755
- struct llama_context * ctx,
756
- llama_seq_id seq_id,
757
- llama_pos p0,
758
- llama_pos p1),
759
- "use llama_kv_self_seq_rm instead");
760
-
761
- DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp(
762
- struct llama_context * ctx,
763
- llama_seq_id seq_id_src,
764
- llama_seq_id seq_id_dst,
765
- llama_pos p0,
766
- llama_pos p1),
767
- "use llama_kv_self_seq_cp instead");
768
-
769
- DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep(
770
- struct llama_context * ctx,
771
- llama_seq_id seq_id),
772
- "use llama_kv_self_seq_keep instead");
773
-
774
- DEPRECATED(LLAMA_API void llama_kv_cache_seq_add(
775
- struct llama_context * ctx,
776
- llama_seq_id seq_id,
777
- llama_pos p0,
778
- llama_pos p1,
779
- llama_pos delta),
780
- "use llama_kv_self_seq_add instead");
781
-
782
- DEPRECATED(LLAMA_API void llama_kv_cache_seq_div(
783
- struct llama_context * ctx,
784
- llama_seq_id seq_id,
785
- llama_pos p0,
786
- llama_pos p1,
787
- int d),
788
- "use llama_kv_self_seq_div instead");
789
-
790
- DEPRECATED(LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
791
- struct llama_context * ctx,
792
- llama_seq_id seq_id),
793
- "use llama_kv_self_seq_pos_max instead");
794
-
795
- DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx),
796
- "use llama_kv_self_defrag instead");
797
-
798
- DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx),
799
- "use llama_kv_self_can_shift instead");
800
-
801
- DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx),
802
- "use llama_kv_self_update instead");
803
-
804
-
805
702
  //
806
703
  // State / sessions
807
704
  //
@@ -943,9 +840,12 @@ extern "C" {
943
840
  // Requires KV cache.
944
841
  // For encode-decoder contexts, processes the batch using the decoder.
945
842
  // Positive return values does not mean a fatal error, but rather a warning.
946
- // 0 - success
947
- // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
948
- // < 0 - error. the KV cache state is restored to the state before this call
843
+ // Upon non-zero return values, the KV cache state is restored to the state before this call
844
+ // 0 - success
845
+ // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
846
+ // 2 - aborted
847
+ // -1 - invalid input batch
848
+ // < -1 - error
949
849
  LLAMA_API int32_t llama_decode(
950
850
  struct llama_context * ctx,
951
851
  struct llama_batch batch);
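
Note: the return-value contract documented above is finer-grained than before (2 for an aborted decode, -1 for an invalid input batch), and any non-zero return now restores the KV cache state. A sketch of how a caller might branch on it; decode_checked is a hypothetical helper, shown only as an illustration:

    #include "llama.h"
    #include <cstdio>

    // returns true if the batch was decoded, false otherwise
    static bool decode_checked(llama_context * ctx, llama_batch batch) {
        const int32_t ret = llama_decode(ctx, batch);
        if (ret == 0) {
            return true; // success
        }
        if (ret == 1) {
            fprintf(stderr, "no KV slot: shrink the batch or free cache space\n");
        } else if (ret == 2) {
            fprintf(stderr, "decode was aborted\n");
        } else {
            fprintf(stderr, "decode error %d (invalid batch or internal error)\n", ret);
        }
        return false;
    }
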
@@ -1,3 +1,7 @@
  -r ./requirements-convert_legacy_llama.txt
  --extra-index-url https://download.pytorch.org/whl/cpu
- torch~=2.2.1
+ torch~=2.2.1; platform_machine != "s390x"
+
+ # torch s390x packages can only be found from nightly builds
+ --extra-index-url https://download.pytorch.org/whl/nightly
+ torch>=0.0.0.dev0; platform_machine == "s390x"
@@ -1,3 +1,7 @@
  -r ./requirements-convert_legacy_llama.txt
  --extra-index-url https://download.pytorch.org/whl/cpu
- torch~=2.2.1
+ torch~=2.2.1; platform_machine != "s390x"
+
+ # torch s390x packages can only be found from nightly builds
+ --extra-index-url https://download.pytorch.org/whl/nightly
+ torch>=0.0.0.dev0; platform_machine == "s390x"
@@ -1,2 +1,4 @@
  -r ./requirements-convert_hf_to_gguf.txt
  --extra-index-url https://download.pytorch.org/whl/cpu
+ # torch s390x packages can only be found from nightly builds
+ --extra-index-url https://download.pytorch.org/whl/nightly
@@ -1,5 +1,6 @@
  #include "llama-batch.h"

+ #include <cassert>
  #include <cstring>
  #include <algorithm>

@@ -281,9 +282,10 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0
  batch = in_batch;
  GGML_ASSERT(batch.n_tokens > 0);
  if (!batch.pos) {
+ assert(p0 >= 0);
  pos.resize(batch.n_tokens);
  for (int32_t i = 0; i < batch.n_tokens; i++) {
- pos[i] = i + p0;
+ pos[i] = p0 + i;
  }
  batch.pos = pos.data();
  }
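
Note: per the llama_batch_allocr change above (and the decode() change further below), a batch passed without explicit positions now gets positions assigned sequentially from p0, which decode() derives from seq_pos_max(0) + 1, and p0 is asserted to be non-negative. A sketch of the common case this covers, where llama_batch_get_one supplies neither pos nor seq_id (decode_tokens is a hypothetical helper):

    #include "llama.h"
    #include <vector>

    // feed tokens with no explicit positions; they continue from the
    // last cached position of sequence 0
    static int32_t decode_tokens(llama_context * ctx, std::vector<llama_token> & toks) {
        llama_batch batch = llama_batch_get_one(toks.data(), (int32_t) toks.size());
        return llama_decode(ctx, batch);
    }
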
@@ -93,6 +93,7 @@ llama_context::llama_context(
  }

  cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
+
  cparams.op_offload = params.op_offload;

  const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
@@ -176,8 +177,9 @@ llama_context::llama_context(
  // init the memory module
  if (!hparams.vocab_only) {
  llama_memory_params params_mem = {
- /*.type_k =*/ params.type_k,
- /*.type_v =*/ params.type_v,
+ /*.type_k =*/ params.type_k,
+ /*.type_v =*/ params.type_v,
+ /*.swa_full =*/ params.swa_full,
  };

  memory.reset(model.create_memory(params_mem, cparams));
@@ -855,11 +857,17 @@ int llama_context::decode(llama_batch & inp_batch) {
855
857
  return -1;
856
858
  }
857
859
 
860
+ if (!inp_batch.pos) {
861
+ if (inp_batch.seq_id) {
862
+ LLAMA_LOG_ERROR("%s: pos == NULL, but seq_id != NULL\n", __func__);
863
+ return -1;
864
+ }
865
+ }
866
+
858
867
  llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
859
868
 
860
869
  // temporary allocate memory for the input batch if needed
861
- // TODO: this is incorrect for multiple sequences because get_pos_max() is the maximum across all sequences
862
- llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->get_pos_max() + 1);
870
+ llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->seq_pos_max(0) + 1);
863
871
 
864
872
  const llama_batch & batch = batch_allocr.batch;
865
873
 
@@ -947,8 +955,6 @@

  // find KV slot
  if (!kv_self->find_slot(ubatch)) {
- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
  return 1;
  }

@@ -2093,6 +2099,7 @@ llama_context_params llama_context_default_params() {
  /*.flash_attn =*/ false,
  /*.no_perf =*/ true,
  /*.op_offload =*/ true,
+ /*.swa_full =*/ true,
  };

  return result;
@@ -2287,65 +2294,51 @@ int32_t llama_apply_adapter_cvec(
  return res ? 0 : -1;
  }

- //
- // kv cache view
- //
-
- llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) {
- const auto * kv = ctx->get_kv_self();
- if (kv == nullptr) {
- LLAMA_LOG_WARN("%s: the context does not have a KV cache\n", __func__);
- return {};
- }
-
- return llama_kv_cache_view_init(*kv, n_seq_max);
- }
-
- void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * view) {
- const auto * kv = ctx->get_kv_self();
- if (kv == nullptr) {
- LLAMA_LOG_WARN("%s: the context does not have a KV cache\n", __func__);
- return;
- }
-
- llama_kv_cache_view_update(view, kv);
- }
-
  //
  // kv cache
  //

  // deprecated
- int32_t llama_get_kv_cache_token_count(const llama_context * ctx) {
- return llama_kv_self_n_tokens(ctx);
- }
-
  int32_t llama_kv_self_n_tokens(const llama_context * ctx) {
  const auto * kv = ctx->get_kv_self();
  if (!kv) {
  return 0;
  }

- return kv->get_n_tokens();
- }
+ int32_t res = 0;

- // deprecated
- int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) {
- return llama_kv_self_used_cells(ctx);
+ for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
+ const llama_pos p0 = kv->seq_pos_min(s);
+ const llama_pos p1 = kv->seq_pos_max(s);
+
+ if (p0 >= 0) {
+ res += (p1 - p0) + 1;
+ }
+ }
+
+ return res;
  }

+ // deprecated
+ // note: this is the same as above - will be removed anyway, so it's ok
  int32_t llama_kv_self_used_cells(const llama_context * ctx) {
  const auto * kv = ctx->get_kv_self();
  if (!kv) {
  return 0;
  }

- return kv->get_used_cells();
- }
+ int32_t res = 0;

- // deprecated
- void llama_kv_cache_clear(llama_context * ctx) {
- llama_kv_self_clear(ctx);
+ for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
+ const llama_pos p0 = kv->seq_pos_min(s);
+ const llama_pos p1 = kv->seq_pos_max(s);
+
+ if (p0 >= 0) {
+ res += (p1 - p0) + 1;
+ }
+ }
+
+ return res;
  }

  void llama_kv_self_clear(llama_context * ctx) {
@@ -2357,15 +2350,6 @@ void llama_kv_self_clear(llama_context * ctx) {
  kv->clear();
  }

- // deprecated
- bool llama_kv_cache_seq_rm(
- llama_context * ctx,
- llama_seq_id seq_id,
- llama_pos p0,
- llama_pos p1) {
- return llama_kv_self_seq_rm(ctx, seq_id, p0, p1);
- }
-
  bool llama_kv_self_seq_rm(
  llama_context * ctx,
  llama_seq_id seq_id,
@@ -2379,16 +2363,6 @@ bool llama_kv_self_seq_rm(
  return kv->seq_rm(seq_id, p0, p1);
  }

- // deprecated
- void llama_kv_cache_seq_cp(
- llama_context * ctx,
- llama_seq_id seq_id_src,
- llama_seq_id seq_id_dst,
- llama_pos p0,
- llama_pos p1) {
- llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1);
- }
-
  void llama_kv_self_seq_cp(
  llama_context * ctx,
  llama_seq_id seq_id_src,
@@ -2403,13 +2377,6 @@ void llama_kv_self_seq_cp(
  kv->seq_cp(seq_id_src, seq_id_dst, p0, p1);
  }

- // deprecated
- void llama_kv_cache_seq_keep(
- llama_context * ctx,
- llama_seq_id seq_id) {
- llama_kv_self_seq_keep(ctx, seq_id);
- }
-
  void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
  auto * kv = ctx->get_kv_self();
  if (!kv) {
@@ -2419,16 +2386,6 @@ void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
  kv->seq_keep(seq_id);
  }

- // deprecated
- void llama_kv_cache_seq_add(
- llama_context * ctx,
- llama_seq_id seq_id,
- llama_pos p0,
- llama_pos p1,
- llama_pos delta) {
- llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta);
- }
-
  void llama_kv_self_seq_add(
  llama_context * ctx,
  llama_seq_id seq_id,
@@ -2443,16 +2400,6 @@ void llama_kv_self_seq_add(
  kv->seq_add(seq_id, p0, p1, delta);
  }

- // deprecated
- void llama_kv_cache_seq_div(
- llama_context * ctx,
- llama_seq_id seq_id,
- llama_pos p0,
- llama_pos p1,
- int d) {
- llama_kv_self_seq_div(ctx, seq_id, p0, p1, d);
- }
-
  void llama_kv_self_seq_div(
  llama_context * ctx,
  llama_seq_id seq_id,
@@ -2467,25 +2414,24 @@ void llama_kv_self_seq_div(
  kv->seq_div(seq_id, p0, p1, d);
  }

- // deprecated
- llama_pos llama_kv_cache_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
- return llama_kv_self_seq_pos_max(ctx, seq_id);
+ llama_pos llama_kv_self_seq_pos_min(llama_context * ctx, llama_seq_id seq_id) {
+ const auto * kv = ctx->get_kv_self();
+ if (!kv) {
+ return -1;
+ }
+
+ return kv->seq_pos_min(seq_id);
  }

  llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
  const auto * kv = ctx->get_kv_self();
  if (!kv) {
- return 0;
+ return -1;
  }

  return kv->seq_pos_max(seq_id);
  }

- // deprecated
- void llama_kv_cache_defrag(llama_context * ctx) {
- llama_kv_self_defrag(ctx);
- }
-
  void llama_kv_self_defrag(llama_context * ctx) {
  auto * kv = ctx->get_kv_self();
  if (!kv) {
@@ -2496,11 +2442,6 @@ void llama_kv_self_defrag(llama_context * ctx) {
  kv->defrag_sched(-1.0f);
  }

- // deprecated
- bool llama_kv_cache_can_shift(const llama_context * ctx) {
- return llama_kv_self_can_shift(ctx);
- }
-
  bool llama_kv_self_can_shift(const llama_context * ctx) {
  const auto * kv = ctx->get_kv_self();
  if (!kv) {
@@ -2510,11 +2451,6 @@ bool llama_kv_self_can_shift(const llama_context * ctx) {
  return kv->get_can_shift();
  }

- // deprecated
- void llama_kv_cache_update(llama_context * ctx) {
- llama_kv_self_update(ctx);
- }
-
  // llama state API

  // deprecated
@@ -2637,7 +2573,21 @@ int32_t llama_encode(
  int32_t llama_decode(
  llama_context * ctx,
  llama_batch batch) {
- const int ret = ctx->decode(batch);
+ int ret = ctx->decode(batch);
+
+ // defrag and try again
+ // TODO: distinguish return code when we are sure that even after defrag there is no space available
+ if (ret == 1) {
+ llama_kv_self_defrag(ctx);
+ ret = ctx->decode(batch);
+
+ if (ret == 1) {
+ LLAMA_LOG_WARN("%s: failed to find KV cache slot for batch of size %d\n", __func__, batch.n_tokens);
+
+ return ret;
+ }
+ }
+
  if (ret != 0) {
  LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
  }
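
Note: with the llama_decode change above, the wrapper itself now schedules a defrag pass and retries once when the first attempt returns 1, so a caller only sees 1 after that retry has also failed. At that point the remaining options are to shrink the batch or free cache space before trying again; one possible caller-side fallback, sketched here with the hypothetical helper decode_with_fallback and an application-chosen sequence to evict:

    #include "llama.h"

    static int32_t decode_with_fallback(llama_context * ctx, llama_batch batch,
                                        llama_seq_id evict_seq) {
        int32_t ret = llama_decode(ctx, batch); // already defrags + retries internally on ret == 1
        if (ret == 1) {
            // drop a sequence the application no longer needs, then retry once more
            llama_kv_self_seq_rm(ctx, evict_seq, -1, -1);
            ret = llama_decode(ctx, batch);
        }
        return ret;
    }
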