cui-llama.rn 1.4.0 → 1.4.1
This diff compares the publicly available contents of the two package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- package/android/src/main/jni.cpp +9 -9
- package/cpp/common.cpp +163 -60
- package/cpp/common.h +43 -12
- package/cpp/ggml-alloc.c +1042 -1037
- package/cpp/ggml-backend-impl.h +255 -256
- package/cpp/ggml-backend-reg.cpp +582 -582
- package/cpp/ggml-backend.cpp +2002 -2002
- package/cpp/ggml-backend.h +354 -352
- package/cpp/ggml-common.h +1853 -1853
- package/cpp/ggml-cpp.h +39 -39
- package/cpp/ggml-cpu-aarch64.cpp +4247 -4247
- package/cpp/ggml-cpu-aarch64.h +8 -8
- package/cpp/ggml-cpu-impl.h +386 -386
- package/cpp/ggml-cpu-quants.c +10920 -10839
- package/cpp/ggml-cpu-traits.cpp +36 -36
- package/cpp/ggml-cpu-traits.h +38 -38
- package/cpp/ggml-cpu.c +329 -60
- package/cpp/ggml-cpu.cpp +10 -2
- package/cpp/ggml-cpu.h +135 -135
- package/cpp/ggml-impl.h +567 -567
- package/cpp/ggml-metal-impl.h +17 -17
- package/cpp/ggml-metal.m +4884 -4884
- package/cpp/ggml-quants.c +5238 -5238
- package/cpp/ggml-threading.h +14 -14
- package/cpp/ggml.c +6514 -6448
- package/cpp/ggml.h +2194 -2163
- package/cpp/gguf.cpp +1329 -1325
- package/cpp/gguf.h +202 -202
- package/cpp/json-schema-to-grammar.cpp +1045 -1045
- package/cpp/json-schema-to-grammar.h +8 -8
- package/cpp/json.hpp +24766 -24766
- package/cpp/llama-adapter.cpp +347 -346
- package/cpp/llama-adapter.h +74 -73
- package/cpp/llama-arch.cpp +1487 -1434
- package/cpp/llama-arch.h +400 -395
- package/cpp/llama-batch.cpp +368 -368
- package/cpp/llama-batch.h +88 -88
- package/cpp/llama-chat.cpp +578 -567
- package/cpp/llama-chat.h +52 -51
- package/cpp/llama-context.cpp +1775 -1771
- package/cpp/llama-context.h +128 -128
- package/cpp/llama-cparams.cpp +1 -1
- package/cpp/llama-cparams.h +37 -37
- package/cpp/llama-cpp.h +30 -30
- package/cpp/llama-grammar.cpp +1139 -1139
- package/cpp/llama-grammar.h +143 -143
- package/cpp/llama-hparams.cpp +71 -71
- package/cpp/llama-hparams.h +139 -140
- package/cpp/llama-impl.cpp +167 -167
- package/cpp/llama-impl.h +61 -61
- package/cpp/llama-kv-cache.cpp +718 -718
- package/cpp/llama-kv-cache.h +218 -218
- package/cpp/llama-mmap.cpp +2 -1
- package/cpp/llama-mmap.h +67 -67
- package/cpp/llama-model-loader.cpp +1124 -1011
- package/cpp/llama-model-loader.h +167 -158
- package/cpp/llama-model.cpp +3997 -2202
- package/cpp/llama-model.h +370 -391
- package/cpp/llama-sampling.cpp +2408 -2406
- package/cpp/llama-sampling.h +32 -48
- package/cpp/llama-vocab.cpp +3247 -1982
- package/cpp/llama-vocab.h +125 -182
- package/cpp/llama.cpp +416 -2886
- package/cpp/llama.h +1323 -1285
- package/cpp/log.cpp +401 -401
- package/cpp/log.h +121 -121
- package/cpp/rn-llama.hpp +18 -12
- package/cpp/sampling.cpp +505 -500
- package/cpp/sgemm.cpp +2597 -2597
- package/cpp/speculative.cpp +277 -274
- package/cpp/speculative.h +28 -28
- package/cpp/unicode.cpp +2 -3
- package/package.json +1 -1
package/cpp/llama-kv-cache.h
CHANGED
@@ -1,218 +1,218 @@
-#pragma once
-
-#include "llama.h"
-
-#include "ggml-cpp.h"
-
-#include <set>
-#include <vector>
-
-struct llama_kv_cell {
-    llama_pos pos = -1;
-    llama_pos delta = 0;
-    int32_t src = -1; // used by recurrent state models to copy states
-    int32_t tail = -1;
-
-    std::set<llama_seq_id> seq_id;
-
-    bool has_seq_id(const llama_seq_id & id) const {
-        return seq_id.find(id) != seq_id.end();
-    }
-
-    bool is_empty() const {
-        return seq_id.empty();
-    }
-
-    bool is_same_seq(const llama_kv_cell & other) const {
-        return seq_id == other.seq_id;
-    }
-};
-
-// ring-buffer of cached KV data
-struct llama_kv_cache {
-    bool has_shift = false;
-    bool do_defrag = false;
-    bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
-    bool v_trans = true; // the value tensor is transposed
-    bool can_shift = false;
-
-    // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot. llama_decode_internal also uses it, so it
-    // cannot be freely changed after a slot has been allocated.
-    uint32_t head = 0;
-    uint32_t size = 0;
-    uint32_t used = 0; // used cells (i.e. at least one seq_id)
-
-    // computed before each graph build
-    uint32_t n = 0;
-
-    lm_ggml_type type_k = LM_GGML_TYPE_F16;
-    lm_ggml_type type_v = LM_GGML_TYPE_F16;
-
-    std::vector<llama_kv_cell> cells;
-
-    std::vector<struct lm_ggml_tensor *> k_l; // per layer
-    std::vector<struct lm_ggml_tensor *> v_l;
-
-    std::vector<lm_ggml_context_ptr> ctxs;
-    std::vector<lm_ggml_backend_buffer_ptr> bufs;
-
-    size_t total_size() const {
-        size_t size = 0;
-        for (const auto & buf : bufs) {
-            size += lm_ggml_backend_buffer_get_size(buf.get());
-        }
-
-        return size;
-    }
-
-    // TODO: better data structures to reduce the cost of this operation
-    llama_pos max_pos() const {
-        llama_pos max_pos = -1;
-        for (const auto & cell : cells) {
-            max_pos = std::max(max_pos, cell.pos);
-        }
-
-        return max_pos;
-    }
-};
-
-// a structure holds information about the slot found in llama_kv_cache_find_slot
-struct llama_kv_cache_slot_info {
-    std::pair<uint32_t, uint32_t> boundaries; // slot boundaries [begin, end)
-    bool found = false; // the slot was found
-
-    explicit llama_kv_cache_slot_info(bool found_) : found{found_} {}
-    llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {}
-
-    operator bool() const { return found; }
-};
-
-// TODO: maybe not needed
-uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams);
-
-bool llama_kv_cache_init(
-        struct llama_kv_cache & cache,
-        const llama_model & model,
-        const llama_cparams & cparams,
-        lm_ggml_type type_k,
-        lm_ggml_type type_v,
-        uint32_t kv_size,
-        bool offload);
-
-// find an empty slot of size "n_tokens" in the cache
-// updates the cache head
-// returns a structure holding information about the slot found
-// Note: On success, it's important that cache.head points
-// to the first cell of the slot.
-struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
-        struct llama_kv_cache & cache,
-        const struct llama_ubatch & batch);
-
-// find how many cells are currently in use
-uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache);
-
-void llama_kv_cache_clear(struct llama_kv_cache & cache);
-
-bool llama_kv_cache_seq_rm(
-        struct llama_kv_cache & cache,
-        llama_seq_id seq_id,
-        llama_pos p0,
-        llama_pos p1);
-
-void llama_kv_cache_seq_cp(
-        struct llama_kv_cache & cache,
-        llama_seq_id seq_id_src,
-        llama_seq_id seq_id_dst,
-        llama_pos p0,
-        llama_pos p1);
-
-void llama_kv_cache_seq_keep(
-        struct llama_kv_cache & cache,
-        llama_seq_id seq_id);
-
-void llama_kv_cache_seq_add(
-        struct llama_kv_cache & cache,
-        llama_seq_id seq_id,
-        llama_pos p0,
-        llama_pos p1,
-        llama_pos delta);
-
-void llama_kv_cache_seq_div(
-        struct llama_kv_cache & cache,
-        llama_seq_id seq_id,
-        llama_pos p0,
-        llama_pos p1,
-        int d);
-
-llama_pos llama_kv_cache_seq_pos_max(
-        struct llama_kv_cache & cache,
-        llama_seq_id seq_id);
-
-void llama_kv_cache_defrag(struct llama_kv_cache & cache);
-
-int32_t llama_get_kv_cache_token_count(const struct llama_kv_cache & kv);
-
-int32_t llama_get_kv_cache_used_cells(const struct llama_kv_cache & kv);
-
-bool llama_kv_cache_can_shift(const struct llama_kv_cache & kv);
-
-//
-// kv cache view
-//
-
-struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache & kv, int32_t n_seq_max);
-
-void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache & kv);
-
-//
-// kv cache restore
-//
-
-// saves the kv_cache state for future recovery.
-// used to rollback llama_kv_cache_find_slot changes.
-struct llama_kv_slot_restorer {
-    struct llama_kv_cache_state {
-        uint32_t head = 0;
-        uint32_t n = 0;
-    } old_state;
-
-    // for non-recurrent models only
-    // list of slots to restore
-    std::vector<std::pair<uint32_t, uint32_t>> slot_boundaries;
-
-    bool do_restore = false;
-
-    explicit llama_kv_slot_restorer(const struct llama_kv_cache & cache) {
-        old_state.head = cache.head;
-        old_state.n = cache.n;
-    }
-
-    // saves a slot information for future restoration
-    void save(const struct llama_kv_cache_slot_info & slot) {
-        if (slot) {
-            do_restore = true;
-            if (slot.boundaries.first != slot.boundaries.second) {
-                slot_boundaries.push_back(slot.boundaries);
-            }
-        }
-    }
-
-    // must be explicitly called to restore the kv_cache state
-    // and rollback changes from all llama_kv_cache_find_slot calls
-    void restore(struct llama_kv_cache & cache) {
-        if (do_restore) {
-            cache.head = old_state.head;
-            cache.n = old_state.n;
-
-            if (cache.recurrent) { // recurrent models like Mamba or RWKV can't have a state partially erased
-                llama_kv_cache_seq_rm(cache, -1, -1, -1);
-            } else {
-                for (auto & slot : slot_boundaries) {
-                    llama_kv_cache_seq_rm(cache, -1, slot.first, slot.second);
-                }
-            }
-        }
-    }
-};
-
+#pragma once
+
+#include "llama.h"
+
+#include "ggml-cpp.h"
+
+#include <set>
+#include <vector>
+
+struct llama_kv_cell {
+    llama_pos pos = -1;
+    llama_pos delta = 0;
+    int32_t src = -1; // used by recurrent state models to copy states
+    int32_t tail = -1;
+
+    std::set<llama_seq_id> seq_id;
+
+    bool has_seq_id(const llama_seq_id & id) const {
+        return seq_id.find(id) != seq_id.end();
+    }
+
+    bool is_empty() const {
+        return seq_id.empty();
+    }
+
+    bool is_same_seq(const llama_kv_cell & other) const {
+        return seq_id == other.seq_id;
+    }
+};
+
+// ring-buffer of cached KV data
+struct llama_kv_cache {
+    bool has_shift = false;
+    bool do_defrag = false;
+    bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
+    bool v_trans = true; // the value tensor is transposed
+    bool can_shift = false;
+
+    // Note: The value of head isn't only used to optimize searching
+    // for a free KV slot. llama_decode_internal also uses it, so it
+    // cannot be freely changed after a slot has been allocated.
+    uint32_t head = 0;
+    uint32_t size = 0;
+    uint32_t used = 0; // used cells (i.e. at least one seq_id)
+
+    // computed before each graph build
+    uint32_t n = 0;
+
+    lm_ggml_type type_k = LM_GGML_TYPE_F16;
+    lm_ggml_type type_v = LM_GGML_TYPE_F16;
+
+    std::vector<llama_kv_cell> cells;
+
+    std::vector<struct lm_ggml_tensor *> k_l; // per layer
+    std::vector<struct lm_ggml_tensor *> v_l;
+
+    std::vector<lm_ggml_context_ptr> ctxs;
+    std::vector<lm_ggml_backend_buffer_ptr> bufs;
+
+    size_t total_size() const {
+        size_t size = 0;
+        for (const auto & buf : bufs) {
+            size += lm_ggml_backend_buffer_get_size(buf.get());
+        }
+
+        return size;
+    }
+
+    // TODO: better data structures to reduce the cost of this operation
+    llama_pos max_pos() const {
+        llama_pos max_pos = -1;
+        for (const auto & cell : cells) {
+            max_pos = std::max(max_pos, cell.pos);
+        }
+
+        return max_pos;
+    }
+};
+
+// a structure holds information about the slot found in llama_kv_cache_find_slot
+struct llama_kv_cache_slot_info {
+    std::pair<uint32_t, uint32_t> boundaries; // slot boundaries [begin, end)
+    bool found = false; // the slot was found
+
+    explicit llama_kv_cache_slot_info(bool found_) : found{found_} {}
+    llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {}
+
+    operator bool() const { return found; }
+};
+
+// TODO: maybe not needed
+uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams);
+
+bool llama_kv_cache_init(
+        struct llama_kv_cache & cache,
+        const llama_model & model,
+        const llama_cparams & cparams,
+        lm_ggml_type type_k,
+        lm_ggml_type type_v,
+        uint32_t kv_size,
+        bool offload);
+
+// find an empty slot of size "n_tokens" in the cache
+// updates the cache head
+// returns a structure holding information about the slot found
+// Note: On success, it's important that cache.head points
+// to the first cell of the slot.
+struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
+        struct llama_kv_cache & cache,
+        const struct llama_ubatch & batch);
+
+// find how many cells are currently in use
+uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache);
+
+void llama_kv_cache_clear(struct llama_kv_cache & cache);
+
+bool llama_kv_cache_seq_rm(
+        struct llama_kv_cache & cache,
+        llama_seq_id seq_id,
+        llama_pos p0,
+        llama_pos p1);
+
+void llama_kv_cache_seq_cp(
+        struct llama_kv_cache & cache,
+        llama_seq_id seq_id_src,
+        llama_seq_id seq_id_dst,
+        llama_pos p0,
+        llama_pos p1);
+
+void llama_kv_cache_seq_keep(
+        struct llama_kv_cache & cache,
+        llama_seq_id seq_id);
+
+void llama_kv_cache_seq_add(
+        struct llama_kv_cache & cache,
+        llama_seq_id seq_id,
+        llama_pos p0,
+        llama_pos p1,
+        llama_pos delta);
+
+void llama_kv_cache_seq_div(
+        struct llama_kv_cache & cache,
+        llama_seq_id seq_id,
+        llama_pos p0,
+        llama_pos p1,
+        int d);
+
+llama_pos llama_kv_cache_seq_pos_max(
+        struct llama_kv_cache & cache,
+        llama_seq_id seq_id);
+
+void llama_kv_cache_defrag(struct llama_kv_cache & cache);
+
+int32_t llama_get_kv_cache_token_count(const struct llama_kv_cache & kv);
+
+int32_t llama_get_kv_cache_used_cells(const struct llama_kv_cache & kv);
+
+bool llama_kv_cache_can_shift(const struct llama_kv_cache & kv);
+
+//
+// kv cache view
+//
+
+struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache & kv, int32_t n_seq_max);
+
+void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache & kv);
+
+//
+// kv cache restore
+//
+
+// saves the kv_cache state for future recovery.
+// used to rollback llama_kv_cache_find_slot changes.
+struct llama_kv_slot_restorer {
+    struct llama_kv_cache_state {
+        uint32_t head = 0;
+        uint32_t n = 0;
+    } old_state;
+
+    // for non-recurrent models only
+    // list of slots to restore
+    std::vector<std::pair<uint32_t, uint32_t>> slot_boundaries;
+
+    bool do_restore = false;
+
+    explicit llama_kv_slot_restorer(const struct llama_kv_cache & cache) {
+        old_state.head = cache.head;
+        old_state.n = cache.n;
+    }
+
+    // saves a slot information for future restoration
+    void save(const struct llama_kv_cache_slot_info & slot) {
+        if (slot) {
+            do_restore = true;
+            if (slot.boundaries.first != slot.boundaries.second) {
+                slot_boundaries.push_back(slot.boundaries);
+            }
+        }
+    }
+
+    // must be explicitly called to restore the kv_cache state
+    // and rollback changes from all llama_kv_cache_find_slot calls
+    void restore(struct llama_kv_cache & cache) {
+        if (do_restore) {
+            cache.head = old_state.head;
+            cache.n = old_state.n;
+
+            if (cache.recurrent) { // recurrent models like Mamba or RWKV can't have a state partially erased
+                llama_kv_cache_seq_rm(cache, -1, -1, -1);
+            } else {
+                for (auto & slot : slot_boundaries) {
+                    llama_kv_cache_seq_rm(cache, -1, slot.first, slot.second);
+                }
+            }
+        }
+    }
+};
+
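
The llama_kv_slot_restorer in this header implements a small transactional pattern around llama_kv_cache_find_slot: snapshot head and n up front, record each reserved slot, and undo everything if decoding fails partway. A minimal caller-side sketch of that pattern follows; run_graph and decode_with_rollback are hypothetical stand-ins for the real decode path, not part of this package:

#include "llama-kv-cache.h"

// Hypothetical compute step standing in for the real decode graph (assumption).
bool run_graph(llama_kv_cache & cache, const llama_ubatch & ubatch);

bool decode_with_rollback(llama_kv_cache & cache, const llama_ubatch & ubatch) {
    llama_kv_slot_restorer restorer(cache);        // snapshot head/n

    const auto slot = llama_kv_cache_find_slot(cache, ubatch);
    if (!slot) {
        return false;                              // nothing reserved, nothing to undo
    }
    restorer.save(slot);                           // remember [begin, end) for rollback

    if (!run_graph(cache, ubatch)) {
        restorer.restore(cache);                   // erase reserved cells, reset head/n
        return false;
    }

    return true;                                   // success: keep the cache changes
}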
package/cpp/llama-mmap.cpp
CHANGED
@@ -7,6 +7,7 @@
 #include <cstring>
 #include <climits>
 #include <stdexcept>
+#include <cerrno>
 
 #ifdef __has_include
 #if __has_include(<unistd.h>)
@@ -35,7 +36,7 @@
 
 // TODO: consider moving to llama-impl.h if needed in more places
 #if defined(_WIN32)
-std::string llama_format_win_err(DWORD err) {
+static std::string llama_format_win_err(DWORD err) {
     LPSTR buf;
     size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
                                  NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
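
Two small fixes here: the file now includes <cerrno> explicitly instead of relying on another header to pull in errno, and llama_format_win_err gains internal linkage. Marking a namespace-scope function static keeps its symbol private to its translation unit, so it can no longer collide at link time with another definition of the same name elsewhere in the build. A minimal sketch of that effect; the file names and the describe helper are illustrative, not from the package:

// a.cpp — internal linkage: this definition is private to a.cpp
#include <string>
static std::string describe(int err) { return "a: " + std::to_string(err); }

// b.cpp — same name, no linker conflict: a separate, file-local symbol
#include <string>
static std::string describe(int err) { return "b: " + std::to_string(err); }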
package/cpp/llama-mmap.h
CHANGED
@@ -1,67 +1,67 @@
-#pragma once
-
-#include <memory>
-#include <vector>
-
-struct llama_file;
-struct llama_mmap;
-struct llama_mlock;
-
-using llama_files = std::vector<std::unique_ptr<llama_file>>;
-using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
-using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
-
-struct llama_file {
-    llama_file(const char * fname, const char * mode);
-    ~llama_file();
-
-    size_t tell() const;
-    size_t size() const;
-
-    int file_id() const; // fileno overload
-
-    void seek(size_t offset, int whence) const;
-
-    void read_raw(void * ptr, size_t len) const;
-    uint32_t read_u32() const;
-
-    void write_raw(const void * ptr, size_t len) const;
-    void write_u32(uint32_t val) const;
-
-private:
-    struct impl;
-    std::unique_ptr<impl> pimpl;
-};
-
-struct llama_mmap {
-    llama_mmap(const llama_mmap &) = delete;
-    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false);
-    ~llama_mmap();
-
-    size_t size() const;
-    void * addr() const;
-
-    void unmap_fragment(size_t first, size_t last);
-
-    static const bool SUPPORTED;
-
-private:
-    struct impl;
-    std::unique_ptr<impl> pimpl;
-};
-
-struct llama_mlock {
-    llama_mlock();
-    ~llama_mlock();
-
-    void init(void * ptr);
-    void grow_to(size_t target_size);
-
-    static const bool SUPPORTED;
-
-private:
-    struct impl;
-    std::unique_ptr<impl> pimpl;
-};
-
-size_t llama_path_max();
+#pragma once
+
+#include <memory>
+#include <vector>
+
+struct llama_file;
+struct llama_mmap;
+struct llama_mlock;
+
+using llama_files = std::vector<std::unique_ptr<llama_file>>;
+using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
+using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
+
+struct llama_file {
+    llama_file(const char * fname, const char * mode);
+    ~llama_file();
+
+    size_t tell() const;
+    size_t size() const;
+
+    int file_id() const; // fileno overload
+
+    void seek(size_t offset, int whence) const;
+
+    void read_raw(void * ptr, size_t len) const;
+    uint32_t read_u32() const;
+
+    void write_raw(const void * ptr, size_t len) const;
+    void write_u32(uint32_t val) const;
+
+private:
+    struct impl;
+    std::unique_ptr<impl> pimpl;
+};
+
+struct llama_mmap {
+    llama_mmap(const llama_mmap &) = delete;
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false);
+    ~llama_mmap();
+
+    size_t size() const;
+    void * addr() const;
+
+    void unmap_fragment(size_t first, size_t last);
+
+    static const bool SUPPORTED;
+
+private:
+    struct impl;
+    std::unique_ptr<impl> pimpl;
+};
+
+struct llama_mlock {
+    llama_mlock();
+    ~llama_mlock();
+
+    void init(void * ptr);
+    void grow_to(size_t target_size);
+
+    static const bool SUPPORTED;
+
+private:
+    struct impl;
+    std::unique_ptr<impl> pimpl;
+};
+
+size_t llama_path_max();
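
All three types in this header follow the pimpl idiom: each public struct holds only a std::unique_ptr to a forward-declared impl, so platform-specific members (file descriptors on POSIX, HANDLEs on Windows) live entirely in llama-mmap.cpp and never leak into the header. A minimal self-contained sketch of the same pattern; the widget names are illustrative, not from the package:

// widget.h — public interface only; impl stays an incomplete type here
#include <memory>

struct widget {
    widget();
    ~widget();            // declared here, defined where impl is complete
    int value() const;

private:
    struct impl;          // forward declaration; layout hidden from callers
    std::unique_ptr<impl> pimpl;
};

// widget.cpp — the only place that needs impl's full definition
#include "widget.h"

struct widget::impl {
    int v = 42;           // private state; changing it never recompiles callers
};

widget::widget() : pimpl(std::make_unique<impl>()) {}
widget::~widget() = default; // impl is complete here, so unique_ptr can delete it
int widget::value() const { return pimpl->v; }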