llama_cpp 0.10.1 → 0.10.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 9bd5dbea8695fdb41ea6d97e372c2cea452ee8ed070e26bd558a720d6c24fe27
- data.tar.gz: 8d7bfd02445df81644eebb3a6db49bb1ddf241a344fef76a0b949f1c12d0639d
+ metadata.gz: b4e94e20f142572fb46cff141e109025e1c5b91b9cd6cabfbaeac163a920bb82
+ data.tar.gz: 10fcd2922c057a0c960cbde83a0fa22a56eae6e21a73e20931cfe082108dec26
  SHA512:
- metadata.gz: 116423f4581f605ee379bcd690299e152087c03665be6171bc137205f0824be5d5e0ce6d3c0b548fc6d28193c601679c3cc27a101b6f5b58968c69388a70cbfc
- data.tar.gz: fd162b1c4e26732573d32ba7439fc44dd2ff09a9024c64ff5adce997e154a934d9556f0ff3cfd6b83dd7dcbfb8366cc6ce394b8324f39ef349d4cb5834ea43f1
+ metadata.gz: 570f5f257d947bad8489fcafe6c7cc0dc342d593132111d3479c0202e0998e444ca75c2140883dc8cf6a22f0f4f74fecd264a577cee6aabde4c168f5003b4c98
+ data.tar.gz: d105e41aac3bb39b2ee53c43eb320272ffa5290caa7e14e2f08bebbade94417aed065e01d23b8def9cce08fd6789576d33ba4082750647c0bbf9ff4c776db9ac
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
+ ## [[0.10.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.10.1...v0.10.2)] - 2023-12-23
+
+ - Bump bundled llama.cpp from b1641 to b1686.
+ - Add `LLAMA_FILE_MAGIC_GGLA` constant.
+ - Add `n_batch` method to `Context`.
+
  ## [[0.10.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.10.0...v0.10.1)] - 2023-12-16
 
  - Bump bundled llama.cpp from b1620 to b1641.
@@ -1949,6 +1949,7 @@ public:
  rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
+ rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
  rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
  rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
  rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
@@ -2201,7 +2202,16 @@ private:
  rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
  return Qnil;
  }
- return INT2NUM(llama_n_ctx(ptr->ctx));
+ return UINT2NUM(llama_n_ctx(ptr->ctx));
+ }
+
+ static VALUE _llama_context_n_batch(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return UINT2NUM(llama_n_batch(ptr->ctx));
  }
 
  static VALUE _llama_context_get_timings(VALUE self) {
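The new `n_batch` binding above follows the same pattern as the existing getters: a zero-arity C function reads a value from the wrapped `llama_context` and converts it to a Ruby integer with `UINT2NUM` (the `n_ctx` getter switches from `INT2NUM` at the same time, presumably because llama.cpp reports these sizes as unsigned values). Below is a minimal, self-contained sketch of that Ruby C-extension pattern; the `Demo` class and the hard-coded return value are illustrative stand-ins, not part of the gem.

```cpp
#include <ruby.h>

// Hypothetical stand-in for a getter such as llama_n_batch(ctx); the real
// binding reads the value from the wrapped llama_context instead.
static VALUE demo_n_batch(VALUE self) {
    (void)self;
    return UINT2NUM(512u); // convert an unsigned C value to a Ruby Integer
}

extern "C" void Init_demo(void) {
    // register Demo#n_batch as a zero-arity method, mirroring how
    // rb_cLLaMAContext gains its n_batch method above
    VALUE cDemo = rb_define_class("Demo", rb_cObject);
    rb_define_method(cDemo, "n_batch", RUBY_METHOD_FUNC(demo_n_batch), 0);
}
```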
@@ -3146,6 +3156,11 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE));
 
  std::stringstream ss_magic;
+ ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGLA;
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGLA", rb_str_new2(ss_magic.str().c_str()));
+
+ ss_magic.str("");
+ ss_magic.clear(std::stringstream::goodbit);
  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSN;
  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSN", rb_str_new2(ss_magic.str().c_str()));
 
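The new `LLAMA_FILE_MAGIC_GGLA` constant is exposed the same way as the existing `LLAMA_FILE_MAGIC_GGSN`: the 32-bit magic is formatted as a `0x`-prefixed hexadecimal string with a `std::stringstream`, which is then reset before the next constant is written. A minimal sketch of that formatting step is below; the literal used here is only an assumed stand-in for the actual `LLAMA_FILE_MAGIC_GGLA` value defined in llama.h.

```cpp
#include <cstdint>
#include <iostream>
#include <sstream>

int main() {
    const uint32_t magic = 0x67676c61u; // assumed stand-in for LLAMA_FILE_MAGIC_GGLA

    std::stringstream ss;
    ss << std::showbase << std::hex << magic;
    std::cout << ss.str() << std::endl; // prints "0x67676c61"

    // reset the stream before reusing it for the next constant, as above
    ss.str("");
    ss.clear(std::stringstream::goodbit);
    return 0;
}
```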
@@ -449,11 +449,10 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd
  if (update_backend) {
  view->backend = view->view_src->backend;
  }
- view->buffer = view->view_src->buffer;
+ // views are initialized in the alloc buffer rather than the view_src buffer
+ view->buffer = alloc->buffer;
  view->data = (char *)view->view_src->data + view->view_offs;
 
- // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
- // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
  assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
 
  if (!alloc->measure) {
@@ -736,6 +735,10 @@ void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
  }
 
  void ggml_allocr_free(ggml_allocr_t alloc) {
+ if (alloc == NULL) {
+ return;
+ }
+
  ggml_gallocr_free(alloc->galloc);
  ggml_tallocr_free(alloc->talloc);
  free(alloc);
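With this guard, `ggml_allocr_free` becomes a no-op when given `NULL`, mirroring `free(NULL)` and simplifying cleanup paths where the allocator was never created. A minimal sketch, assuming `ggml-alloc.h` from the bundled llama.cpp sources is on the include path:

```cpp
#include <cstddef>
#include "ggml-alloc.h"

int main() {
    ggml_allocr_t alloc = NULL; // e.g. the allocation step was skipped or failed

    // previously this would have dereferenced a NULL pointer; it now returns immediately
    ggml_allocr_free(alloc);
    return 0;
}
```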
@@ -775,7 +778,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
  }
 
  if (nbytes == 0) {
- fprintf(stderr, "%s: no tensors to allocate\n", __func__);
+ // all the tensors in the context are already allocated
  return NULL;
  }
 
@@ -789,6 +792,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
  } else {
  ggml_backend_view_init(buffer, t);
  }
+ } else {
+ if (t->view_src != NULL) {
+ // view of a pre-allocated tensor
+ ggml_backend_view_init(buffer, t);
+ }
  }
  }
 
@@ -20,6 +20,9 @@ extern "C" {
  size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
  size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
  bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+ // check if tensor data is in host memory
+ // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
+ bool (*is_host) (ggml_backend_buffer_type_t buft);
  };
 
  struct ggml_backend_buffer_type {
@@ -31,15 +34,16 @@ extern "C" {
  typedef void * ggml_backend_buffer_context_t;
 
  struct ggml_backend_buffer_i {
- void (*free_buffer)(ggml_backend_buffer_t buffer);
+ void (*free_buffer) (ggml_backend_buffer_t buffer);
  //void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
- void * (*get_base) (ggml_backend_buffer_t buffer);
- void (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
- void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
- void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+ void * (*get_base) (ggml_backend_buffer_t buffer);
+ void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+ void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
  // (optional) copy tensor between different buffer-type, allow for single-copy tranfers
- void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
- void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+ void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+ void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+ void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
  };
 
  struct ggml_backend_buffer {
@@ -78,7 +82,7 @@ extern "C" {
  void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
  void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
 
- void (*synchronize) (ggml_backend_t backend);
+ void (*synchronize)(ggml_backend_t backend);
 
  // compute graph with a plan
  ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
@@ -35,6 +35,13 @@ bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_ba
  return buft->iface.supports_backend(buft, backend);
  }
 
+ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
+ if (buft->iface.is_host) {
+ return buft->iface.is_host(buft);
+ }
+ return false;
+ }
+
  // backend buffer
 
  ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -94,6 +101,14 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
  return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
  }
 
+ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ buffer->iface.clear(buffer, value);
+ }
+
+ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
+ return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
+ }
+
  ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
  return buffer->buft;
  }
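Together with the declarations added to the public header later in this diff, these two helpers let callers fill an entire buffer with a byte value and check whether its memory is directly addressable by the host. A hedged usage sketch, assuming `ggml-backend.h` is on the include path and that `ggml_backend_buft_alloc_buffer` is available as in upstream ggml:

```cpp
#include "ggml-backend.h"

int main() {
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 1024);

    if (ggml_backend_buffer_is_host(buf)) {
        // host-visible memory (true for the CPU buffer type), so a plain
        // byte-fill of the whole buffer is meaningful
        ggml_backend_buffer_clear(buf, 0); // zero-initialize the buffer
    }

    ggml_backend_buffer_free(buf);
    return 0;
}
```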
@@ -378,7 +393,6 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
 
  static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  free(buffer->context);
- GGML_UNUSED(buffer);
  }
 
  static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -411,6 +425,10 @@ static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer,
  GGML_UNUSED(buffer);
  }
 
+ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ memset(buffer->context, value, buffer->size);
+ }
+
  static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
  /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
  /* .get_base = */ ggml_backend_cpu_buffer_get_base,
@@ -419,6 +437,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
  /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
  /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
  /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+ /* .clear = */ ggml_backend_cpu_buffer_clear,
  };
 
  // for buffers from ptr, free is not called
@@ -430,6 +449,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
  /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
  /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
  /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+ /* .clear = */ ggml_backend_cpu_buffer_clear,
  };
 
  static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
@@ -455,20 +475,70 @@ static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_ty
  GGML_UNUSED(buft);
  }
 
+ static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+ return true;
+
+ GGML_UNUSED(buft);
+ }
+
  ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
- static struct ggml_backend_buffer_type ggml_backend_buffer_type_cpu = {
+ static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
  /* .iface = */ {
  /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
  /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
  /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+ /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
  },
  /* .context = */ NULL,
  };
 
- return &ggml_backend_buffer_type_cpu;
+ return &ggml_backend_cpu_buffer_type;
  }
 
+ #ifdef GGML_USE_CPU_HBM
+
+ // buffer type HBM
+
+ #include <hbwmalloc.h>
+
+ static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ hbw_free(buffer->context);
+ }
+
+ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+ //void * ptr = hbw_malloc(size);
+ void * ptr;
+ int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
+ if (result != 0) {
+ fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
+ return NULL;
+ }
+
+ // FIXME: this is a hack to avoid having to implement a new buffer type
+ ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+ buffer->buft = buft;
+ buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
+
+ return buffer;
+ }
+
+ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
+ static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
+ /* .iface = */ {
+ /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+ /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+ /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
+ },
+ /* .context = */ NULL,
+ };
+
+ return &ggml_backend_cpu_buffer_type_hbm;
+ }
+ #endif
+
  struct ggml_backend_cpu_context {
  int n_threads;
  void * work_data;
@@ -505,7 +575,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
  struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
 
  cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
- cpu_plan->cgraph = *cgraph;
+ cpu_plan->cgraph = *cgraph; // FIXME: deep copy
 
  if (cpu_plan->cplan.work_size > 0) {
  cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
@@ -1180,7 +1250,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
  // utils
  void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
  GGML_ASSERT(tensor->buffer == NULL);
- GGML_ASSERT(tensor->data == NULL);
+ //GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized
  GGML_ASSERT(tensor->view_src != NULL);
  GGML_ASSERT(tensor->view_src->buffer != NULL);
  GGML_ASSERT(tensor->view_src->data != NULL);
@@ -21,6 +21,7 @@ extern "C" {
  GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
  GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
  GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
+ GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
 
  // buffer
  GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
@@ -29,6 +30,8 @@ extern "C" {
  GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
  GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
  GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
+ GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
  GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);
 
  //
@@ -76,6 +79,10 @@ extern "C" {
 
  GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
 
+ #ifdef GGML_USE_CPU_HBM
+ GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
+ #endif
+
  //
  // Backend registry
  //
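When the bundled ggml is compiled with `GGML_USE_CPU_HBM`, the new `ggml_backend_cpu_hbm_buffer_type` allocates CPU buffers from high-bandwidth memory through the memkind `hbwmalloc` API instead of the regular allocator. A hedged sketch of picking it at compile time and falling back to the standard CPU buffer type otherwise:

```cpp
#include <cstdio>
#include "ggml-backend.h"

static ggml_backend_buffer_type_t pick_cpu_buffer_type(void) {
#ifdef GGML_USE_CPU_HBM
    return ggml_backend_cpu_hbm_buffer_type(); // high-bandwidth host memory (hbwmalloc)
#else
    return ggml_backend_cpu_buffer_type();     // regular host memory
#endif
}

int main() {
    ggml_backend_buffer_type_t buft = pick_cpu_buffer_type();
    // both variants are host buffer types, so is_host reports true
    std::printf("is_host: %d\n", ggml_backend_buft_is_host(buft));
    return 0;
}
```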