llama_cpp 0.3.6 → 0.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +8 -0
- data/ext/llama_cpp/src/ggml-alloc.c +44 -6
- data/ext/llama_cpp/src/ggml-alloc.h +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +1398 -702
- data/ext/llama_cpp/src/ggml-cuda.h +19 -23
- data/ext/llama_cpp/src/ggml-metal.h +6 -3
- data/ext/llama_cpp/src/ggml-metal.m +112 -146
- data/ext/llama_cpp/src/ggml-metal.metal +471 -498
- data/ext/llama_cpp/src/ggml.c +396 -150
- data/ext/llama_cpp/src/ggml.h +113 -32
- data/ext/llama_cpp/src/llama-util.h +51 -9
- data/ext/llama_cpp/src/llama.cpp +390 -210
- data/ext/llama_cpp/src/llama.h +20 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +1 -0
- metadata +2 -2
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8a6623a24970936369944231171226dda1ce579bf29fc3711f8923c8d2d22cba
+  data.tar.gz: dbff8f38ea54195b05fc0acbaf8fceb7fd6bfdc329100a18665ef2cba2fd5d81
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 710ab86cfea7b5f91a386bdf87872c1d19ba49057bc02aa11a4f0198aee404a2d5b931965fdeba40aa1353269f95a451090e261305931e31a182a078827ace80
+  data.tar.gz: ec4d956b5ab5ad665a0e99489b81b364b79ed39e74146629e4140240b5e176f4ef9dbf3d1c11acdb4098398114fbf055a2ad4f8251ed98ec42471a478f6dcaa2
data/CHANGELOG.md
CHANGED

@@ -1,3 +1,14 @@
+## [[0.3.8](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.7...v0.3.8)] - 2023-08-19
+
+- Bump bundled llama.cpp from master-9ca4abe to master-097e121.
+- Add `type` method to Model.
+- Revert pull request #2592 in llama.cpp.
+  It seems that PWIN32_MEMORY_RANGE_ENTRY and WIN32_MEMORY_RANGE_ENTRY do not exist in mingw.
+
+## [[0.3.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.6...v0.3.7)] - 2023-08-12
+
+- Bump bundled llama.cpp from master-468ea24 to master-9ca4abe.
+
 ## [[0.3.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.5...v0.3.6)] - 2023-08-04
 
 - Bump bundled llama.cpp from master-1a94186 to master-468ea24.
data/ext/llama_cpp/extconf.rb
CHANGED

@@ -52,8 +52,8 @@ end
 if with_config('metal')
   $CFLAGS << ' -DGGML_USE_METAL -DGGML_METAL_NDEBUG'
   $CXXFLAGS << ' -DGGML_USE_METAL'
-  $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
-  $objs = %w[ggml.o llama.o llama_cpp.o ggml-metal.o]
+  $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
+  $objs = %w[ggml.o ggml-alloc.o llama.o llama_cpp.o ggml-metal.o]
   $objs << 'k_quants.o' unless with_config('no_k_quants')
 end
 
data/ext/llama_cpp/llama_cpp.cpp
CHANGED

@@ -814,6 +814,7 @@ public:
     rb_define_method(rb_cLLaMAModel, "vocab", RUBY_METHOD_FUNC(_llama_model_get_vocab_from_model), -1);
     rb_define_method(rb_cLLaMAModel, "token_to_str", RUBY_METHOD_FUNC(_llama_model_token_to_str_with_model), 1);
     rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize_with_model), -1);
+    rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_model_type), 0);
   }
 
 private:
@@ -1061,6 +1062,13 @@ private:
     RB_GC_GUARD(text_);
     return ret;
   }
+
+  static VALUE _llama_model_get_model_type(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    char buf[128];
+    ::llama_model_type(ptr->model, buf, sizeof(buf));
+    return rb_str_new_cstr(buf);
+  }
 };
 
 const rb_data_type_t RbLLaMAModel::llama_model_type = {
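For reference, a minimal sketch in C of what the new `type` binding (exposed to Ruby as `LLaMACpp::Model#type`) does under the hood: it forwards to `llama_model_type()` from the bundled llama.h, which writes a short human-readable model description into a caller-supplied buffer. The loading boilerplate below is an assumption based on the llama.cpp API bundled at master-097e121, not code from this gem:

```c
// Hedged sketch; assumes the llama.h API bundled at master-097e121.
#include <stdio.h>
#include "llama.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s MODEL_PATH\n", argv[0]);
        return 1;
    }
    llama_backend_init(false); // false: skip NUMA initialization

    struct llama_context_params params = llama_context_default_params();
    struct llama_model * model = llama_load_model_from_file(argv[1], params);
    if (model == NULL) {
        llama_backend_free();
        return 1;
    }

    // The same call the binding makes in _llama_model_get_model_type.
    char buf[128];
    llama_model_type(model, buf, sizeof(buf));
    printf("%s\n", buf);

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```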
data/ext/llama_cpp/src/ggml-alloc.c
CHANGED

@@ -67,6 +67,8 @@ struct ggml_allocr {
     struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
     size_t max_size;
     bool measure;
+    int parse_seq[GGML_MAX_NODES];
+    bool has_parse_seq;
 
 #ifdef GGML_ALLOCATOR_DEBUG
     struct ggml_tensor * allocated_tensors[1024];
@@ -111,10 +113,10 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 
     size_t max_avail = 0;
 
-    // find the best fitting free block
+    // find the best fitting free block besides the last block
     int best_fit_block = -1;
     size_t best_fit_size = SIZE_MAX;
-    for (int i = 0; i < alloc->n_free_blocks; i++) {
+    for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
         struct free_block * block = &alloc->free_blocks[i];
         max_avail = MAX(max_avail, block->size);
         if (block->size >= size && block->size <= best_fit_size) {
@@ -126,10 +128,17 @@
     AT_PRINTF("block %d\n", best_fit_block);
 
     if (best_fit_block == -1) {
-        fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
-                __func__, size, max_avail);
-        GGML_ASSERT(!"not enough space in the buffer");
+        // the last block is our last resort
+        struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+        if (block->size >= size) {
+            best_fit_block = alloc->n_free_blocks - 1;
+            max_avail = MAX(max_avail, block->size);
+        } else {
+            fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
+                    __func__, size, max_avail);
+            GGML_ASSERT(!"not enough space in the buffer");
             return;
+        }
     }
     struct free_block * block = &alloc->free_blocks[best_fit_block];
     void * addr = block->addr;
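Taken together, the two hunks above change the allocation policy: the best-fit scan now skips the final free block, which is consumed only as a last resort, so the large block at the end of the buffer is preserved for as long as possible. A standalone toy of the same selection policy, using a hypothetical simplified struct rather than ggml's real `free_block`:

```c
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

// Hypothetical stand-in for ggml's free_block (addr field omitted).
struct free_block { size_t size; };

// Pick a block for an allocation of `size` bytes: best fit among
// blocks[0..n-2]; blocks[n-1] is used only as a last resort, mirroring
// the new ggml-alloc behaviour.
static int pick_block(const struct free_block * blocks, int n, size_t size) {
    int best = -1;
    size_t best_size = SIZE_MAX;
    for (int i = 0; i < n - 1; i++) {
        if (blocks[i].size >= size && blocks[i].size <= best_size) {
            best = i;
            best_size = blocks[i].size;
        }
    }
    if (best == -1 && blocks[n - 1].size >= size) {
        best = n - 1; // the last block is our last resort
    }
    return best; // -1 means "not enough space in the buffer"
}

int main(void) {
    struct free_block blocks[] = {{64}, {256}, {1 << 20}};
    printf("%d\n", pick_block(blocks, 3, 128));              // 1: best fit
    printf("%d\n", pick_block(blocks, 3, 4096));             // 2: last resort
    printf("%d\n", pick_block(blocks, 3, (size_t)1 << 21));  // -1: no space
    return 0;
}
```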
@@ -229,6 +238,17 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
     alloc->n_free_blocks++;
 }
 
+void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
+    int pos = 0;
+    for (int i = 0; i < n; i++) {
+        if (list[i] != -1) {
+            alloc->parse_seq[pos] = list[i];
+            pos++;
+        }
+    }
+    alloc->has_parse_seq = true;
+}
+
 void ggml_allocr_reset(struct ggml_allocr * alloc) {
     alloc->n_free_blocks = 1;
     size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
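Note that the new setter compacts its input while copying: `-1` entries are dropped, so callers can pad a schedule with `-1` sentinels. A standalone toy of the same compaction loop (hypothetical names, not the ggml structs):

```c
#include <stdio.h>

// Toy version of the copy loop in ggml_allocr_set_parse_seq: copy list[]
// into seq[], skipping -1 sentinel entries.
static int compact_parse_seq(int * seq, const int * list, int n) {
    int pos = 0;
    for (int i = 0; i < n; i++) {
        if (list[i] != -1) {
            seq[pos] = list[i];
            pos++;
        }
    }
    return pos; // number of entries actually stored
}

int main(void) {
    int seq[16];
    int list[] = {2, -1, 0, 1, -1, 3}; // -1 marks entries to skip
    int n = compact_parse_seq(seq, list, 6);
    for (int i = 0; i < n; i++) {
        printf("%d ", seq[i]); // prints: 2 0 1 3
    }
    printf("\n");
    return 0;
}
```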
@@ -248,6 +268,8 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
     /*.hash_table    = */ {{0}},
     /*.max_size      = */ 0,
     /*.measure       = */ false,
+    /*.parse_seq     = */ {0},
+    /*.has_parse_seq = */ false,
 #ifdef GGML_ALLOCATOR_DEBUG
     /*.allocated_tensors = */ = {0},
 #endif
@@ -275,6 +297,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
     /*.hash_table    = */ {{0}},
     /*.max_size      = */ 0,
     /*.measure       = */ true,
+    /*.parse_seq     = */ {0},
+    /*.has_parse_seq = */ false,
 #ifdef GGML_ALLOCATOR_DEBUG
     /*.allocated_tensors = */ = {0},
 #endif
@@ -394,6 +418,14 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
         if (parent == NULL) {
             break;
         }
+
+        // if the node's data is external, then we cannot re-use it
+        if ((char *) parent->data < (char *) alloc->data ||
+            (char *) parent->data >= ((char *) alloc->data + alloc->size)) {
+            AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
+            continue;
+        }
+
         struct hash_node * p_hn = hash_get(ht, parent);
         if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
             if (ggml_is_view(parent)) {
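The added guard treats a parent tensor as external when its data pointer lies outside the allocator's own buffer, i.e. outside `[alloc->data, alloc->data + alloc->size)`; such memory cannot be reused for the current node. A toy version of that range test:

```c
#include <stdbool.h>
#include <stdio.h>
#include <stddef.h>

// Toy version of the new check in allocate_node(): data is "external"
// when it falls outside the half-open range [base, base + size).
static bool is_external(const void * p, const void * base, size_t size) {
    return (const char *) p < (const char *) base ||
           (const char *) p >= (const char *) base + size;
}

int main(void) {
    char buffer[256];
    printf("%d\n", is_external(buffer + 16, buffer, sizeof(buffer)));  // 0: inside
    printf("%d\n", is_external(buffer + 256, buffer, sizeof(buffer))); // 1: past the end
    return 0;
}
```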
@@ -465,7 +497,13 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
             allocate_node(alloc, input);
         }
     }
-    for (int i = 0; i < gf->n_nodes; i++) {
+    for (int ind = 0; ind < gf->n_nodes; ind++) {
+        int i;
+        if (alloc->has_parse_seq) {
+            i = alloc->parse_seq[ind];
+        } else {
+            i = ind;
+        }
         struct ggml_tensor * node = gf->nodes[i];
 
         // allocate parents (leafs)
data/ext/llama_cpp/src/ggml-alloc.h
CHANGED

@@ -10,6 +10,10 @@ extern "C" {
 GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
 GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
 
+// tell the allocator to parse nodes following the order described in the list
+// you should call this if your graph are optimized to execute out-of-order
+GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
+
 GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
 GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
 GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
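Finally, a hedged usage sketch for the new entry point, combined with the measuring allocator declared just above it. The node order below is invented for illustration (a real caller would derive it from its backend's execution schedule), and `ggml_allocr_alloc_graph` is assumed from the same header:

```c
#include "ggml.h"
#include "ggml-alloc.h"

// Sketch: size a graph with a measure allocator while walking its nodes
// in a custom order.
static size_t measure_with_custom_order(struct ggml_cgraph * gf) {
    struct ggml_allocr * alloc = ggml_allocr_new_measure(32 /* alignment */);

    // -1 entries are dropped by ggml_allocr_set_parse_seq (see ggml-alloc.c).
    int order[3] = {1, -1, 0}; // hypothetical: visit node 1 before node 0
    ggml_allocr_set_parse_seq(alloc, order, 3);

    size_t needed = ggml_allocr_alloc_graph(alloc, gf); // required buffer size
    ggml_allocr_free(alloc);
    return needed;
}
```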