llama_cpp 0.3.0 → 0.3.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +74 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +48 -17
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +352 -175
- data/ext/llama_cpp/src/llama.cpp +127 -222
- data/ext/llama_cpp/src/llama.h +16 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -364,96 +364,14 @@ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml
     return size / ggml_blck_size(type);
 }
 
-struct llama_load_tensor_shard {
-    std::vector<uint32_t> ne;
-    size_t size;
-    enum ggml_type type;
-    size_t file_idx;
-    size_t file_off;
-
-    void calc_size() {
-        size = llama_calc_tensor_size(ne, type);
-    }
-};
-
-enum llama_split_type {
-    SPLIT_NONE,
-    SPLIT_BY_COLUMNS,
-    SPLIT_BY_ROWS
-};
-
 struct llama_load_tensor {
-    std::vector<llama_load_tensor_shard> shards;
-
     std::string name;
     enum ggml_type type = GGML_TYPE_F32;
-    llama_split_type split_type = SPLIT_NONE;
     std::vector<uint32_t> ne;
+    size_t file_off;
     size_t size;
     struct ggml_tensor * ggml_tensor = NULL;
     uint8_t * data;
-
-    llama_load_tensor(const std::string & name) : name(name) {}
-
-    void calc_all() {
-        calc_type();
-        calc_split_type();
-        calc_ne();
-        calc_size();
-    }
-
-    void calc_type() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.type != first_shard.type) {
-                throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
-            }
-        }
-        type = first_shard.type;
-    }
-
-    void calc_split_type() {
-        if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
-            shards.size() == 1) { // only one file?
-            split_type = SPLIT_NONE;
-        } else if (name.find("tok_embeddings.") == 0 ||
-            name.find(".attention.wo.weight") != std::string::npos ||
-            name.find(".feed_forward.w2.weight") != std::string::npos) {
-            split_type = SPLIT_BY_COLUMNS;
-        } else {
-            split_type = SPLIT_BY_ROWS;
-        }
-    }
-
-    void calc_ne() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.ne != first_shard.ne) {
-                throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
-                    name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
-            }
-        }
-        ne = first_shard.ne;
-        LLAMA_ASSERT(shards.size() <= UINT32_MAX);
-        uint32_t n_shards = (uint32_t) shards.size();
-        switch (split_type) {
-            case SPLIT_NONE:
-                ne = first_shard.ne;
-                break;
-            case SPLIT_BY_COLUMNS:
-                ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
-                      first_shard.ne[1]};
-                break;
-            case SPLIT_BY_ROWS:
-                ne = {first_shard.ne[0],
-                      checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
-                break;
-        }
-    }
-
-    void calc_size() {
-        size = llama_calc_tensor_size(ne, type);
-    }
 };
 
 struct llama_load_tensors_map {
@@ -476,13 +394,13 @@ struct llama_file_loader {
     llama_hparams hparams;
     llama_vocab vocab;
 
-    llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
+    llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
         : file(fname, "rb") {
         fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
         read_magic();
         read_hparams();
         read_vocab();
-        read_tensor_metadata(file_idx, tensors_map);
+        read_tensor_metadata(tensors_map);
     }
     void read_magic() {
         uint32_t magic = file.read_u32();
@@ -539,19 +457,19 @@ struct llama_file_loader {
             tok_score.score = score;
         }
     }
-    void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) {
+    void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
         while (file.tell() < file.size) {
-            llama_load_tensor_shard shard;
+            llama_load_tensor tensor;
             uint32_t n_dims = file.read_u32();
             uint32_t name_len = file.read_u32();
-            shard.type = (enum ggml_type) file.read_u32();
-            shard.ne.resize(n_dims);
-            file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
+            tensor.type = (enum ggml_type) file.read_u32();
+            tensor.ne.resize(n_dims);
+            file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims);
             std::string name = file.read_string(name_len);
             if (n_dims < 1 || n_dims > 2) {
                 throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
             }
-            switch (shard.type) {
+            switch (tensor.type) {
             case GGML_TYPE_F32:
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
@@ -566,30 +484,20 @@ struct llama_file_loader {
             case GGML_TYPE_Q6_K:
                 break;
             default: {
-                throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
+                throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type));
             }
             }
 
-            if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
-                // skip to the next multiple of 32 bytes
-                file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
-            }
-            shard.file_idx = file_idx;
-            shard.file_off = file.tell();
+            // skip to the next multiple of 32 bytes
+            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
 
-            shard.calc_size();
-            file.seek(shard.size, SEEK_CUR);
+            tensor.file_off = file.tell();
+            tensor.name = name;
+            tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type);
+            file.seek(tensor.size, SEEK_CUR);
 
-            auto it = tensors_map.name_to_idx.find(name);
-            size_t idx;
-            if (it != tensors_map.name_to_idx.end()) {
-                idx = it->second;
-            } else {
-                tensors_map.tensors.emplace_back(name);
-                idx = tensors_map.tensors.size() - 1;
-                tensors_map.name_to_idx.emplace(name, idx);
-            }
-            tensors_map.tensors.at(idx).shards.push_back(shard);
+            tensors_map.tensors.push_back(tensor);
+            tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
         }
     }
 };
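A note on the alignment seek kept above: `file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR)` advances the read cursor to the next 32-byte boundary. A minimal standalone sketch of that arithmetic (illustrative, not part of the gem):

    #include <cassert>
    #include <cstddef>

    int main() {
        // (-off) & 31 yields the padding needed to reach the next multiple of 32
        // (0 when already aligned); this relies on two's-complement negation.
        for (std::ptrdiff_t off : {0, 1, 31, 32, 33, 100}) {
            std::ptrdiff_t pad = -off & 31;
            assert(pad >= 0 && pad < 32);
            assert((off + pad) % 32 == 0);
        }
        return 0;
    }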
@@ -659,56 +567,19 @@ struct llama_file_saver {
 };
 
 struct llama_model_loader {
-    std::vector<std::unique_ptr<llama_file_loader>> file_loaders;
+    std::unique_ptr<llama_file_loader> file_loader;
     llama_load_tensors_map tensors_map;
     bool use_mmap;
     size_t num_ggml_tensors_created = 0;
     struct ggml_context * ggml_ctx = NULL;
     std::unique_ptr<llama_mmap> mapping;
 
-    llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
-        auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
-        file_loaders.emplace_back(first_file);
-        uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
-        for (uint32_t i = 1; i < n_parts; i++) {
-            std::string fname = fname_base + "." + std::to_string(i);
-            auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
-            file_loaders.emplace_back(ith_file);
-            if (ith_file->hparams != first_file->hparams) {
-                throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
-            }
-        }
+    llama_model_loader(const std::string & fname_base, bool use_mmap) {
+        file_loader = std::unique_ptr<llama_file_loader>(new llama_file_loader(fname_base.c_str(), tensors_map));
         if (!llama_mmap::SUPPORTED) {
             use_mmap = false;
         }
-        if (use_mmap && alignment_prevents_mmap()) {
-            fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
-            use_mmap = false;
-        }
         this->use_mmap = use_mmap;
-        for (llama_load_tensor & lt : tensors_map.tensors) {
-            lt.calc_all();
-        }
-    }
-
-    bool alignment_prevents_mmap() {
-        for (const llama_load_tensor & lt : tensors_map.tensors) {
-            for (const llama_load_tensor_shard & shard : lt.shards) {
-                if (shard.file_off & 3) {
-                    return true;
-                }
-            }
-        }
-        return false;
-    }
-
-    uint32_t guess_n_parts() const {
-        auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
-        if (it == tensors_map.name_to_idx.end()) {
-            throw std::runtime_error(std::string("missing tok_embeddings.weight"));
-        }
-        const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
-        return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
     }
 
     void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
@@ -774,7 +645,7 @@ struct llama_model_loader {
     }
 
     if (use_mmap) {
-        mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size, ggml_is_numa()));
+        mapping.reset(new llama_mmap(&file_loader->file, prefetch_size, ggml_is_numa()));
         if (lmlock) {
             lmlock->init(mapping->addr);
         }
@@ -830,45 +701,13 @@ struct llama_model_loader {
 
     void load_data_for(llama_load_tensor & lt) {
         if (use_mmap) {
-            LLAMA_ASSERT(lt.shards.size() == 1);
-            lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
-        } else if (lt.split_type == SPLIT_NONE) {
-            llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
-            file.seek(lt.shards.at(0).file_off, SEEK_SET);
+            lt.data = (uint8_t *) mapping->addr + lt.file_off;
+        } else {
+            llama_file & file = file_loader->file;
+            file.seek(lt.file_off, SEEK_SET);
             file.read_raw(lt.data, lt.size);
-        } else if (lt.split_type == SPLIT_BY_ROWS) {
-            size_t offset = 0;
-            for (llama_load_tensor_shard & shard : lt.shards) {
-                llama_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                file.read_raw(lt.data + offset, shard.size);
-                offset += shard.size;
-            }
-            LLAMA_ASSERT(offset == lt.size);
-        } else if (lt.split_type == SPLIT_BY_COLUMNS) {
-            // Let's load the data into temporary buffers to ensure the OS performs large loads.
-            std::vector<llama_buffer> tmp_bufs(lt.shards.size());
-            for (size_t i = 0; i < lt.shards.size(); i++) {
-                llama_load_tensor_shard & shard = lt.shards.at(i);
-                llama_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                tmp_bufs.at(i).resize(shard.size);
-                file.read_raw(tmp_bufs.at(i).addr, shard.size);
-            }
-            // Then reshape.
-            size_t num_rows = lt.ne.at(1);
-            size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
-            size_t out_offset = 0;
-            for (size_t row = 0; row < num_rows; row++) {
-                for (llama_buffer & tmp_buf : tmp_bufs) {
-                    memcpy(lt.data + out_offset,
-                           tmp_buf.addr + row * per_shard_row_size,
-                           per_shard_row_size);
-                    out_offset += per_shard_row_size;
-                }
-            }
-            LLAMA_ASSERT(out_offset == lt.size);
         }
+
         if (0) {
             print_checksum(lt);
         }
@@ -938,7 +777,7 @@ static bool kv_cache_init(
 
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
-        /*.seed =*/ -1,
+        /*.seed =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx =*/ 512,
         /*.n_batch =*/ 512,
         /*.gpu_layers =*/ 0,
@@ -1067,12 +906,12 @@ static void llama_model_load_internal(
 
     model.t_start_us = ggml_time_us();
 
-    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
+    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
 
-    vocab = std::move(ml->file_loaders.at(0)->vocab);
-    model.hparams = ml->file_loaders.at(0)->hparams;
+    vocab = std::move(ml->file_loader->vocab);
+    model.hparams = ml->file_loader->hparams;
     model.n_gpu_layers = n_gpu_layers;
-    llama_file_version file_version = ml->file_loaders.at(0)->file_version;
+    llama_file_version file_version = ml->file_loader->file_version;
     auto & hparams = model.hparams;
 
     {
@@ -1106,7 +945,6 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
         fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
         fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
-        fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }
 
@@ -1369,22 +1207,26 @@ static bool llama_model_load(
 
 // evaluate the transformer
 //
-//   - lctx:      llama context
-//   - tokens:    new batch of tokens to process
-//   - n_tokens:  number of tokens
-//   - n_past:    the context size so far
-//   - n_threads: number of threads to use
+//   - lctx:      llama context
+//   - tokens:    new batch of tokens to process
+//   - embd       embeddings input
+//   - n_tokens   number of tokens
+//   - n_past:    the context size so far
+//   - n_threads: number of threads to use
 //
 static bool llama_eval_internal(
-        llama_context & lctx,
-        const llama_token * tokens,
-        const int n_tokens,
-        const int n_past,
-        const int n_threads,
+        llama_context & lctx,
+        const llama_token * tokens,
+        const float * embd,
+        const int n_tokens,
+        const int n_past,
+        const int n_threads,
         const char * cgraph_fname) {
 
+    LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
+
     // enforce that the first token is BOS
-    if (n_past == 0 && tokens[0] != llama_token_bos()) {
+    if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) {
         fprintf(stderr, "%s: first token must be BOS\n", __func__);
         return false;
     }
@@ -1424,12 +1266,18 @@ static bool llama_eval_internal(
     ggml_cgraph gf = {};
     gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    ggml_set_name(embd, "embd");
-    memcpy(embd->data, tokens, N*ggml_element_size(embd));
-
     struct ggml_tensor * cur;
-    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+    struct ggml_tensor * inpL;
+
+    if (tokens) {
+        struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+        ggml_set_name(embd, "embd");
+        memcpy(embd->data, tokens, N*ggml_element_size(embd));
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+    } else {
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+        memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+    }
 
     const int i_gpu_start = n_layer - n_gpu_layers;
     (void) i_gpu_start;
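The new `LLAMA_ASSERT((!tokens && embd) || (tokens && !embd))` makes the two inputs mutually exclusive: token ids go through the `tok_embeddings` row lookup, while a float matrix is copied straight into `inpL`. A hedged caller-side sketch of the second path through the public API added below (the helper name and sizes are illustrative, not part of the diff):

    #include <vector>
    #include "llama.h"

    // Feed pre-computed embeddings instead of token ids; each of the n_tokens
    // rows must be n_embd floats wide, matching the loaded model.
    int eval_with_embeddings(llama_context * ctx, int n_embd, int n_past) {
        const int n_tokens = 4;
        std::vector<float> embd(n_tokens * n_embd, 0.0f);
        return llama_eval_embd(ctx, embd.data(), n_tokens, n_past, /*n_threads*/ 4);
    }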
@@ -2451,9 +2299,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
-                                                                            /*vocab_only*/ false));
-    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
+    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
+    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loader.get(), params->ftype);
 
 #ifdef GGML_USE_K_QUANTS
     int n_attention_wv = 0;
@@ -2654,6 +2501,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
     }
 
+
+
 //
 // interface implementation
 //
@@ -2692,7 +2541,7 @@ struct llama_context * llama_new_context_with_model(
 
     llama_context * ctx = new llama_context(*model, model->vocab);
 
-    if (params.seed < 0) {
+    if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
 
@@ -2874,7 +2723,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
     // create a name -> tensor map of the model to accelerate lookups
     std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
-    for (auto & kv: model.tensors_by_name) {
+    for (const auto & kv: model.tensors_by_name) {
         model_tensors.insert(kv);
     }
 
@@ -2885,7 +2734,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     llama_buffer base_buf;
     if (path_base_model) {
         fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
-        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
 
         size_t ctx_size;
         size_t mmapped_size;
@@ -2903,7 +2752,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0, ggml_is_numa()));
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa()));
         }
     }
 
@@ -2964,7 +2813,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             return false;
         }
     }
-    ggml_tensor* lora_tensor;
+    ggml_tensor * lora_tensor;
     if (n_dims == 2) {
         lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
     }
@@ -2972,6 +2821,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
         return 1;
     }
+    ggml_set_name(lora_tensor, "lora_tensor");
 
     // load tensor data
     size_t offset = fin.tellg();
@@ -2987,6 +2837,21 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
 
             ggml_tensor * dest_t = model_tensors[base_name];
+
+            offload_func_t offload_func = llama_nop;
+            offload_func_t offload_func_force_inplace = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+            if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
+                if (dest_t->type != GGML_TYPE_F16) {
+                    throw std::runtime_error(format(
+                        "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__));
+                }
+                offload_func = ggml_cuda_assign_buffers;
+                offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;
+            }
+#endif // GGML_USE_CUBLAS
+
             ggml_tensor * base_t;
             if (model_loader) {
                 // load from base model
@@ -3014,7 +2879,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             }
 
             ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
+            GGML_ASSERT(loraA->type == GGML_TYPE_F32);
+            ggml_set_name(loraA, "loraA");
+
             ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
+            GGML_ASSERT(loraB->type == GGML_TYPE_F32);
+            ggml_set_name(loraB, "loraB");
 
             if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
                 fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
@@ -3024,19 +2894,32 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
             // w = w + BA*s
             ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+            offload_func(BA);
+            ggml_set_name(BA, "BA");
 
             if (scaling != 1.0f) {
                 ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
+                ggml_set_name(scale_tensor, "scale_tensor");
+
                 BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
+                offload_func(BA);
+                ggml_set_name(BA, "BA_scaled");
             }
 
             ggml_tensor * r;
             if (base_t == dest_t) {
                 r = ggml_add_inplace(lora_ctx, dest_t, BA);
+                offload_func_force_inplace(r);
+                ggml_set_name(r, "r_add_inplace");
             }
             else {
                 r = ggml_add(lora_ctx, base_t, BA);
+                offload_func(r);
+                ggml_set_name(r, "r_add");
+
                 r = ggml_cpy(lora_ctx, r, dest_t);
+                offload_func(r);
+                ggml_set_name(r, "r_cpy");
             }
 
             struct ggml_cgraph gf = ggml_build_forward(r);
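For reference, the merge this graph performs is the standard LoRA update stated in the `// w = w + BA*s` comment, with s the adapter's scaling factor:

    W <- W + s * (B A)

When `base_t == dest_t` the addition runs in place on the destination weight; otherwise the sum is formed against the base weight and copied back with `ggml_cpy`. Each intermediate node is now named and routed through `offload_func`, so the whole update can run on the GPU when the destination tensor lives there.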
@@ -3091,8 +2974,8 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
 
 #define LLAMA_MAX_RNG_STATE (64*1024)
 
-void llama_set_rng_seed(struct llama_context * ctx, int seed) {
-    if (seed < 0) {
+void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
+    if (seed == LLAMA_DEFAULT_SEED) {
         seed = time(NULL);
     }
     ctx->rng.seed(seed);
@@ -3421,7 +3304,29 @@ int llama_eval(
         int n_tokens,
         int n_past,
         int n_threads) {
-    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
+    if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
+        fprintf(stderr, "%s: failed to eval\n", __func__);
+        return 1;
+    }
+
+    // get a more accurate load time, upon first eval
+    // TODO: fix this
+    if (!ctx->has_evaluated_once) {
+        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
+        ctx->has_evaluated_once = true;
+    }
+
+    return 0;
+}
+
+
+int llama_eval_embd(
+        struct llama_context * ctx,
+        const float * embd,
+        int n_tokens,
+        int n_past,
+        int n_threads) {
+    if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
@@ -3442,7 +3347,7 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
 
     const std::vector<llama_token> tmp(n_batch, llama_token_bos());
 
-    if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
+    if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -46,6 +46,8 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
+#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
+
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
@@ -81,11 +83,11 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
-        int seed;         // RNG seed, -1 for random
-        int n_ctx;        // text context
-        int n_batch;      // prompt processing batch size
-        int n_gpu_layers; // number of layers to store in VRAM
-        int main_gpu;     // the GPU that is used for scratch and small tensors
+        uint32_t seed;         // RNG seed, -1 for random
+        int32_t  n_ctx;        // text context
+        int32_t  n_batch;      // prompt processing batch size
+        int32_t  n_gpu_layers; // number of layers to store in VRAM
+        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
         float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
@@ -196,7 +198,7 @@ extern "C" {
     LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
 
     // Sets the current rng seed.
-    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
+    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
 
     // Returns the maximum size in bytes of the state (rng, logits, embedding
     // and kv_cache) - will often be smaller after compacting tokens
@@ -226,6 +228,14 @@ extern "C" {
         int n_past,
         int n_threads);
 
+    // Same as llama_eval, but use float matrix input directly.
+    LLAMA_API int llama_eval_embd(
+        struct llama_context * ctx,
+        const float * embd,
+        int n_tokens,
+        int n_past,
+        int n_threads);
+
     // Export a static computation graph for context of 511 and batch size of 1
     // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
     // parameters here to keep things simple
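With this header change `seed` becomes unsigned, and `LLAMA_DEFAULT_SEED` (0xFFFFFFFF) replaces the old `seed < 0` convention for requesting a time-based seed. A minimal sketch of the new contract (assumes the bundled header; error handling omitted):

    #include "llama.h"

    int main() {
        llama_context_params params = llama_context_default_params();
        // params.seed starts at LLAMA_DEFAULT_SEED, so context creation will
        // substitute time(NULL); set any other value for reproducible runs.
        params.seed = 12345;
        return 0;
    }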
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.0'
+  VERSION = '0.3.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-b8c8dda'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -4,6 +4,7 @@ module LLaMACpp
   LLAMA_FILE_VERSION: String
   LLAMA_FILE_MAGIC: String
   LLAMA_FILE_MAGIC_UNVERSIONED: String
+  LLAMA_DEFALUT_SEED: String
 
   LLAMA_MAX_DEVICES: Integer
 
@@ -72,6 +73,7 @@ module LLaMACpp
     def initialize: (model: ::LLaMACpp::Model) -> void
     def embeddings: () -> Array[Float]
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
+    def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
     def eval_export: (String) -> bool
     def logits: () -> Array[Float]
     def n_ctx: () -> Integer
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.3.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-
+date: 2023-07-02 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: